I. Task Objectives

(1) Read the text data in a suitable format.
(2) Segment the text with jieba and visualize the segmentation results.
(3) Design a stopword list and strip the redundant parts from the text.
(4) Display the text as a word cloud.
(5) Extract keywords with TF-IDF.
(6) Fit an LDA topic model.
(7) Classify the news data.
II. Code and Results

(1) Import packages
# Import the required packages
import pandas as pd
import jieba
import numpy

(2) Read the text data
df_news = pd.read_table('./data/val.txt', names=['category', 'theme', 'url', 'content'], encoding='utf-8')
df_news = df_news.dropna(axis=0)  # Drop rows that contain missing values
df_news.head()  # Preview the first few rows
df_news.shape  # Check the data dimensions

(3) Segment the text with jieba
content = df_news.content.values.tolist()  # Convert the content column to a list so jieba can process it
print(content[1000])  # Show one article before segmentation

content_S = []  # Will hold the segmented articles
for line in content:
    current_segment = jieba.lcut(line)  # Segment the line into a token list
    if len(current_segment) > 1 and current_segment != '\r\n':  # Skip empty lines and bare line breaks
        content_S.append(current_segment)

content_S[1000]  # Show the segmentation result for one article

df_content = pd.DataFrame({'content_S': content_S})  # One row per article's token list
df_content.head()

(4) Build a stopword list, remove the redundant tokens, and count word frequencies
stopwords = pd.read_csv('stopwords.txt', index_col=False, sep='\t', quoting=3, names=['stopword'])
stopwords.head(20)

def drop_stopwords(contents, stopwords):
    # Filter stopwords out of every article; also collect the surviving words for counting
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean, all_words

contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)

df_content = pd.DataFrame({'contents_clean': contents_clean})
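As a quick sanity check (my addition, not part of the original post), print one article before and after stopword removal to confirm the filter worked:

print(contents[1000][:20])        # First 20 tokens of the raw segmentation
print(contents_clean[1000][:20])  # The same article with stopwords removed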
df_content.head()

df_all_words = pd.DataFrame({'all_words': all_words})
# Count each word's occurrences (the dict form of .agg used in older pandas has been removed)
words_count = df_all_words.groupby('all_words').agg(count=('all_words', 'size'))
words_count = words_count.reset_index().sort_values(by='count', ascending=False)
words_count.head()

(5) Word cloud display
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import matplotlib

matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
wordcloud = WordCloud(font_path='./data/simhei.ttf', background_color='white', max_font_size=80)
word_frequence = {x[0]: x[1] for x in words_count.head(100).values}  # Top 100 words and their counts
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
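To keep the rendered image, the wordcloud package's to_file method can save it to disk (my addition; the filename is arbitrary):

wordcloud.to_file('./data/wordcloud.png')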
(6) Extract keywords with TF-IDF

import jieba.analyse

index = 2000
print(df_news['content'][index])
content_S_str = ''.join(content_S[index])
print(' '.join(jieba.analyse.extract_tags(content_S_str, topK=10, withWeight=False)))
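For context (an addition of mine, not spelled out in the original post): extract_tags ranks words by their TF-IDF score, which weighs how often a word appears in this document against how common it is across all documents. In standard form,

\[
\operatorname{tfidf}(w, d) = \operatorname{tf}(w, d) \times \log \frac{N}{\operatorname{df}(w)}
\]

where tf(w, d) is the frequency of word w in document d, N is the total number of documents, and df(w) is the number of documents containing w. Note that jieba ships with a precomputed IDF table rather than computing df from your own corpus.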
(7) LDA topic model

from gensim import corpora, models, similarities
import gensim

dictionary = corpora.Dictionary(contents_clean)  # Map each word to an integer id
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]  # Bag-of-words per article
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
print(lda.print_topic(1, topn=5))  # Top 5 words of topic 1
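Beyond printing a single topic, gensim's LdaModel can also list every topic or report the topic mixture of one document. A minimal sketch using the model trained above (my addition; both calls are standard gensim API):

for topic_id, topic in lda.print_topics(num_topics=20, num_words=5):
    print(topic_id, topic)  # Top 5 words of each of the 20 topics

print(lda.get_document_topics(corpus[0]))  # (topic_id, probability) pairs for the first article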
(8) News classification with Naive Bayes

df_train = pd.DataFrame({'contents_clean': contents_clean, 'label': df_news['category']})
df_train.tail()

df_train.label.unique()

# Map the Chinese category names (car, finance, tech, health, sports, education, culture, military, entertainment, fashion) to integers
label_mapping = {'汽车': 1, '财经': 2, '科技': 3, '健康': 4, '体育': 5, '教育': 6, '文化': 7, '军事': 8, '娱乐': 9, '时尚': 0}
df_train['label'] = df_train['label'].map(label_mapping)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_train['contents_clean'].values, df_train['label'].values, random_state=1)

words = []
for line_index in range(len(x_train)):
    try:
        # CountVectorizer expects space-separated strings, not token lists
        words.append(' '.join(x_train[line_index]))
    except:
        print(line_index)

print(words[0])
print(len(words))

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(analyzer='word', max_features=4000, lowercase=False)
vec.fit(words)

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(words), y_train)

test_words = []
for line_index in range(len(x_test)):
    try:
        test_words.append(' '.join(x_test[line_index]))
    except:
        print(line_index)

test_words[0]
print(classifier.score(vec.transform(test_words), y_test))  # Accuracy on the held-out test split
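A natural variation (my addition, not in the original post) is to swap CountVectorizer for TfidfVectorizer, reusing the TF-IDF weighting from the keyword-extraction step as classification features; the rest of the pipeline stays the same:

from sklearn.feature_extraction.text import TfidfVectorizer

# Same settings as the CountVectorizer above, but with TF-IDF weighting
vectorizer = TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False)
vectorizer.fit(words)

classifier_tfidf = MultinomialNB()
classifier_tfidf.fit(vectorizer.transform(words), y_train)
print(classifier_tfidf.score(vectorizer.transform(test_words), y_test))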