"""Count word frequencies for every document in a source folder.

For each text file in the source directory, strip punctuation/digits,
tokenize, drop stop words, count the remaining words, and write a
tab-separated "word<TAB>count" file (sorted by descending count) into
the results directory (one result file per input file).
"""
import os
import re
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize

# NLTK's built-in English stop-word list, extended with additional words
# that should not be counted.
stop_words = nltk.corpus.stopwords.words('english')
new_stop_words = ['I', 'It', 'The', 'one', 'use', 'this', 'They', 'Im', 'If',
                  'So', 'But', 'A', 'us', 'My', 'This', 'We', 'These', 'You',
                  'For', 'She', 'He', 'Yet', 'As']
stop_words.extend(new_stop_words)
# Hoist to a set once so the per-token membership test below is O(1)
# instead of a linear scan of the whole stop-word list.
stop_set = set(stop_words)

# Source directory: every document in it gets a frequency count.
src_dir = 'C:/Users/25496/Desktop/建模/原文'
# Results directory: all outputs land here as "result_<original name>".
out_dir = 'C:/Users/25496/Desktop/建模/结果'

# NOTE: renamed from `list` in the original — never shadow the builtin.
file_names = os.listdir(src_dir)
print(file_names)

for content in file_names:
    # Read the whole document as UTF-8 text.
    with open(src_dir + '/' + content, encoding="utf-8") as f:
        txt = f.read()
    # Strip punctuation, digits, possessive 's, and stray apostrophes.
    txt = re.sub('[,\.()":;!@#$%^&*\d]|\'s|\'', '', txt)
    # Tokenize and drop stop words (case-sensitive, matching the
    # capitalized entries added to the list above).
    word_tokens = word_tokenize(txt)
    filtered_sentence = [w for w in word_tokens if w not in stop_set]
    # Count occurrences and sort by descending frequency.
    word_count = Counter(filtered_sentence).most_common()
    # Write "word<TAB>count" lines to the results directory.
    with open(out_dir + "/result_" + content, 'w', encoding="utf-8") as f1:
        for word, count in word_count:
            f1.write("%s\t%s\n" % (word, str(count)))