实例:文本词频统计
实例:文本词频统计
英文文本:Hamlet,统计出现次数最多的 10 个英文单词
代码实现:
#Hamlet 词频统计
def getText(path="hamlet"):
    """Read a text file and return it lower-cased with punctuation blanked out.

    Args:
        path: File to read. Defaults to ``"hamlet"``, the file the original
            script opened, so existing ``getText()`` calls are unaffected.

    Returns:
        The file contents in lower case, with every character of the
        punctuation set below replaced by a single space so that a later
        ``str.split()`` yields bare words.
    """
    # Same character set the original loop iterated over; the space in it
    # maps to itself and is therefore harmless.
    punctuation = '~!@ #$%^&*()_+-={}[],./:";<>?'
    # The context manager closes the file handle, which the original leaked.
    with open(path, "r") as f:
        txt = f.read()
    txt = txt.lower()  # fold case so "The" and "the" count as one word
    # One pass over the text instead of one str.replace() call per symbol.
    return txt.translate(str.maketrans(punctuation, " " * len(punctuation)))
hamletTxt = getText()
words = hamletTxt.split()  # split on whitespace into a list of words
counts = {}
for word in words:
    # Each word is a key; a missing key starts at 0 and gains 1 per occurrence.
    counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
# Sort the (word, count) pairs by count (index 1), highest first.
items.sort(key=lambda x: x[1], reverse=True)
# Slicing prints at most ten rows and avoids the IndexError that indexing
# with range(10) raised when the text had fewer than ten distinct words.
for word, count in items[:10]:
    print("{0:<10}{1:5}".format(word, count))
中文文本:《三国演义》,统计出场次数最多的 10 个人物
https://python123.io/resources/pye/threekingdoms.txt
import jieba
# Read the whole novel; the context manager closes the file afterwards.
with open("Threekingdoms", "r", encoding="utf-8") as f:
    txt = f.read()
# Frequent multi-character tokens that are not character names.
excludes = {"将军", "却说", "荆州", "二人", "不可", "不能", "如此"}
# The book refers to one person by several names; map each variant to a
# single canonical name. "孔明曰" / "玄德曰" ("X said") are matched as whole
# tokens; the original spelled them with 日 instead of 曰 (and 空 instead
# of 孔), so those branches could never match.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}
words = jieba.lcut(txt)
counts = {}
for word in words:
    # Single-character tokens (particles, punctuation) are never names.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    # Count under the canonical name; the original keyed on `word`, which
    # silently discarded the alias merging done above.
    counts[rword] = counts.get(rword, 0) + 1
# Drop the excluded non-name words; pop() tolerates a word that never
# occurred, where `del` would raise KeyError.
for word in excludes:
    counts.pop(word, None)
items = list(counts.items())
# Sort the (name, count) pairs by count, highest first.
items.sort(key=lambda x: x[1], reverse=True)
# Slicing prints at most ten rows even if fewer than ten names remain.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))