"python 词向量训练以及聚类 #!/usr/bin/env Python3 __author__ = '未昔/angelfate' __date__ = '2019/8/14 17:06' # -*- coding: utf-8 - ...."

wangwei

机器学习, 数据采集, 数据分析, web网站，IS-RPA，APP
其他经验社区动态 word2Vec 词向量聚类 • 4 回帖 • 1.1K 浏览 • 2019-08-15 12:21:23

机器学习篇 - python 词向量训练 + 聚类

python 词向量训练以及聚类

#!/usr/bin/env Python3
__author__ = '未昔/angelfate'
__date__ = '2019/8/14 17:06'
# -*- coding: utf-8 -*-
import pandas as pd
import jieba,re,os
from gensim.models import word2vec
import logging


class Word2Vec_Test(object):
    def __init__(self):
        self.csv_path = 'DouBanFilm_FanTanFengBao4.csv'
        self.txt_path = 'comment.txt'

		```
##  `首先提取 csv的 评论列内容，到txt`

`1、读取txt评论内容`


    def read_file(self):
        """
        训练模型
        :return:
        """
        # jieba.load_userdict(self.txt_path)

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
                            filename='test_01.log')
        filename = self.txt_path  # 测试文本
        pre, ext = os.path.splitext(filename)  # 输入文件分开前缀，后缀   pre=test_01   ext=.txt
        corpus = pre + '_seg' + ext  
        fin = open(filename, encoding='utf8').read().strip(' ').strip('\n').replace('\n\n',
                                                                                    '\n')  # strip()取出首位空格，和换行符，用\n替换\n\n
        stopwords = set(open('北京大学停用词.txt', encoding='gbk').read().strip('\n').split('\n'))  # 读入停用词

`2、分词，将训练文本中的词做处理，不能包含停用词中的词，以及长度少于等于1的词，去标点`

 text = ' '.join(
            [x for x in jieba.lcut(fin) if x not in stopwords and len(x) > 1 and x != '\n'])  # 去掉停用词中的词，去掉长度小于等于1的词
        print(text)
        results = re.sub('[（）：:？“”《》，。！·、\d ]+', ' ', text)  # 去标点
        open(corpus, 'w+', encoding='utf8').write(results)  # 按行分词后存为训练语

`3、训练模型`

  sentences = word2vec.LineSentence(corpus)
        # sentences1 = word2vec.Text8Corpus(corpus)  #用来处理按文本分词语料
        # print('=--=-=-=-=-=',sentences)
        model = word2vec.Word2Vec(sentences, size=12, window=25, min_count=2, workers=5, sg=1,
                                  hs=1)

`4、保存模型`

model.save("test_01.model")  # 保存模型
model.wv.save_word2vec_format('test_01.model.txt', 'test_01.vocab.txt',
				 binary=False)

`5、加载模型，验证模型`

def yan(self):
	"""
	第五：加载模型，验证模型
	:return:
	"""
	
	# 5词向量验证
	# 加载训练好的模型
	model = word2vec.Word2Vec.load("test_01.model")  
	role1 = ['反贪', 'ICAC', '廉政']
			role2 = ['古天乐', '电影']
			pairs = [(x, y) for x in role1 for y in role2]

	print(pairs)

	for pair in pairs:
		print("-- [%s]和[%s]的相似度为：" % (pair[0], pair[1]), model.similarity(pair[0], pair[1]))  # 预测相似性

`个词的相关词列表`

figures = ['反贪', 'ICAC', '廉政', '古天乐', '电影', '警察', '廉政公署', '香港']
for figure in figures:
	print("> 和[%s]最相关的词有：\n" % figure,
		  '\n'.join([x[0].ljust(4, '　') + str(x[1]) for x in model.most_similar(figure, topn=10)]),
		  sep='')

结果

C:\Python\python.exe E:/python/Study/机器学习/词向量/TEST1/test.py
[('反贪', '古天乐'), ('反贪', '电影'), ('ICAC', '古天乐'), ('ICAC', '电影'), ('廉政', '古天乐'), ('廉政', '电影')]
-- [反贪]和[古天乐]的相似度为： 0.8193734
-- [反贪]和[电影]的相似度为： 0.79485226
-- [ICAC]和[古天乐]的相似度为： 0.67751527
-- [ICAC]和[电影]的相似度为： 0.56181175
-- [廉政]和[古天乐]的相似度为： 0.87820566
-- [廉政]和[电影]的相似度为： 0.7926007
> 和[反贪]最相关的词有：
好看　　0.9580947160720825
不行　　0.9477688074111938
小时候　0.9470022320747375
热闹　　0.9458567500114441
插入　　0.9457867741584778
爱好者　0.9425768256187439
出戏　　0.9419222474098206
痕迹　　0.9414110779762268
算是　　0.9400239586830139
片子　　0.9387757182121277
> 和[ICAC]最相关的词有：
惩教署　0.9477450251579285
官员　　0.9427414536476135
奸帅　　0.9290784597396851
二代　　0.926984429359436
故事　　0.9213033318519592
退一步　0.921079695224762
讽刺　　0.9191040396690369
坠机　　0.9189454317092896
斗狠　　0.9186890125274658
无疑　　0.9165976047515869
> 和[廉政]最相关的词有：
牵强　　0.9715588092803955
今天　　0.9601073861122131
傻傻　　0.9586002826690674
路上　　0.9573702216148376
一步　　0.9561094045639038
风雨　　0.9548968076705933
对比　　0.9547584056854248
套餐　　0.9538712501525879
爱好者　0.9530250430107117
熟悉　　0.9524191617965698
> 和[古天乐]最相关的词有：
依旧　　0.9562309384346008
想起　　0.9511849284172058
熟悉　　0.9502787590026855
年轻　　0.9482583999633789
再次　　0.9425556659698486
张智霖　0.9382885694503784
一种　　0.9350777864456177
IP　　0.9331182241439819
草率　　0.9291275143623352
西装　　0.9291061162948608
> 和[电影]最相关的词有：
看着　　0.9870060682296753
哈哈哈　0.9635094404220581
质感　　0.9586584568023682
加油　　0.9530031681060791
仓促　　0.9522775411605835
院线　　0.9515659809112549
昔日　　0.950602650642395
套路　　0.9499426484107971
紧张　　0.9478718042373657
不合理　0.9468604326248169
> 和[警察]最相关的词有：
颜值　　0.9682283401489258
程度　　0.9645314812660217
第三集　0.9635794162750244
没人　　0.9632000923156738
黑古　　0.9607852101325989
致敬　　0.9602598547935486
破坏　　0.9601216316223145
回归　　0.9591646790504456
多年　　0.958872377872467
一堆　　0.9572871923446655
> 和[廉政公署]最相关的词有：
可能　　0.9681879281997681
还会　　0.9589521884918213
bug　0.9557142853736877
看到　　0.9497145414352417
质感　　0.9490318298339844
酱油　　0.9447331428527832
服气　　0.9417837858200073
感谢　　0.940988302230835
为啥　　0.9371879696846008
面前　　0.9368493556976318
> 和[香港]最相关的词有：
正面　　0.966416597366333
惊险　　0.9605911374092102
电影版　0.9520364999771118
影城　　0.9459754228591919
场面　　0.9448919296264648
粤语　　0.9435780048370361
先系　　0.9433020949363708
案件　　0.9400972127914429
警匪　　0.9388967156410217
上映　　0.9355912804603577
None

Process finished with exit code 0