电影票房数据分析 + RPA 机器学习 2- 哪吒之魔童降世 - 票房聚类
虽然聚类结果出来了,但是由于数据太少,看不出什么规律。所以,我们把机器学习的重点放在评论数据上面。
聚类为三个范围(k = 3)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Software: PyCharm
# @virtualenv:workon
# @contact: contact information
# @Desc:Code description
# Module metadata: author name and creation timestamp of the script.
__author__ = '未昔/AngelFate'
__date__ = '2019/8/24 22:53'
# Cluster the samples in `x` into `k` groups with K-Means and print the
# fitted model's diagnostics.
k = 3  # number of clusters
iteration = 500  # maximum number of K-Means iterations per run
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, where passing
# it raises TypeError. Omitting it keeps the same single-process behaviour.
model = KMeans(n_clusters=k, max_iter=iteration)
y = model.fit_predict(x)  # cluster label predicted for every row of x
label_pred = model.labels_  # labels stored on the fitted model
centroids = model.cluster_centers_  # coordinates of the cluster centres
inertia = model.inertia_  # inertia value reported by the fitted model
print('y:\n', y)
print('label_pred:\n', label_pred)
print('centroids:\n', centroids)
print('inertia:\n', inertia)
print('----分类结果----:')
# Pair every sample in `x` with the cluster label assigned to it in `y`,
# then print one (label, sample) tuple per line.
result = list(zip(y, x))
for labelled_sample in result:
    # The loop body must be indented; the unindented original was a syntax error.
    print(labelled_sample)
# Print a simple summary of the clustering result.
r1 = pd.Series(model.labels_).value_counts()  # number of samples per cluster label
r2 = pd.DataFrame(model.cluster_centers_)  # 2-D array of cluster centres as a DataFrame
print('r2: \n', r2)
# Join centres and counts side by side. With axis=1 the rows are aligned on
# the index, i.e. the cluster label, which starts at 0 by default.
# (The original line carried this remark as bare text, which was a syntax error.)
r = pd.concat([r2, r1], axis=1)
r.columns = data2.columns.tolist() + ['类别数目']  # rename the header row
print('r: \n', r)
# Append each sample's cluster label to the source data as a new column.
output_data = pd.concat([data2, pd.Series(model.labels_, index=data2.index)], axis=1)
output_data.columns = list(data2.columns) + ['聚类类别']  # rename the header row
# output_data.to_excel(output_path)  # optionally save the result
# Use t-SNE to reduce the dimensionality of the data so the clustering result
# can be displayed.
# Recent scikit-learn releases require perplexity < n_samples; the default of
# 30 is too large for very small data sets, so cap it at n_samples - 1.
tsne = TSNE(perplexity=min(30, len(data2) - 1))
tsne.fit_transform(data2)
# tsne.embedding_ holds the reduced-dimension data after fitting.
print('tsne.embedding_: \n', tsne.embedding_)
# Index the embedding by data2, the frame that was actually embedded, so that
# later boolean masks built from data2-indexed frames align row for row.
tsn = pd.DataFrame(tsne.embedding_, index=data2.index)
# Print the embedded DataFrame; the original printed the estimator object.
print('tsne: \n', tsn)
# Plot the embedded points, using one marker/colour style per cluster.
import matplotlib.pyplot as plt

# NOTE(review): SimHei is presumably chosen so the Chinese legend labels
# render; confirm the font is installed on the target machine.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False  # keep minus signs readable with that font
color_style = ['r.', 'go', 'b*']
for i in range(k):
    # Rows of the embedding whose sample was assigned to cluster i.
    d = tsn[output_data['聚类类别'] == i]
    # Cycle through the styles so a k larger than the style list does not
    # raise IndexError; for k <= 3 this is identical to color_style[i].
    plt.plot(d[0], d[1], color_style[i % len(color_style)], label='聚类' + str(i + 1))
plt.legend()
plt.show()
聚类结果
聚类结果
----分类结果----:
(2, array([2.01000e+04, 1.28361e+05, 3.00000e+01, 2.21753e+04]))
(1, array([2.301119e+04, 1.567950e+05, 3.900000e+01, 3.459470e+04]))
(1, array([2.867732e+04, 1.667870e+05, 4.700000e+01, 4.088600e+04]))
(1, array([1.88020e+04, 1.69018e+05, 3.10000e+01, 2.33768e+04]))
(1, array([1.934163e+04, 1.783780e+05, 3.000000e+01, 2.363760e+04]))
(1, array([1.964646e+04, 1.846340e+05, 3.000000e+01, 2.368560e+04]))
(2, array([1.764494e+04, 1.531700e+05, 3.200000e+01, 3.188840e+04]))
(2, array([2.019723e+04, 1.453720e+05, 3.800000e+01, 3.594430e+04]))
(1, array([3.387688e+04, 1.693550e+05, 5.400000e+01, 5.230970e+04]))
(1, array([3.408296e+04, 1.795520e+05, 5.200000e+01, 5.270440e+04]))
(1, array([1.668626e+04, 1.726620e+05, 2.700000e+01, 2.785250e+04]))
(1, array([1.541501e+04, 1.727600e+05, 2.500000e+01, 2.604260e+04]))
(2, array([2.51734e+04, 1.43915e+05, 4.90000e+01, 5.66489e+04]))
(2, array([1.221956e+04, 1.446960e+05, 2.400000e+01, 2.578140e+04]))
(0, array([1.123695e+04, 1.096660e+05, 2.900000e+01, 3.040230e+04]))
(2, array([1.813015e+04, 1.400030e+05, 3.600000e+01, 3.664440e+04]))
(0, array([1.790766e+04, 1.065170e+05, 2.900000e+01, 2.748880e+04]))
(2, array([8.91428e+03, 1.42105e+05, 1.80000e+01, 1.84165e+04]))
(2, array([7.83286e+03, 1.42299e+05, 1.60000e+01, 1.63741e+04]))
(2, array([6.97618e+03, 1.41605e+05, 1.40000e+01, 1.46528e+04]))
(2, array([6.15493e+03, 1.41263e+05, 1.30000e+01, 1.31160e+04]))
(0, array([6.87670e+03, 8.48370e+04, 2.30000e+01, 2.33302e+04]))
(0, array([1.108646e+04, 1.065170e+05, 2.900000e+01, 2.748880e+04]))
(0, array([1.089267e+04, 1.117420e+05, 2.800000e+01, 2.562580e+04]))
(0, array([5.43425e+03, 1.10661e+05, 1.50000e+01, 1.33031e+04]))
(0, array([5.01389e+03, 1.11588e+05, 1.40000e+01, 1.20931e+04]))
标签
y:
[2 1 1 1 1 1 2 2 1 1 1 1 2 2 0 2 0 2 2 2 2 0 0 0 0 0]
聚类中心
centroids:
[[9.77836857e+03 1.05932571e+05 2.38571429e+01 2.28188714e+04]
[2.32821900e+04 1.72215667e+05 3.72222222e+01 3.38988778e+04]
[1.43343530e+04 1.42278900e+05 2.70000000e+01 2.71642100e+04]]