php是最好的语言

python3利用k-means算法实现聚类

聚类就是在没有预先给定类别标签的情况下,按相似度把一批数据自动划分成若干组(簇)

#这里有对应函数参数的详细讲解( 
# 原理讲解 https://blog.csdn.net/mrwu9902/article/details/53672514
import pandas as pd
import numpy as npy
from sklearn.cluster import KMeans,MiniBatchKMeans
from sklearn.preprocessing import LabelEncoder
import sys

# Sample records, one list per observation.
# Field order: src, address, FAQ, num (see attr_names below), followed by a
# trailing fifth value that is not part of the four named feature columns.
attr_arr = [
    ['slashdot', 'USA', 'yes', 18, 'None'],
    ['google', 'France', 'yes', 23, 'Premium'],
    ['digg', 'USA', 'yes', 24, 'Basic'],
    ['kiwitobes', 'France', 'yes', 23, 'Basic'],
    ['google', 'UK', 'no', 21, 'Premium'],
    ['(direct)', 'New Zealand', 'no', 12, 'None'],
    ['(direct)', 'UK', 'no', 21, 'Basic'],
    ['google', 'USA', 'no', 24, 'Premium'],
    ['slashdot', 'France', 'yes', 19, 'None'],
    ['digg', 'USA', 'no', 18, 'None'],
    ['google', 'UK', 'no', 18, 'None'],
    ['kiwitobes', 'UK', 'no', 19, 'None'],
    ['digg', 'New Zealand', 'yes', 12, 'Basic'],
    ['slashdot', 'UK', 'no', 21, 'None'],
    ['google', 'UK', 'yes', 18, 'Basic'],
    ['kiwitobes', 'France', 'yes', 19, 'Basic'],
]

# Build the feature table that is fed to the clustering models.
# Only the first four fields of every record are used as features; the
# trailing fifth field is left out.
dataMat = npy.array(attr_arr)   # NOTE: numpy coerces the mixed str/int rows into an all-string array
attr_names = ['src', 'address', 'FAQ', 'num']   # names of the feature columns
# Build the frame from the original Python rows instead of from dataMat so the
# integer 'num' column keeps a numeric dtype rather than being turned into text.
attr_pd = pd.DataFrame(data=[row[0:4] for row in attr_arr],
                       columns=attr_names)    # one row per sample, one column per feature

le = LabelEncoder()
for col in attr_pd.columns:
    # Already-numeric columns keep their real values; label-encoding them
    # would replace magnitudes with ranks (and, on string-coerced numbers,
    # rank them lexicographically, e.g. '9' after '12').
    if pd.api.types.is_numeric_dtype(attr_pd[col]):
        continue
    # Map every distinct string of a text column to an integer code so the
    # column can be used by k-means. The same encoder object is refitted for
    # each column, so it only remembers the classes of the last one encoded.
    attr_pd[col] = le.fit_transform(attr_pd[col])

# Cluster the samples into 3 groups with standard k-means.
# n_init is given explicitly because its default differs between scikit-learn
# releases; random_state fixes the centroid initialisation so that repeated
# runs print the same labels.
kms = KMeans(n_clusters=3, n_init=10, random_state=0)
y = kms.fit_predict(attr_pd)   # one cluster index per row of attr_pd
print(y)

# For large datasets consider MiniBatchKMeans, which fits on mini-batches of
# the data instead of the full set at every iteration.
# random_state fixes the initialisation and batch sampling so that repeated
# runs print the same labels.
mbk = MiniBatchKMeans(init='k-means++', n_clusters=3, batch_size=45,
                      n_init=10, max_no_improvement=10, verbose=0,
                      random_state=0)
y = mbk.fit_predict(attr_pd)   # overwrites the labels from the previous model
print(y)

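# Example output from one run. Cluster label numbers are arbitrary, so the
# exact digits can differ between runs; only the grouping matters. The two
# lines below describe the same grouping with the label numbers permuted.
#[0 1 2 1 2 0 2 2 1 0 0 0 0 1 0 1]
#[1 2 0 2 0 1 0 0 2 1 1 1 1 2 1 2]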


作者:xTao 分类:LNMP 浏览:2354 评论:0