Python 3 利用 k-means 算法实现聚类
聚类就是在没有预先给定类别标签的情况下,按照相似度把一批数据自动划分成若干组(簇)。
# A detailed walkthrough of the estimator parameters was referenced here
# (the link itself is missing from this copy).
# Background on how k-means works:
# https://blog.csdn.net/mrwu9902/article/details/53672514
import pandas as pd
import numpy as npy
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.preprocessing import LabelEncoder
import sys

# Fixed seed so that both estimators give the same labels on every run.
RANDOM_SEED = 0

# Sample records, one per row: [src, address, FAQ, num, result].
attr_arr = [
    ['slashdot', 'USA', 'yes', 18, 'None'],
    ['google', 'France', 'yes', 23, 'Premium'],
    ['digg', 'USA', 'yes', 24, 'Basic'],
    ['kiwitobes', 'France', 'yes', 23, 'Basic'],
    ['google', 'UK', 'no', 21, 'Premium'],
    ['(direct)', 'New Zealand', 'no', 12, 'None'],
    ['(direct)', 'UK', 'no', 21, 'Basic'],
    ['google', 'USA', 'no', 24, 'Premium'],
    ['slashdot', 'France', 'yes', 19, 'None'],
    ['digg', 'USA', 'no', 18, 'None'],
    ['google', 'UK', 'no', 18, 'None'],
    ['kiwitobes', 'UK', 'no', 19, 'None'],
    ['digg', 'New Zealand', 'yes', 12, 'Basic'],
    ['slashdot', 'UK', 'no', 21, 'None'],
    ['google', 'UK', 'yes', 18, 'Basic'],
    ['kiwitobes', 'France', 'yes', 19, 'Basic'],
]

# Build the attribute (feature) table. Only the first four columns are
# clustered; the fifth column (the result value) is deliberately left out.
dataMat = npy.array(attr_arr)
attr_pd = dataMat[:, 0:4]
attr_names = ['src', 'address', 'FAQ', 'num']  # feature column names
# One row per object, one column per attribute.
attr_pd = pd.DataFrame(data=attr_pd, columns=attr_names)

# Replace every distinct string in a column with an integer code so the
# estimators receive numeric input.
# NOTE(review): npy.array() on the mixed str/int rows produces strings, so
# 'num' is label-encoded as well (codes follow string sort order, not numeric
# magnitude). Kept as in the original; confirm whether raw numeric values
# (with scaling) are wanted instead.
le = LabelEncoder()
for col in attr_pd.columns:
    # fit_transform() refits the encoder, so one instance can be reused.
    attr_pd[col] = le.fit_transform(attr_pd[col])

# n_init is pinned because its default changed across scikit-learn releases;
# random_state makes the printed labels reproducible.
kms = KMeans(n_clusters=3, n_init=10, random_state=RANDOM_SEED)
y = kms.fit_predict(attr_pd)
print(y)

# For large datasets consider MiniBatchKMeans instead.
mbk = MiniBatchKMeans(
    init='k-means++',
    n_clusters=3,
    batch_size=45,
    n_init=10,
    max_no_improvement=10,
    verbose=0,
    random_state=RANDOM_SEED,
)
y = mbk.fit_predict(attr_pd)
print(y)

# Sample output from the original post (a single unseeded run). Cluster ids
# are arbitrary labels, so the exact numbers may differ with another seed or
# scikit-learn version; only which rows share a label is meaningful.
# [0 1 2 1 2 0 2 2 1 0 0 0 0 1 0 1]
# [1 2 0 2 0 1 0 0 2 1 1 1 1 2 1 2]