网络知识 娱乐 mooc机器学习第三天- 聚类dbscan算法

MOOC 机器学习第三天 - 聚类 DBSCAN 算法

考试周终于结束,继续来学 MOOC~~

1.介绍


2.代码

import numpy as np
import sklearn.cluster as skc
from sklearn import metrics  # provides silhouette_score for cluster evaluation
import matplotlib.pyplot as plt

# Cluster students' online sessions by start hour with DBSCAN.
#
# Every input line is a comma-separated record. The script reads field 2
# (MAC address), field 4 (a "<date> <HH>:<MM>..." timestamp, of which only
# the hour is kept) and field 6 (an integer, presumably the online duration
# -- confirm against the data set). It keeps one (start_hour, duration) pair
# per MAC address and clusters on the start hour alone.

mac2id = dict()   # MAC address -> index of its record in `onlinetimes`
onlinetimes = []  # one (start_hour, duration) tuple per distinct MAC address

# `with` guarantees the file is closed even when a record fails to parse.
with open('/Users/helong/PycharmProjects/untitled1/'
          'study/machine_learning/聚类/学生月上网时间分布-TestData.txt',
          encoding='utf-8') as f:
    for line in f:
        fields = line.split(',')  # split once instead of once per field
        mac = fields[2]
        onlinetime = int(fields[6])
        starttime = int(fields[4].split(' ')[1].split(':')[0])
        if mac not in mac2id:
            # First record for this MAC: remember the slot it occupies.
            mac2id[mac] = len(onlinetimes)
            onlinetimes.append((starttime, onlinetime))
        else:
            # Later record for a known MAC: overwrite its slot. Store a plain
            # tuple, exactly like the append above -- wrapping it in a list
            # makes `onlinetimes` ragged, which np.array() rejects.
            onlinetimes[mac2id[mac]] = (starttime, onlinetime)

# Shape (n_samples, 2): column 0 is the start hour, column 1 the duration.
real_X = np.array(onlinetimes).reshape((-1, 2))

# Slice with 0:1 (not 0) so the result stays 2-D with shape (n_samples, 1);
# scikit-learn estimators expect a 2-D (n_samples, n_features) array.
X = real_X[:, 0:1]

# eps: neighbourhood radius. The hours are integers, so with eps=0.01 two
#      samples are neighbours only when their start hours are equal.
# min_samples: number of samples required in a point's neighbourhood for
#      that point to count as a core point.
db = skc.DBSCAN(eps=0.01, min_samples=20).fit(X)
labels = db.labels_

print('Labels:')
print(labels)
print("*******")

# DBSCAN labels noise samples with -1.
# The label text below is kept as-is so the output matches earlier runs.
noise_ratio = np.count_nonzero(labels == -1) / len(labels)
print('Noise raito:', format(noise_ratio, '.2%'))

# Number of clusters, not counting the noise label.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

# Print each cluster's label followed by the start hours of its members.
for i in range(n_clusters_):
    print('Cluster ', i, ':')
    print(list(X[labels == i].flatten()))

plt.hist(X)
plt.show()


3.输出

Labels:
[ 0 -1 0 1 -1 1 0 1 2 -1 1 0 1 1 3 -1 -1 3 -1 1 1 -1 1 3 4
-1 1 1 2 0 2 2 -1 0 1 0 0 0 1 3 -1 0 1 1 0 0 2 -1 1 3
1 -1 3 -1 3 0 1 1 2 3 3 -1 -1 -1 0 1 2 1 -1 3 1 1 2 3 0
1 -1 2 0 0 3 2 0 1 -1 1 3 -1 4 2 -1 -1 0 -1 3 -1 0 2 1 -1
-1 2 1 1 2 0 2 1 1 3 3 0 1 2 0 1 0 -1 1 1 3 -1 2 1 3
1 1 1 2 -1 5 -1 1 3 -1 0 1 0 0 1 -1 -1 -1 2 2 0 1 1 3 0
0 0 1 4 4 -1 -1 -1 -1 4 -1 4 4 -1 4 -1 1 2 2 3 0 1 0 -1 1
0 0 1 -1 -1 0 2 1 0 2 -1 1 1 -1 -1 0 1 1 -1 3 1 1 -1 1 1
0 0 -1 0 -1 0 0 2 -1 1 -1 1 0 -1 2 1 3 1 1 -1 1 0 0 -1 0
0 3 2 0 0 5 -1 3 2 -1 5 4 4 4 -1 5 5 -1 4 0 4 4 4 5 4
4 5 5 0 5 4 -1 4 5 5 5 1 5 5 0 5 4 4 -1 4 4 5 4 0 5
4 -1 0 5 5 5 -1 4 5 5 5 5 4 4]
*******
Noise raito: 22.15%
Estimated number of clusters: 6
Silhouette Coefficient: 0.710
Cluster 0 :
[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22]
Cluster 1 :
[23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23]
Cluster 2 :
[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
Cluster 3 :
[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21]
Cluster 4 :
[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
Cluster 5 :
[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]

# flatten()函数用法
#
# flatten 即 numpy.ndarray.flatten 方法,返回一个一维数组。
#
# flatten 只适用于 numpy 对象,即 array 或者 mat,普通的 list 列表不适用!
#
# a.flatten():a 是个数组,a.flatten() 就是把 a 降到一维,默认是按行的方向降。
# a.flatten().A:a 是个矩阵,降维后还是个矩阵,矩阵.A(等效于矩阵.getA())变成了数组。