Current location: Home > Multivariate cluster analysis

Multivariate cluster analysis

2022-07-06 09:04:00 【Also far away】

1. Code

import pandas as pd
from pandas import DataFrame
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Paths: the workbook to cluster and the file the labelled rows are saved to.
outfile = 'stu.xlsx'
datafile = u'student-mat.xlsx'

# The input is an Excel workbook, hence read_excel (a CSV would use read_csv).
data = pd.read_excel(datafile)
d = pd.DataFrame(data)

# Clustering: partition the rows of d into n groups with K-means.
# NOTE(review): presumably every column of d is numeric; K-means cannot
# consume string columns -- confirm against the workbook.
n = 5  # number of clusters to form
# Seed the estimator so that cluster numbering, and therefore the labels
# written to the output file, is reproducible from run to run.
mod = KMeans(n_clusters=n, random_state=0)
# fit() is enough here: the assignments are read from mod.labels_ below,
# so the array returned by fit_predict() would simply be discarded.
mod.fit(d)

# Per-cluster summary: one row per cluster with its centre and its size.
r1 = pd.Series(mod.labels_).value_counts()  # number of samples in each cluster
r2 = pd.DataFrame(mod.cluster_centers_)     # cluster centres, one row per cluster
r = pd.concat([r2, r1], axis=1)             # joined on the cluster id (the index)
r.columns = list(d.columns) + [u' Number of categories ']
# `r` is rebound to the per-row labelled table further down, so display the
# summary now; otherwise it is computed and silently thrown away.
print(r)


# Attach the assigned cluster id to every input row as an extra column.
cluster_ids = pd.Series(mod.labels_, index=d.index)
r = pd.concat([d, cluster_ids], axis=1)
r.columns = [*d.columns, u' Clustering categories ']
print(r)
# Persist the labelled table; drop this call if no local copy is wanted.
r.to_excel(outfile)

# Visualisation: project the samples to 2-D with t-SNE, one marker style
# per cluster.

# Embed the feature table d only. Fitting on r would hand the cluster-label
# column to t-SNE as an extra feature and pull the layout towards the labels
# it is supposed to be illustrating.
tsne = TSNE()
ts = pd.DataFrame(tsne.fit_transform(d), index=r.index)

# One matplotlib format string per cluster, in cluster-id order; the list is
# reused cyclically if there are more clusters than styles.
styles = ['r.', 'go', 'g*', 'b.', 'b*']
labels = r[u' Clustering categories ']
for k in sorted(labels.unique()):
    a = ts[labels == k]  # embedded points belonging to cluster k
    plt.plot(a[0], a[1], styles[k % len(styles)])
plt.show()