当前位置：网站首页>Simple code implementation of K-means clustering

Simple code implementation of K-means clustering

2022-07-29 03:24:00 【Order anything】

Clustering is a typical unsupervised learning technique. It is mainly used to automatically group similar samples into the same category.

A clustering algorithm divides the samples into different categories according to the similarity between them. Different similarity measures produce different clustering results; a common choice is the Euclidean distance.

A simple implementation of k-means clustering:

# coding:utf-8
from math import *
from random import random

import numpy as np
from numpy import *
from numpy.ma import power,mean,nonzero
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
class myOwn_k_means2(object):
    """Minimal k-means clusterer based on the Euclidean distance.

    Methods:
    1. fit: draw k random starting centroids for x, then refine them.
    2. choose_centroid: sample k centroids inside the bounding box of the data.
    3. distance: Euclidean distance between two points.
    4. key_k_means: core loop -- assign every sample to its nearest centroid
       and move each centroid to the mean of its samples until the
       assignment stops changing.
    5. predict: label every sample with the index of its nearest centroid.

    Attributes set by fit:
        centroids: np.matrix of shape (k, n_features) holding the centroids.
        new_centroids: the object returned by key_k_means (the same matrix).
    """

    def fit(self, x, k):
        """Learn k centroids from the training samples.

        :param x: 2-D array-like of shape (n_samples, n_features)
        :param k: number of clusters
        :return: None; the result is stored in self.centroids and
                 self.new_centroids
        """
        self.centroids = self.choose_centroid(x, k)
        self.new_centroids = self.key_k_means(x, k)

    def choose_centroid(self, x, k):
        """Draw k random centroids inside the bounding box of the samples.

        Every coordinate is min + (max - min) * u for that feature, with u
        taken from np.random.rand (NumPy's global random state).

        :param x: 2-D array-like of shape (n_samples, n_features)
        :param k: number of centroids to draw
        :return: np.matrix of shape (k, n_features)
        :raises ValueError: if x contains no samples
        """
        data = np.asarray(x, dtype=float)
        # Number of columns, i.e. the number of features.
        num = data.shape[1]
        if data.shape[0] == 0:
            raise ValueError("cannot choose centroids from an empty data set")

        lows = data.min(axis=0)
        spans = data.max(axis=0) - lows
        # Drawing a (num, k) block and transposing it fills the centroids one
        # feature column at a time.
        draws = np.random.rand(num, k).T
        return np.asmatrix(lows + spans * draws)

    def distance(self, disA, disB):
        """Return the Euclidean distance between two points.

        :param disA: first point (array-like or 1 x n matrix)
        :param disB: second point (array-like or 1 x n matrix)
        :return: the Euclidean distance between the two points
        """
        diff = np.asarray(disA, dtype=float) - np.asarray(disB, dtype=float)
        return np.sqrt(np.sum(np.square(diff)))

    def _nearest_centroid(self, x, centroids):
        """Return, for every row of x, the index of the closest centroid row."""
        points = np.asarray(x, dtype=float)
        if points.shape[0] == 0:
            return np.empty(0, dtype=np.intp)
        centers = np.asarray(centroids, dtype=float)
        # Broadcasting builds the full (n_samples, k) distance table at once.
        diffs = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
        dists = np.sqrt(np.sum(np.square(diffs), axis=2))
        # argmin keeps the first minimum, so ties go to the lowest index.
        return np.argmin(dists, axis=1)

    def key_k_means(self, x, k):
        """Core of k-means: cluster the samples and readjust the centroids.

        Starts from self.centroids (drawn with choose_centroid when the
        attribute is missing) and refines its first k rows in place. A
        centroid that ends up with no samples keeps its previous position,
        because the mean of an empty cluster would be NaN.

        :param x: 2-D array-like of shape (n_samples, n_features)
        :param k: number of clusters
        :return: self.centroids, the refined (k, n_features) np.matrix
        :raises ValueError: if k is smaller than 1
        """
        print(' Start to calculate the centroid position ')
        if k < 1:
            raise ValueError("k must be a positive integer")

        data = np.asarray(x, dtype=float)
        if getattr(self, "centroids", None) is None:
            self.centroids = self.choose_centroid(data, k)
        # Float matrix so that the mean positions can be written back.
        self.centroids = np.asmatrix(self.centroids, dtype=float)

        # -1 matches no centroid, so the first pass always counts as a change.
        labels = np.full(data.shape[0], -1)
        while True:
            nearest = self._nearest_centroid(
                data, np.asarray(self.centroids)[:k])
            if np.array_equal(nearest, labels):
                break
            labels = nearest
            # Move every centroid to the mean of the samples assigned to it.
            for cent in range(k):
                members = data[np.nonzero(labels == cent)[0]]
                if len(members):
                    self.centroids[cent, :] = members.mean(axis=0)
        return self.centroids

    def predict(self, X):
        """Assign every sample to its nearest trained centroid.

        :param X: 2-D array-like of shape (n_samples, n_features)
        :return: 1-D integer array with one centroid index per sample
        """
        return self._nearest_centroid(X, self.centroids)




# Demo: cluster synthetic blob samples with the hand-written k-means above
# and plot the samples coloured by their predicted cluster.

# Generate the data set (y is returned by make_blobs but not used below).
n_samples, random_state = 1500, 170
X, y = make_blobs(n_samples=n_samples, random_state=random_state)

# Train on the samples, then label each one with its cluster index.
my_kmeans = myOwn_k_means2()
my_kmeans.fit(X, 3)
y_pred = my_kmeans.predict(X)

# Draw the scatter plot in the first cell of a 2 x 2 subplot grid.
plt.figure(figsize=(12, 12))
plt.subplot(221)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title(" I am clustering ")
plt.show()

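One function that may be new to the reader is used here: nonzero(), which returns the indices of the non-zero (here: True) elements of an array.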

原网站

版权声明
本文为[Order anything]所创，转载请带上原文链接，感谢
https://yzsam.com/2022/196/202207130553300678.html

当前位置：网站首页>Simple code implementation of K-means clustering

Simple code implementation of K-means clustering

边栏推荐

猜你喜欢

随机推荐