
Using k-means clustering to classify tariff models of different industries

2022-07-28 10:47:00 interval_ package

This post clusters different industries by their tariff patterns.

I had already fitted a VAR model for each industry, and here the VAR's forecast error variance decomposition (FEVD) is used as the feature the clustering learns from.

Sharing the code with you.

Part of the code is adapted from the example on the sklearn official website.

Comments are welcome if you'd like to discuss.
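
VARModelFitting is a local module of mine and is not shown in this post. As a rough idea of how it is called below (the real module may well differ), a minimal VARFitter built on statsmodels could look like the sketch here, returning (model, results) so that VARFitter(Data, lag)[1].fevd(MaxPeriod).decomp gives the variance decomposition array:

# Hypothetical sketch of VARFitter (the real one lives in VARModelFitting).
# It fits a VAR(lag) with statsmodels and returns (model, results);
# results.fevd(MaxPeriod).decomp is then an array of shape (n_vars, MaxPeriod, n_vars).
import pandas as pd
from statsmodels.tsa.api import VAR


def VARFitter(data: pd.DataFrame, lag: int):
    model = VAR(data)
    results = model.fit(lag)
    return model, results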

import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.interpolate as spi
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin

from VARModelFitting import *


class MyIndustry(object):
    def __init__(self, Data: pd.DataFrame, name, lag=3):
        self.name = name
        self.rawDecompInfo = self.fevdIndentity(Data, lag, 10)
        self.DataJointing()
        # print(name,':\n',self.rawDecompInfo)
        pass

    @staticmethod
    def fevdIndentity(Data, lag, MaxPeriod):
        return VARFitter(Data, lag)[1].fevd(MaxPeriod).decomp

    def DataJointing(self, sepecificFevd=-1, enhanceFlag=True, enhanceCount=1000):
        # sepecificFevd < 0: stack the FEVD blocks of every variable vertically;
        # otherwise take one variable's FEVD and optionally spline-enhance it.
        if sepecificFevd < 0:
            output = []
            for item in self.rawDecompInfo:
                if isinstance(output, np.ndarray):
                    output = np.concatenate((output, item), axis=0)
                else:
                    output = item
        else:
            output = self.rawDecompInfo[sepecificFevd]
            print("output:\n", output)
            if enhanceFlag:
                # pass enhanceCount by keyword: the second positional slot of DataEnhance is _min
                output = self.DataEnhance(output, enhanceCount=enhanceCount)
                print("enhanceFlag output:\n", output)
        return output

    @staticmethod
    def DataEnhance(data: np.ndarray, _min=-1, _max=-1, enhanceCount=50):
        # Resample each column of `data` (periods x variables) onto a denser grid
        # with a cubic B-spline, giving enhanceCount rows per curve.
        if _min < 0 or _max < 0:
            _min = 0
            _max = data.shape[0]
        output = []
        rawData = data.T
        periods = np.arange(_min, _max)
        # keep the evaluation grid inside the sampled range (the last period is _max - 1)
        amplifiedPeriods = np.linspace(_min, _max - 1, enhanceCount)
        for row in rawData:
            tck = spi.splrep(periods, row)
            result = spi.splev(amplifiedPeriods, tck, der=0)
            result = result.reshape(result.shape[0], 1)
            if isinstance(output, np.ndarray):
                output = np.hstack((output, result))
            else:
                output = result
        return output

    def industryIdentityDefine(self, k_means_cluster_centers, mbk_means_cluster_centers, n_clusters, sepecificFevd=0):
        inputs = self.DataJointing(sepecificFevd=sepecificFevd, enhanceFlag=False)
        # use pairwise_distances_argmin to assign each sample to its nearest cluster centre
        k_means_labels = pairwise_distances_argmin(inputs, k_means_cluster_centers)
        mbk_means_labels = pairwise_distances_argmin(inputs, mbk_means_cluster_centers)
        k_means_result = np.zeros((n_clusters, 1))
        mbk_means_result = np.zeros((n_clusters, 1))
        for i in range(n_clusters):
            k_means_result[i] = sum(k_means_labels == i)
            mbk_means_result[i] = sum(mbk_means_labels == i)
        return k_means_result, mbk_means_result
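
# Aside: DataEnhance above resamples each FEVD curve with a cubic B-spline
# (scipy's splrep/splev) so a 10-period decomposition becomes a denser feature
# vector. The helper below is a self-contained demo of that same call; the
# input curve is made up purely for illustration and is not used elsewhere.
def _spline_enhance_demo(enhanceCount=50):
    periods = np.arange(10)                     # original FEVD horizon: 10 periods
    curve = np.exp(-0.3 * periods)              # a made-up, decaying contribution share
    tck = spi.splrep(periods, curve)            # cubic B-spline representation
    dense = spi.splev(np.linspace(0, 9, enhanceCount), tck)  # evaluate on a denser grid
    return dense                                # shape: (enhanceCount,)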


def industryIdentityShow(k_means_cluster_centers, mbk_means_cluster_centers, n_clusters, sepecificFevd=0):
    plt.figure("industryIdentityShow", figsize=(20, 16))
    Data_base, classNames = GetClasses()
    for name, idx in zip(classNames, range(0, len(classNames))):
        Data = ReadTariffData(Data_base, name)
        Data = ProcessBaseData(Data)
        name = name.strip()
        try:
            k_means_result, mbk_means_result = \
                MyIndustry(Data, name).industryIdentityDefine(k_means_cluster_centers, mbk_means_cluster_centers,
                                                              n_clusters, sepecificFevd)
            # NOTE: the data structure here still needs some work
            plt.subplot(5, 5, idx + 1), plt.title(name)
            plt.bar(np.arange(k_means_result.shape[0]), k_means_result.T[0])
            # plt.subplot(5, 10, 2 * idx + 2), plt.title(name)
            # plt.bar(np.arange(k_means_result.shape[0]), mbk_means_result.T[0]), plt.axis('off')
        except np.linalg.LinAlgError as e:
            print(name + ": fail the var model LinAlgError: ", e)
            print(repr(e))

    plt.show()
    pass


def clusteringPeriod(inputs, batch_size=45, n_clusters=5):
    # #############################################################################
    # k means
    k_means = KMeans(init="k-means++", n_clusters=n_clusters, n_init=10)
    t0 = time.time()
    k_means.fit(inputs)
    t_batch = time.time() - t0

    print('cluster_centers of k_means:\n', k_means.cluster_centers_)

    # #############################################################################
    # Compute clustering with MiniBatchKMeans

    mbk = MiniBatchKMeans(
        init="k-means++",
        n_clusters=n_clusters,
        batch_size=batch_size,
        n_init=10,
        max_no_improvement=10,
        verbose=0,
    )
    t0 = time.time()
    mbk.fit(inputs)
    t_mini_batch = time.time() - t0

    print('cluster_centers of MiniBatchKMeans:\n', mbk.cluster_centers_)

    return k_means, t_batch, mbk, t_mini_batch


def clusteringComparingPloting(inputs, batch_size=45, n_clusters=5):
    k_means, t_batch, mbk, t_mini_batch = clusteringPeriod(inputs, batch_size, n_clusters)
    n_clusters = k_means.n_clusters
    # #############################################################################
    # Plot result

    fig = plt.figure(figsize=(8, 3))
    fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
    # enough colors for the default n_clusters=5 (zip stops at the shorter sequence)
    colors = ["#4EACC5", "#FF9C34", "#4E9A06", "#A56DB0", "#D43F3A"]

    # We want to have the same colors for the same cluster from the
    # MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
    # closest one.
    k_means_cluster_centers = k_means.cluster_centers_
    order = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)
    mbk_means_cluster_centers = mbk.cluster_centers_[order]

    # use pairwise_distances_argmin to assign each sample to its nearest cluster centre
    k_means_labels = pairwise_distances_argmin(inputs, k_means_cluster_centers)
    mbk_means_labels = pairwise_distances_argmin(inputs, mbk_means_cluster_centers)

    # KMeans
    ax = fig.add_subplot(1, 3, 1)
    for k, col in zip(range(n_clusters), colors):
        my_members = k_means_labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(inputs[my_members, 0], inputs[my_members, 1], "w", markerfacecolor=col, marker=".")
        ax.plot(
            cluster_center[0],
            cluster_center[1],
            "o",
            markerfacecolor=col,
            markeredgecolor="k",
            markersize=6,
        )
    ax.set_title("KMeans")
    ax.set_xticks(())
    ax.set_yticks(())
    plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_batch, k_means.inertia_))

    # MiniBatchKMeans
    ax = fig.add_subplot(1, 3, 2)
    for k, col in zip(range(n_clusters), colors):
        my_members = mbk_means_labels == k
        cluster_center = mbk_means_cluster_centers[k]
        ax.plot(inputs[my_members, 0], inputs[my_members, 1], "w", markerfacecolor=col, marker=".")
        ax.plot(
            cluster_center[0],
            cluster_center[1],
            "o",
            markerfacecolor=col,
            markeredgecolor="k",
            markersize=6,
        )
    ax.set_title("MiniBatchKMeans")
    ax.set_xticks(())
    ax.set_yticks(())
    plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_mini_batch, mbk.inertia_))

    # Initialise the `different` mask to all False
    different = np.zeros(inputs.shape[0], dtype=bool)
    ax = fig.add_subplot(1, 3, 3)

    for k in range(n_clusters):
        different += (k_means_labels == k) != (mbk_means_labels == k)

    identic = np.logical_not(different)
    ax.plot(inputs[identic, 0], inputs[identic, 1], "w", markerfacecolor="#bbbbbb", marker=".")
    ax.plot(inputs[different, 0], inputs[different, 1], "w", markerfacecolor="m", marker=".")
    ax.set_title("Difference")
    ax.set_xticks(())
    ax.set_yticks(())

    plt.show()

    return k_means_cluster_centers, mbk_means_cluster_centers, n_clusters


# In short: build one matrix whose rows are the samples the clustering learns from
def unpackData(unpackSepecificItem=-1, enhanceFlag=True):
    Data_base, classNames = GetClasses()
    output = []
    for name in classNames:
        Data = ReadTariffData(Data_base, name)
        Data = ProcessBaseData(Data)
        name = name.strip()
        try:
            if isinstance(output, np.ndarray):
                output = np.concatenate((output, MyIndustry(Data, name).DataJointing(unpackSepecificItem, enhanceFlag)),
                                        axis=0)
            else:
                output = MyIndustry(Data, name).DataJointing(unpackSepecificItem, enhanceFlag)
        except TypeError as e:
            print(name + ": fail the var model, TypeError")
            print(repr(e))
            # warnings.warn(name+": fail the var model")
            continue
        except np.linalg.LinAlgError as e:
            print(name + ": fail the var model LinAlgError: ", e)
            print(repr(e))
        except Exception as e:
            print(name + ": fail the var model, unknown: ", e)
            print(repr(e))
            # warnings.warn(name+": fail the var model")
            continue
    print(output.shape)
    return output


def main():
    data = unpackData(unpackSepecificItem=1, enhanceFlag=True)
    k_means_cluster_centers, mbk_means_cluster_centers, n_clusters = clusteringComparingPloting(data, n_clusters=5)
    industryIdentityShow(k_means_cluster_centers, mbk_means_cluster_centers, n_clusters, sepecificFevd=0)
    pass


if __name__ == '__main__':
    main()
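
GetClasses, ReadTariffData and ProcessBaseData come in through the `from VARModelFitting import *` line and read the tariff data set itself. If you want to try the pipeline without that data, the stand-ins below are purely hypothetical: the names, columns and numbers are made up, and they only mimic the shapes the code above expects.

# Hypothetical stand-ins for the data helpers imported from VARModelFitting,
# useful only for running the pipeline on synthetic data.
import numpy as np
import pandas as pd


def GetClasses():
    # returns (data_base, class_names); here the "data base" is just a dict of frames
    rng = np.random.default_rng(0)
    names = ["industry_A", "industry_B", "industry_C"]
    base = {n: pd.DataFrame(rng.normal(size=(120, 3)),
                            columns=["price", "volume", "tariff"])
            for n in names}
    return base, names


def ReadTariffData(data_base, name):
    return data_base[name]


def ProcessBaseData(data):
    # e.g. first-difference the series so they are closer to stationary before the VAR fit
    return data.diff().dropna()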

Copyright notice: this article was written by [interval_ package]. Please keep the original link when reposting, thanks.
https://yzsam.com/2022/209/202207280958450795.html