当前位置:网站首页>Using k-means clustering to classify tariff models of different industries
Using k-means clustering to classify tariff models of different industries
2022-07-28 10:47:00 【interval_ package】
Here is a clustering of tariff models across different industries.
I previously fitted a VAR model, then used the VAR variance decomposition (FEVD) as the learning features.
Sharing it with you here.
The code draws on examples from the official scikit-learn website.
Comments are welcome if you would like to discuss.
import matplotlib.pyplot as plt
from VARModelFitting import *
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
import scipy.interpolate as spi
import time
class MyIndustry(object):
    """One industry's tariff data, turned into clustering features.

    Fits a VAR model to the industry's series and exposes its forecast-error
    variance decomposition (FEVD) as a row-wise feature matrix.
    """

    def __init__(self, Data: pd.DataFrame, name, lag=3):
        """
        Data: raw tariff time series for one industry (project-specific frame).
        name: industry label.
        lag:  VAR lag order.
        """
        self.name = name
        # FEVD decomposition blocks, indexed by forecast period.
        self.rawDecompInfo = self.fevdIndentity(Data, lag, 10)
        # Return value intentionally discarded: kept for parity with the
        # original (its side effect is printing the joined matrix).
        self.DataJointing()

    @staticmethod
    def fevdIndentity(Data, lag, MaxPeriod):
        """Fit a VAR model and return its FEVD decomposition matrix."""
        return VARFitter(Data, lag)[1].fevd(MaxPeriod).decomp

    def DataJointing(self, sepecificFevd=-1, enhanceFlag=True, enhanceCount=1000):
        """Stack FEVD blocks into one matrix (rows = observations).

        sepecificFevd: index of a single FEVD block to use; a negative value
                       means concatenate all blocks row-wise.
        enhanceFlag:   when True, spline-interpolate each column up to
                       enhanceCount rows (data augmentation).
        """
        if sepecificFevd < 0:
            blocks = list(self.rawDecompInfo)
            output = np.concatenate(blocks, axis=0)
        else:
            output = self.rawDecompInfo[sepecificFevd]
        print("output:\n", output)
        if enhanceFlag:
            # BUG FIX: enhanceCount was previously passed positionally, which
            # bound it to DataEnhance's `_min` parameter -- the requested
            # count was silently ignored and the default (50) used instead.
            output = self.DataEnhance(output, enhanceCount=enhanceCount)
            print("enhanceFlag output:\n", output)
        return output

    @staticmethod
    def DataEnhance(data: np.ndarray, _min=-1, _max=-1, enhanceCount=50):
        """Up-sample each column of `data` to `enhanceCount` points using
        cubic B-spline interpolation over the period axis [_min, _max].

        Negative _min/_max means "use the full period range of `data`".
        Returns an (enhanceCount, n_columns) ndarray.
        """
        if _min < 0 or _max < 0:
            _min = 0
            _max = data.shape[0]
        periods = np.arange(_min, _max)
        amplifiedPeriods = np.linspace(_min, _max, enhanceCount)
        columns = []
        for row in data.T:
            tck = spi.splrep(periods, row)
            interpolated = spi.splev(amplifiedPeriods, tck, der=0)
            columns.append(interpolated.reshape(-1, 1))
        # Same fall-through as the original: empty input yields an empty list.
        return np.hstack(columns) if columns else []

    def industryIdentityDefine(self, k_means_cluster_centers, mbk_means_cluster_centers, n_clusters, sepecificFevd=0):
        """Count how many of this industry's FEVD rows fall into each cluster.

        Returns two (n_clusters, 1) arrays of per-cluster counts, one for the
        KMeans centres and one for the MiniBatchKMeans centres.
        """
        inputs = self.DataJointing(sepecificFevd=sepecificFevd, enhanceFlag=False)
        # pairwise_distances_argmin assigns each row to its nearest centre.
        k_means_labels = pairwise_distances_argmin(inputs, k_means_cluster_centers)
        mbk_means_labels = pairwise_distances_argmin(inputs, mbk_means_cluster_centers)
        k_means_result = np.zeros((n_clusters, 1))
        mbk_means_result = np.zeros((n_clusters, 1))
        for i in range(n_clusters):
            k_means_result[i] = np.sum(k_means_labels == i)
            mbk_means_result[i] = np.sum(mbk_means_labels == i)
        return k_means_result, mbk_means_result
def industryIdentityShow(k_means_cluster_centers, mbk_means_cluster_centers, n_clusters, sepecificFevd=0):
    """Bar-plot, for every industry class, how its FEVD rows distribute over
    the previously learned KMeans / MiniBatchKMeans cluster centres."""
    plt.figure("industryIdentityShow", figsize=(20, 16))
    Data_base, classNames = GetClasses()
    for idx, rawName in enumerate(classNames):
        frame = ProcessBaseData(ReadTariffData(Data_base, rawName))
        label = rawName.strip()
        try:
            k_counts, mbk_counts = MyIndustry(frame, label).industryIdentityDefine(
                k_means_cluster_centers, mbk_means_cluster_centers,
                n_clusters, sepecificFevd)
            plt.subplot(5, 5, idx + 1)
            plt.title(label)
            plt.bar(np.arange(k_counts.shape[0]), k_counts.T[0])
        except np.linalg.LinAlgError as e:
            # Some industries' series make the VAR fit singular; skip them.
            print(label + ": fail the var model LinAlgError: ", e)
            print(repr(e))
    plt.show()
def clusteringPeriod(inputs, batch_size=45, n_clusters=5):
    """Fit full-batch KMeans and MiniBatchKMeans on `inputs`.

    Returns (kmeans, kmeans_fit_seconds, minibatch, minibatch_fit_seconds).
    """
    # ---- full-batch k-means ------------------------------------------------
    full_kmeans = KMeans(init="k-means++", n_clusters=n_clusters, n_init=10)
    started = time.time()
    full_kmeans.fit(inputs)
    t_batch = time.time() - started
    print('cluster_centers of k_means:\n', full_kmeans.cluster_centers_)
    # ---- mini-batch k-means ------------------------------------------------
    mini_batch = MiniBatchKMeans(
        init="k-means++",
        n_clusters=n_clusters,
        batch_size=batch_size,
        n_init=10,
        max_no_improvement=10,
        verbose=0,
    )
    started = time.time()
    mini_batch.fit(inputs)
    t_mini_batch = time.time() - started
    print('cluster_centers of MiniBatchKMeans:\n', mini_batch.cluster_centers_)
    return full_kmeans, t_batch, mini_batch, t_mini_batch
def clusteringComparingPloting(inputs, batch_size=45, n_clusters=5):
    """Cluster `inputs` with KMeans and MiniBatchKMeans, pair their centres,
    and plot both assignments plus the points on which they disagree.

    Returns (k_means_cluster_centers, mbk_means_cluster_centers, n_clusters).
    """
    k_means, t_batch, mbk, t_mini_batch = clusteringPeriod(inputs, batch_size, n_clusters)
    n_clusters = k_means.n_clusters
    # #############################################################################
    # Plot result
    fig = plt.figure(figsize=(8, 3))
    fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
    # BUG FIX: the original hard-coded 3 colours, so zip() silently dropped
    # every cluster beyond the third when n_clusters > 3 (default here is 5).
    colors = [plt.cm.tab10(k % 10) for k in range(n_clusters)]
    # We want the same colours for the same cluster from MiniBatchKMeans and
    # KMeans, so pair each KMeans centre with its closest MiniBatch centre.
    k_means_cluster_centers = k_means.cluster_centers_
    order = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)
    mbk_means_cluster_centers = mbk.cluster_centers_[order]
    # Assign every input row to its nearest centre under each model.
    k_means_labels = pairwise_distances_argmin(inputs, k_means_cluster_centers)
    mbk_means_labels = pairwise_distances_argmin(inputs, mbk_means_cluster_centers)
    # KMeans
    ax = fig.add_subplot(1, 3, 1)
    for k, col in zip(range(n_clusters), colors):
        my_members = k_means_labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(inputs[my_members, 0], inputs[my_members, 1], "w", markerfacecolor=col, marker=".")
        ax.plot(
            cluster_center[0],
            cluster_center[1],
            "o",
            markerfacecolor=col,
            markeredgecolor="k",
            markersize=6,
        )
    ax.set_title("KMeans")
    ax.set_xticks(())
    ax.set_yticks(())
    # NOTE(review): text position is inherited from the sklearn demo and
    # assumes the demo's data range -- may land off-plot for other data.
    plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_batch, k_means.inertia_))
    # MiniBatchKMeans
    ax = fig.add_subplot(1, 3, 2)
    for k, col in zip(range(n_clusters), colors):
        my_members = mbk_means_labels == k
        cluster_center = mbk_means_cluster_centers[k]
        ax.plot(inputs[my_members, 0], inputs[my_members, 1], "w", markerfacecolor=col, marker=".")
        ax.plot(
            cluster_center[0],
            cluster_center[1],
            "o",
            markerfacecolor=col,
            markeredgecolor="k",
            markersize=6,
        )
    ax.set_title("MiniBatchKMeans")
    ax.set_xticks(())
    ax.set_yticks(())
    plt.text(-3.5, 1.8, "train time: %.2fs\ninertia: %f" % (t_mini_batch, mbk.inertia_))
    # Initialise the `different` mask to all False.
    # BUG FIX: the original used `mbk_means_labels == 4`, which is only
    # all-False when n_clusters <= 4; with n_clusters=5 it wrongly pre-marked
    # every point of mini-batch cluster 4 as "different".
    different = np.zeros(inputs.shape[0], dtype=bool)
    ax = fig.add_subplot(1, 3, 3)
    for k in range(n_clusters):
        different += (k_means_labels == k) != (mbk_means_labels == k)
    identic = np.logical_not(different)
    ax.plot(inputs[identic, 0], inputs[identic, 1], "w", markerfacecolor="#bbbbbb", marker=".")
    ax.plot(inputs[different, 0], inputs[different, 1], "w", markerfacecolor="m", marker=".")
    ax.set_title("Difference")
    ax.set_xticks(())
    ax.set_yticks(())
    plt.show()
    return k_means_cluster_centers, mbk_means_cluster_centers, n_clusters
# In short: outputs a matrix in which every row is one sample for the learner.
def unpackData(unpackSepecificItem=-1, enhanceFlag=True):
    """Build the training matrix: one FEVD-derived feature block per industry,
    stacked row-wise. Industries whose VAR fit fails are reported and skipped.

    unpackSepecificItem: forwarded to MyIndustry.DataJointing (which FEVD
                         block to use; negative = all).
    enhanceFlag:         forwarded to MyIndustry.DataJointing (spline
                         augmentation on/off).
    Raises RuntimeError if no industry produced usable data.
    """
    Data_base, classNames = GetClasses()
    blocks = []
    for name in classNames:
        Data = ProcessBaseData(ReadTariffData(Data_base, name))
        name = name.strip()
        try:
            blocks.append(MyIndustry(Data, name).DataJointing(unpackSepecificItem, enhanceFlag))
        except TypeError as e:
            print(name + ": fail the var model, TypeError")
            print(repr(e))
            continue
        except np.linalg.LinAlgError as e:
            print(name + ": fail the var model LinAlgError: ", e)
            print(repr(e))
        except Exception as e:
            print(name + ": fail the var model, unknown: ", e)
            print(repr(e))
            continue
    if not blocks:
        # BUG FIX: the original fell through to `[].shape` and crashed with an
        # AttributeError when every industry's fit failed.
        raise RuntimeError("unpackData: no industry produced usable data")
    # Concatenate once at the end instead of growing an ndarray per iteration.
    output = np.concatenate(blocks, axis=0)
    print(output.shape)
    return output
def main():
    """Pipeline entry point: build features, cluster them, then visualise the
    per-industry cluster membership."""
    features = unpackData(unpackSepecificItem=1, enhanceFlag=True)
    centers_km, centers_mbk, cluster_count = clusteringComparingPloting(features, n_clusters=5)
    industryIdentityShow(centers_km, centers_mbk, cluster_count, sepecificFevd=0)


if __name__ == '__main__':
    main()
边栏推荐
- c语言进阶篇:指针(一)
- 20200229训练赛 L1 - 2 删除字符串中的子串 (20分)
- PyQt5快速开发与实战 4.13 菜单栏、工具栏与状态栏 and 4.14 QPrinter
- GKSphereObstacle
- SQL Server 2016 learning record - nested query
- Read write separation standby backup error
- 2019年9月PAT甲级题目
- GKNoise
- Add new startup logo and startup / shutdown animation in mt6735
- Chapter 1: cross end development of small programs of uniapp ----- create a uniapp project
猜你喜欢

SDUT Round 9 2020 Spring Festival campaign

生成对抗网络在DeepFake中的未来
![[application of stack] - infix expression to suffix expression](/img/c1/879716342f6dd5eaa8b79c752eca16.png)
[application of stack] - infix expression to suffix expression

Semeval 2022 | introducing knowledge into ner system, aridamo academy won the best paper award

SemEval 2022 | 将知识引入NER系统,阿里达摩院获最佳论文奖

Aike AI frontier promotion (7.28)

Excel word 简单 技巧 整理(持续更新 大概~)

GKConstantNoiseSource

3、MapReduce详解与源码分析

8、Yarn系统架构与原理详解
随机推荐
ACM winter vacation training 7
机器学习--手写英文字母2--导入与处理数据
GKARC4RandomSource
GKNoiseSource
Pyqt5 rapid development and practice 4.13 menu bar, toolbar and status bar and 4.14 qprinter
andorid 开发
GKNoiseSource
最短路专题
Advanced C language: pointer (1)
GKRidgedNoiseSource
SQL Server 2016 learning record - Data Definition
Use of Ogg parameter filter [urgent]
ACM寒假集训#4
GKCheckerboardNoiseSource
Redis-day01 common sense supplement and redis introduction
6、MapReduce自定义分区实现
Qt生成.exe文件 并 在无Qt环境下运行(Enigma Virtual Box进行绿色可执行软件封装)图文教程
20200229 training race L2 - 2 tree species Statistics (25 points)
GKSphereObstacle
粒子群解决tsp的技术问题