Current location: Home > Using cluster analysis to build a credit card high-risk customer identification model

Using cluster analysis to build a credit card high-risk customer identification model

2022-07-23 11:47:00 【mandala -chen】

Problem description

Insert picture description here

Import data and view datasets

"""
import pandas as pd
import numpy as np
# Load the raw customer table from a CSV file in the working directory.
csv_path = "credit_card.csv"
data = pd.read_csv(csv_path)
# Inspect the data set: column summary first, then descriptive statistics.
data.info()
data.describe()

Build the historical behavior features

### Historical behavior characteristics
# Select the six historical-behavior columns by position.
data_active=data.iloc[:,[2,3,4,6,7,8]]
# Frame that collects the clustering features; it starts with column 0 only.
# Take an explicit copy: new columns are assigned to this frame later on, and
# assigning into a selection of `data` would be a chained assignment
# (SettingWithCopyWarning, and ambiguous as to which object is modified).
data_means=data.iloc[:,[0]].copy()
# View the data set of historical behavior characteristics:
data_active.describe()
data_active.info()
# Binary score for a single historical-behavior value.
def GetScore(x):
    """Return 0 when ``x`` is at least 2, otherwise 1.

    Any value for which ``x >= 2`` is false (smaller numbers, NaN) scores 1.
    NOTE(review): presumably the columns are coded 1 = record present,
    2 = no record, so a score of 1 marks a risky customer -- confirm against
    the data dictionary.
    """
    return 0 if x>=2 else 1
# Score each of the six historical-behavior columns with GetScore; the
# resulting Series are unpacked in the same order as the column names below.
(score_1, score_2, score_3, score_4, score_5, score_6) = (
    data_active[column].apply(GetScore)
    for column in (
        ' Defective account ',
        ' Within the time limit ',
        ' Bad debts ',
        ' To refund a ticket ',
        ' Refuse to record ',
        ' Forced card stop record ',
    )
)
# Weighted sum: weight 1 for score_1 and score_6, weight 2 for score_2,
# weight 3 for score_3, score_4 and score_5.
data_means.loc[:, 'history_credit_risk'] = (
    score_1 + score_6 + 2 * score_2 + 3 * (score_3 + score_4 + score_5)
)

Build the economic risk features

### Characteristics of economic risk situation
# Positions of the columns that feed the economic-risk features.
economic_columns = [5, 18, 19, 21, 22]
data_encomic = data.iloc[:, economic_columns]
# Inspect the selected columns: descriptive statistics, then column summary.
data_encomic.describe()
data_encomic.info()
# Loan limit score
def GetScore_encomic(x):
    """Return 1 when ``x`` is at least 2, otherwise 0.

    Any value for which ``x >= 2`` is false (smaller numbers, NaN) scores 0.
    """
    return 1 if x>=2 else 0
# Loan limit score: 1 when the monthly credit card amount value is >= 2.
score_yu=data_encomic[' Monthly credit card amount '].apply(GetScore_encomic)

# Personal monthly income score: 0 when income / expenses is below 1, else 1.
# The comparison is vectorised and positional, so it does not depend on the
# frame having a default 0..n-1 index. Ratios that are NaN or inf (zero or
# missing denominators) are not "< 1" and therefore score 1.
data_person=data_encomic[' Personal monthly income ']/data_encomic[' Personal monthly expenses ']
data_person_Scores=np.where(data_person<1,0,1).tolist()

# Family monthly income score: 0 when family income / monthly credit card
# amount is below 1, else 1.
# Fix: this score must be derived from data_mouth. Testing data_person here
# left data_mouth unused and counted the personal-income score twice.
data_mouth=data_encomic[' Family monthly income ']/data_encomic[' Monthly credit card amount ']
data_mouth_Scores=np.where(data_mouth<1,0,1).tolist()

# Economic risk is the unweighted sum of the three binary scores.
data_means['economic_risk']=np.array(data_mouth_Scores) + np.array(data_person_Scores)+np.array(score_yu)

Build the income risk features

### Characteristics of income risk situation
# Positions of the columns that feed the income-risk features.
income_columns = [14, 17, 20]
data_shouru = data.iloc[:, income_columns]
# Inspect the selected columns: descriptive statistics, then column summary.
data_shouru.describe()
data_shouru.info()
def _job_score(code):
    """Map one occupation code to its risk score.

    Codes <= 7, 19 and 21 score 2; codes 8-11 score 1; codes 12-18, 20 and 22
    score 0.

    Raises:
        ValueError: if ``code`` matches none of the ranges (for example NaN,
            a non-integer value, or a code above 22). Appending nothing for
            such a row would leave the score list shorter than the data and
            out of alignment with the other scores.
    """
    if code <= 7 or code in (19, 21):
        return 2
    if 8 <= code <= 11:
        return 1
    if 12 <= code <= 18 or code in (20, 22):
        return 0
    raise ValueError("Unrecognised occupation code: {!r}".format(code))


# The scores iterate over column values rather than `.loc[i, ...]` with
# i in range(n), so they do not require a default 0..n-1 index.

# Home rating: values 3 to 5 inclusive score 0, everything else scores 1.
HouseScore = [0 if 3 <= home <= 5 else 1 for home in data_shouru[' Home ']]
# Career rating: exactly one score per row (see _job_score).
JobScore = [_job_score(code) for code in data_shouru[' occupation ']]
# Age rating: values <= 2 score 1, everything else scores 0.
AgeScore = [1 if age <= 2 else 0 for age in data_shouru[' Age ']]
# Income risk is the unweighted sum of the three scores.
data_means['income_risk'] = np.array(HouseScore) + np.array(JobScore) + np.array(AgeScore)

Clustering analysis

#### Clustering analysis
# Standardise the three constructed risk features before clustering.
# They are selected by name (same order as positions 1, 2, 3) so the selection
# does not depend on where the columns sit in data_means.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
data_means_stander = sc_X.fit_transform(
    data_means[['history_credit_risk', 'economic_risk', 'income_risk']]
)
# Clustering training
from sklearn.cluster import KMeans  # Import the k-means algorithm
k = 5  # Number of cluster centers
# Build the model.
# `n_jobs` is no longer passed: KMeans dropped that parameter in
# scikit-learn 1.0 and raises TypeError when it is given.
# `n_init=10` pins the historical default so results stay comparable across
# scikit-learn versions (newer releases default to 'auto').
kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=123)
fit_kmeans = kmeans_model.fit(data_means_stander)   # model training
# Total risk per customer: the plain sum of the three feature scores.
data_means['count'] = (
    data_means.loc[:, 'history_credit_risk']
    + data_means['economic_risk']
    + data_means['income_risk']
)
# Customers ordered by ascending total risk (returns a new frame).
sort_values = data_means.sort_values(by="count")

# Look at the cluster centers (in standardised feature space).
kmeans_model.cluster_centers_

# Attach each customer's cluster label. The column name 'lable' is kept
# exactly as is because it is written to the output file.
cluster_labels = kmeans_model.labels_
data_means['lable'] = cluster_labels

# Number of customers in each cluster.
r1 = pd.Series(cluster_labels).value_counts()

# Export features, total score and cluster label without the index column.
data_means.to_csv("mean.csv", sep=',', index=False, encoding="utf_8_sig")