Financial risk control practice -- feature derivation based on time series
2022-07-05 06:28:00 【Grateful_Dead424】
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
I wrote 35 feature-derivation functions for you; let's go through them one by one. They all slice monthly columns from a module-level DataFrame named data, where column inv1 holds the most recent month and invN the oldest. A small usage sketch follows the first function.
# Number of months with inv > 0 in the most recent p months
def Num(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.where(df > 0, 1, 0).sum(axis=1)
    return inv + '_num' + str(p), auto_value
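To make the expected layout concrete, here is a minimal sketch with made-up toy values (not the course data): two customers and three months of an 'ft' variable, bound to the global name data that every helper reads. The returned pair is a column name plus one value per row.
# Hypothetical toy frame: ft1 is the most recent month, ft3 the oldest
data = pd.DataFrame({'customer_id': [1, 2],
                     'ft1': [2, 0],
                     'ft2': [0, 0],
                     'ft3': [5, 1]})
name, values = Num('ft', 3)   # name -> 'ft_num3', values -> array([2, 1])
Because the helpers read the global data rather than taking the frame as a parameter, whatever DataFrame is currently bound to data is what gets sliced.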
# Number of months with inv == 0 in the most recent p months
def Nmz(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.where(df == 0, 1, 0).sum(axis=1)
    return inv + '_nmz' + str(p), auto_value

# Whether at least one of the most recent p months has inv > 0 (0/1 flag)
def Evr(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    arr = np.where(df > 0, 1, 0).sum(axis=1)
    auto_value = np.where(arr, 1, 0)
    return inv + '_evr' + str(p), auto_value

# Mean of inv over the most recent p months
def Avg(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.nanmean(df, axis=1)
    return inv + '_avg' + str(p), auto_value

# Sum of inv over the most recent p months
def Tot(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.nansum(df, axis=1)
    return inv + '_tot' + str(p), auto_value

# Sum of inv over months 2 to p+1 (the p months before the most recent one)
def Tot2T(inv, p):
    df = data.loc[:, inv + '2':inv + str(p + 1)]
    auto_value = df.sum(axis=1)
    return inv + '_tot2t' + str(p), auto_value

# Maximum of inv over the most recent p months
def Max(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.nanmax(df, axis=1)
    return inv + '_max' + str(p), auto_value

# Minimum of inv over the most recent p months
def Min(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.nanmin(df, axis=1)
    return inv + '_min' + str(p), auto_value
# Months since the most recent month with inv > 0 within the last p months (0 if it never happened)
def Msg(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    df_value = np.where(df > 0, 1, 0)
    auto_value = []
    for i in range(len(df_value)):
        row_value = df_value[i, :]
        if row_value.max() <= 0:
            # no month with inv > 0 in the window
            auto_value.append(0)
        else:
            indexs = 1
            for j in row_value:
                if j > 0:
                    break
                indexs += 1
            auto_value.append(indexs)
    return inv + '_msg' + str(p), auto_value

# Months since the most recent month with inv == 0 within the last p months (0 if it never happened)
def Msz(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    df_value = np.where(df == 0, 1, 0)
    auto_value = []
    for i in range(len(df_value)):
        row_value = df_value[i, :]
        if row_value.max() <= 0:
            # no month with inv == 0 in the window
            auto_value.append(0)
        else:
            indexs = 1
            for j in row_value:
                if j > 0:
                    break
                indexs += 1
            auto_value.append(indexs)
    return inv + '_msz' + str(p), auto_value
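The row-by-row scans in Msg and Msz can also be written with argmax, which finds the first position where a mask is true. The following is only a sketch under the same conventions as above (1-based month offsets, 0 when the condition never occurs in the window); the name Msg_vec is illustrative, not part of the original code.
# Vectorized equivalent of Msg (sketch)
def Msg_vec(inv, p):
    mask = data.loc[:, inv + '1':inv + str(p)].values > 0
    pos = mask.argmax(axis=1) + 1              # first True per row, 1-based
    auto_value = np.where(mask.any(axis=1), pos, 0)
    return inv + '_msg' + str(p), auto_value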
# inv in the current month / mean of inv over the most recent p months
def Cav(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = df[inv + '1'] / np.nanmean(df, axis=1)
    return inv + '_cav' + str(p), auto_value

# inv in the current month / minimum of inv over the most recent p months
def Cmn(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = df[inv + '1'] / np.nanmin(df, axis=1)
    return inv + '_cmn' + str(p), auto_value
# Maximum month-over-month increase of inv in the most recent p months (needs p >= 2)
def Mai(inv, p):
    arr = np.array(data.loc[:, inv + '1':inv + str(p)])
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i, :]
        value_lst = []
        for k in range(len(df_value) - 1):
            # column k is one month more recent than column k+1
            minus = df_value[k] - df_value[k + 1]
            value_lst.append(minus)
        auto_value.append(np.nanmax(value_lst))
    return inv + '_mai' + str(p), auto_value

# Maximum month-over-month decrease of inv in the most recent p months (needs p >= 2)
def Mad(inv, p):
    arr = np.array(data.loc[:, inv + '1':inv + str(p)])
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i, :]
        value_lst = []
        for k in range(len(df_value) - 1):
            minus = df_value[k + 1] - df_value[k]
            value_lst.append(minus)
        auto_value.append(np.nanmax(value_lst))
    return inv + '_mad' + str(p), auto_value
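Because consecutive columns differ by exactly one month, both loops reduce to a single np.diff along each row. A sketch under the same column ordering (most recent first); Mai_vec and Mad_vec are illustrative names.
# Vectorized equivalents of Mai / Mad (sketch), for p >= 2
def Mai_vec(inv, p):
    arr = data.loc[:, inv + '1':inv + str(p)].values
    # np.diff gives arr[:, k+1] - arr[:, k] (older month minus newer month), so negate for the increase
    return inv + '_mai' + str(p), np.nanmax(-np.diff(arr, axis=1), axis=1)

def Mad_vec(inv, p):
    arr = data.loc[:, inv + '1':inv + str(p)].values
    return inv + '_mad' + str(p), np.nanmax(np.diff(arr, axis=1), axis=1)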
# Standard deviation of inv over the most recent p months
def Std(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.nanstd(df, axis=1)
    return inv + '_std' + str(p), auto_value

# Coefficient of variation of inv over the most recent p months
def Cva(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.nanstd(df, axis=1) / np.nanmean(df, axis=1)
    return inv + '_cva' + str(p), auto_value

# (inv in the current month) - (mean of inv over the most recent p months)
def Cmm(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = df[inv + '1'] - np.nanmean(df, axis=1)
    return inv + '_cmm' + str(p), auto_value

# (inv in the current month) - (minimum of inv over the most recent p months)
def Cnm(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = df[inv + '1'] - np.nanmin(df, axis=1)
    return inv + '_cnm' + str(p), auto_value

# (inv in the current month) - (maximum of inv over the most recent p months)
def Cxm(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = df[inv + '1'] - np.nanmax(df, axis=1)
    return inv + '_cxm' + str(p), auto_value

# ((inv in the current month) - (minimum over the most recent p months)) / (minimum over the most recent p months)
# Note: despite the name, the code uses the minimum, so this overlaps with Cnp further below.
def Cxp(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    temp = np.nanmin(df, axis=1)
    auto_value = (df[inv + '1'] - temp) / temp
    return inv + '_cxp' + str(p), auto_value

# Range of inv over the most recent p months (max - min)
def Ran(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = np.nanmax(df, axis=1) - np.nanmin(df, axis=1)
    return inv + '_ran' + str(p), auto_value
# In the most recent min(time on book, p) months: number of month-over-month increases
def Nci(inv, p):
    arr = np.array(data.loc[:, inv + '1':inv + str(p)])
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i, :]
        value_lst = []
        for k in range(len(df_value) - 1):
            minus = df_value[k] - df_value[k + 1]
            value_lst.append(minus)
        value_ng = np.where(np.array(value_lst) > 0, 1, 0).sum()
        auto_value.append(value_ng)
    return inv + '_nci' + str(p), auto_value

# In the most recent min(time on book, p) months: number of month-over-month decreases
def Ncd(inv, p):
    arr = np.array(data.loc[:, inv + '1':inv + str(p)])
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i, :]
        value_lst = []
        for k in range(len(df_value) - 1):
            minus = df_value[k] - df_value[k + 1]
            value_lst.append(minus)
        value_ng = np.where(np.array(value_lst) < 0, 1, 0).sum()
        auto_value.append(value_ng)
    return inv + '_ncd' + str(p), auto_value

# In the most recent min(time on book, p) months: number of adjacent months with equal inv
def Ncn(inv, p):
    arr = np.array(data.loc[:, inv + '1':inv + str(p)])
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i, :]
        value_lst = []
        for k in range(len(df_value) - 1):
            minus = df_value[k] - df_value[k + 1]
            value_lst.append(minus)
        value_ng = np.where(np.array(value_lst) == 0, 1, 0).sum()
        auto_value.append(value_ng)
    return inv + '_ncn' + str(p), auto_value
# Flag = 1 if inv is strictly increasing over the most recent min(time on book, p) months,
# i.e. inv[k] > inv[k+1] for every adjacent pair (column 1 being the most recent month), else 0
def Bup(inv, p):
    arr = np.array(data.loc[:, inv + '1':inv + str(p)])
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i, :]
        index = 0
        for k in range(len(df_value) - 1):
            if df_value[k] <= df_value[k + 1]:
                # not strictly increasing at this step
                break
            index += 1
        if index == p - 1:
            value = 1
        else:
            value = 0
        auto_value.append(value)
    return inv + '_bup' + str(p), auto_value

# Flag = 1 if inv is strictly decreasing over the most recent min(time on book, p) months,
# i.e. inv[k] < inv[k+1] for every adjacent pair, else 0
def Pdn(inv, p):
    arr = np.array(data.loc[:, inv + '1':inv + str(p)])
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i, :]
        index = 0
        for k in range(len(df_value) - 1):
            if df_value[k] >= df_value[k + 1]:
                break
            index += 1
        if index == p - 1:
            value = 1
        else:
            value = 0
        auto_value.append(value)
    return inv + '_pdn' + str(p), auto_value
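With broadcasting, the two monotonicity flags collapse to one comparison per row. This is only a sketch of the same intent (strict monotonicity across the whole window); Bup_vec and Pdn_vec are illustrative names, not part of the original code.
# Vectorized monotonicity flags (sketch)
def Bup_vec(inv, p):
    arr = data.loc[:, inv + '1':inv + str(p)].values
    return inv + '_bup' + str(p), np.all(arr[:, :-1] > arr[:, 1:], axis=1).astype(int)

def Pdn_vec(inv, p):
    arr = data.loc[:, inv + '1':inv + str(p)].values
    return inv + '_pdn' + str(p), np.all(arr[:, :-1] < arr[:, 1:], axis=1).astype(int)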
# Trimmed mean of inv over the most recent min(time on book, p) months:
# drop one maximum and one minimum, then average the rest (needs p >= 3)
def Trm(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = []
    for i in range(len(df)):
        trm_mean = list(df.iloc[i, :])
        trm_mean.remove(np.nanmax(trm_mean))
        trm_mean.remove(np.nanmin(trm_mean))
        temp = np.nanmean(trm_mean)
        auto_value.append(temp)
    return inv + '_trm' + str(p), auto_value
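When the window has no missing values, the same trimmed mean can be computed without the Python loop by subtracting one maximum and one minimum from the row sum. A sketch assuming p >= 3 and no NaNs; Trm_vec is an illustrative name.
# Vectorized trimmed mean (sketch): drop one max and one min per row, average the rest
def Trm_vec(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    trimmed_sum = df.sum(axis=1) - df.max(axis=1) - df.min(axis=1)
    return inv + '_trm' + str(p), trimmed_sum / (p - 2)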
# (inv in the current month - maximum over the most recent p months) / maximum over the most recent p months
def Cmx(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = (df[inv + '1'] - np.nanmax(df, axis=1)) / np.nanmax(df, axis=1)
    return inv + '_cmx' + str(p), auto_value

# (inv in the current month - mean over the most recent p months) / mean over the most recent p months
def Cmp(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = (df[inv + '1'] - np.nanmean(df, axis=1)) / np.nanmean(df, axis=1)
    return inv + '_cmp' + str(p), auto_value

# (inv in the current month - minimum over the most recent p months) / minimum over the most recent p months
def Cnp(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)]
    auto_value = (df[inv + '1'] - np.nanmin(df, axis=1)) / np.nanmin(df, axis=1)
    return inv + '_cnp' + str(p), auto_value
# Number of months between the current month and the month, within the most recent
# min(time on book, p) months, in which inv reaches its maximum
def Msx(inv, p):
    df = data.loc[:, inv + '1':inv + str(p)].copy()
    df['_max'] = np.nanmax(df, axis=1)
    for i in range(1, p + 1):
        df[inv + str(i)] = list(df[inv + str(i)] == df['_max'])
    del df['_max']
    df_value = np.where(df == True, 1, 0)
    auto_value = []
    for i in range(len(df_value)):
        row_value = df_value[i, :]
        indexs = 1
        for j in row_value:
            if j == 1:
                break
            indexs += 1
        auto_value.append(indexs)
    return inv + '_msx' + str(p), auto_value
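This is exactly what nanargmax computes: the first occurrence of the row maximum, converted here to a 1-based month offset. A compact sketch (Msx_vec is an illustrative name; it assumes each row has at least one non-NaN value).
# Vectorized equivalent of Msx (sketch)
def Msx_vec(inv, p):
    arr = data.loc[:, inv + '1':inv + str(p)].values
    return inv + '_msx' + str(p), np.nanargmax(arr, axis=1) + 1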
# (mean of inv over the most recent p months) / (mean of inv over months p to 2p)
# Note: the second window needs columns up to inv{2p}, so with 12 monthly columns this only works for p <= 6.
def Rpp(inv, p):
    df1 = data.loc[:, inv + '1':inv + str(p)]
    value1 = np.nanmean(df1, axis=1)
    df2 = data.loc[:, inv + str(p):inv + str(2 * p)]
    value2 = np.nanmean(df2, axis=1)
    auto_value = value1 / value2
    return inv + '_rpp' + str(p), auto_value

# (mean of inv over the most recent p months) - (mean of inv over months p to 2p)
def Dpp(inv, p):
    df1 = data.loc[:, inv + '1':inv + str(p)]
    value1 = np.nanmean(df1, axis=1)
    df2 = data.loc[:, inv + str(p):inv + str(2 * p)]
    value2 = np.nanmean(df2, axis=1)
    auto_value = value1 - value2
    return inv + '_dpp' + str(p), auto_value

# (maximum of inv over the most recent p months) / (maximum of inv over months p to 2p)
def Mpp(inv, p):
    df1 = data.loc[:, inv + '1':inv + str(p)]
    value1 = np.nanmax(df1, axis=1)
    df2 = data.loc[:, inv + str(p):inv + str(2 * p)]
    value2 = np.nanmax(df2, axis=1)
    auto_value = value1 / value2
    return inv + '_mpp' + str(p), auto_value

# (minimum of inv over the most recent p months) / (minimum of inv over months p to 2p)
def Npp(inv, p):
    df1 = data.loc[:, inv + '1':inv + str(p)]
    value1 = np.nanmin(df1, axis=1)
    df2 = data.loc[:, inv + str(p):inv + str(2 * p)]
    value2 = np.nanmin(df2, axis=1)
    auto_value = value1 / value2
    return inv + '_npp' + str(p), auto_value
Next, let's define a wrapper that applies all of the functions above in one call.
# Make sure every derivation function above has been executed first
# Batch caller: derives all features for a given variable prefix inv and window length p
def auto_var2(inv, p):
    # global data_new
    for func in [Num, Nmz, Evr, Avg, Tot, Tot2T, Max, Min, Msg, Msz,
                 Cav, Cmn, Mai, Mad, Std, Cva, Cmm, Cnm, Cxm, Cxp,
                 Ran, Nci, Ncd, Ncn, Bup, Pdn, Trm, Cmx, Cmp, Cnp,
                 Msx, Rpp, Dpp, Mpp, Npp]:
        try:
            columns_name, values = func(inv, p)
            data_new[columns_name] = values
        except:
            # same message as the original per-function blocks, e.g. "Num PARSE ERROR ft 1"
            print(func.__name__, "PARSE ERROR", inv, p)
    return data_new.columns.size
Now let's try it out on a small demo dataset.
import pandas as pd
# data is the raw dataset containing the features and the label
data = pd.read_excel('/Users/zhucan/Desktop/ Financial risk control practice / Lesson 3 materials /textdata.xlsx')
data
"""ft and gt are two variable-name prefixes; the suffixes 1-12 index the value for each of the 12 months."""
'''ft1 is the number of refuelling transactions within one month before the application date.'''
'''gt1 is the refuelling amount within one month before the application date.'''
data.columns
#Index(['customer_id', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
# 'ft9', 'ft10', 'ft11', 'ft12', 'TOB', 'gt1', 'gt2', 'gt3', 'gt4', 'gt5',
# 'gt6', 'gt7', 'gt8', 'gt9', 'gt10', 'gt11', 'gt12'],
# dtype='object')
data_new = data.copy()
p = 4
inv = 'ft'
auto_data = pd.DataFrame()
for p in range(1, 13):
    for inv in ['ft', 'gt']:
        auto_var2(inv, p)
# Mai PARSE ERROR ft 1
# Mad PARSE ERROR ft 1
# Trm PARSE ERROR ft 1
# Mai PARSE ERROR gt 1
# Mad PARSE ERROR gt 1
# Trm PARSE ERROR gt 1
# Trm PARSE ERROR ft 2
# Trm PARSE ERROR gt 2
# Rpp PARSE ERROR ft 7
# Dpp PARSE ERROR ft 7
# Mpp PARSE ERROR ft 7
# Npp PARSE ERROR ft 7
# Rpp PARSE ERROR gt 7
# Dpp PARSE ERROR gt 7
# Mpp PARSE ERROR gt 7
# Npp PARSE ERROR gt 7
# Rpp PARSE ERROR ft 8
# Dpp PARSE ERROR ft 8
# Mpp PARSE ERROR ft 8
# Npp PARSE ERROR ft 8
# Rpp PARSE ERROR gt 8
# Dpp PARSE ERROR gt 8
# Mpp PARSE ERROR gt 8
# Npp PARSE ERROR gt 8
# Rpp PARSE ERROR ft 9
# Dpp PARSE ERROR ft 9
# Mpp PARSE ERROR ft 9
# Npp PARSE ERROR ft 9
# Rpp PARSE ERROR gt 9
# Dpp PARSE ERROR gt 9
# Mpp PARSE ERROR gt 9
# Npp PARSE ERROR gt 9
# Rpp PARSE ERROR ft 10
# Dpp PARSE ERROR ft 10
# Mpp PARSE ERROR ft 10
# Npp PARSE ERROR ft 10
# Rpp PARSE ERROR gt 10
# Dpp PARSE ERROR gt 10
# Mpp PARSE ERROR gt 10
# Npp PARSE ERROR gt 10
# Rpp PARSE ERROR ft 11
# Dpp PARSE ERROR ft 11
# Mpp PARSE ERROR ft 11
# Npp PARSE ERROR ft 11
# Rpp PARSE ERROR gt 11
# Dpp PARSE ERROR gt 11
# Mpp PARSE ERROR gt 11
# Npp PARSE ERROR gt 11
# Tot2T PARSE ERROR ft 12
# Rpp PARSE ERROR ft 12
# Dpp PARSE ERROR ft 12
# Mpp PARSE ERROR ft 12
# Npp PARSE ERROR ft 12
# Tot2T PARSE ERROR gt 12
# Rpp PARSE ERROR gt 12
# Dpp PARSE ERROR gt 12
# Mpp PARSE ERROR gt 12
# Npp PARSE ERROR gt 12
These errors are expected: Mai, Mad and Trm need at least two months of history (three for Trm), Tot2T needs a 13th monthly column when p = 12, and the Rpp/Dpp/Mpp/Npp window-pair features need columns up to month 2p, which only exist for p <= 6. The corresponding features are simply skipped.
data_new.shape
#(5, 808)
data_new.columns
#Index(['customer_id', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
# 'ft9',
# ...
# 'gt_nci12', 'gt_ncd12', 'gt_ncn12', 'gt_bup12', 'gt_pdn12', 'gt_trm12',
# 'gt_cmx12', 'gt_cmp12', 'gt_cnp12', 'gt_msx12'],
# dtype='object', length=808)