Financial risk control practice -- feature derivation based on time series
2022-07-05 06:28:00 【Grateful_Dead424】
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
Below are 35 feature-derivation functions. Let's walk through them one by one.
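All of these functions assume a pandas DataFrame named data whose monthly columns are named <inv>1 through <inv>12, where <inv>1 is the most recent month (the month of application) and larger suffixes are older months; the demo dataset loaded later has ft1..ft12 and gt1..gt12. If you want to try the snippets without that Excel file, a minimal toy frame like the one below will do — the columns and values here are made up purely for illustration.

# Hypothetical toy data for experimenting with the functions below; the real dataset is loaded later from Excel.
rng = np.random.default_rng(0)
data = pd.DataFrame({'customer_id': [1, 2, 3]})
for name in ['ft', 'gt']:
    for i in range(1, 13):
        data[name + str(i)] = rng.integers(0, 10, size=3)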
# Number of months with inv > 0 in the last p months
def Num(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.where(df>0,1,0).sum(axis=1)
    return inv+'_num'+str(p),auto_value

# Number of months with inv == 0 in the last p months
def Nmz(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.where(df==0,1,0).sum(axis=1)
    return inv+'_nmz'+str(p),auto_value

# Whether there is at least one month with inv > 0 in the last p months
def Evr(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    arr=np.where(df>0,1,0).sum(axis=1)
    auto_value=np.where(arr,1,0)
    return inv+'_evr'+str(p),auto_value

# Mean of inv over the last p months
def Avg(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanmean(df,axis=1)
    return inv+'_avg'+str(p),auto_value

# Sum of inv over the last p months
def Tot(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nansum(df,axis=1)
    return inv+'_tot'+str(p),auto_value

# Sum of inv over months 2 to p+1
def Tot2T(inv,p):
    df=data.loc[:,inv+'2':inv+str(p+1)]
    auto_value=df.sum(axis=1)
    return inv+'_tot2t'+str(p),auto_value

# Maximum of inv over the last p months
def Max(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanmax(df,axis=1)
    return inv+'_max'+str(p),auto_value

# Minimum of inv over the last p months
def Min(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanmin(df,axis=1)
    return inv+'_min'+str(p),auto_value
# Number of months since the most recent month with inv > 0, within the last p months
def Msg(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    df_value=np.where(df>0,1,0)
    auto_value=[]
    for i in range(len(df_value)):
        row_value=df_value[i,:]
        if row_value.max()<=0:
            indexs=0            # inv was never > 0 in the window
            auto_value.append(indexs)
        else:
            indexs=1
            for j in row_value:
                if j>0:
                    break
                indexs+=1
            auto_value.append(indexs)
    return inv+'_msg'+str(p),auto_value

# Number of months since the most recent month with inv == 0, within the last p months
def Msz(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    df_value=np.where(df==0,1,0)
    auto_value=[]
    for i in range(len(df_value)):
        row_value=df_value[i,:]
        if row_value.max()<=0:
            indexs=0            # inv was never 0 in the window
            auto_value.append(indexs)
        else:
            indexs=1
            for j in row_value:
                if j>0:
                    break
                indexs+=1
            auto_value.append(indexs)
    return inv+'_msz'+str(p),auto_value
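The row-by-row loops in Msg and Msz are easy to read but slow on large datasets. A vectorized sketch of Msg (my own variant, not the original code) could look like this; Msz is the same with df == 0.

# Vectorized sketch of Msg (an alternative, not the original implementation).
def Msg_vec(inv, p):
    df = data.loc[:, inv+'1':inv+str(p)]
    mask = (df > 0).to_numpy()
    first_hit = mask.argmax(axis=1) + 1                      # 1-based position of the most recent month with inv > 0
    auto_value = np.where(mask.any(axis=1), first_hit, 0)    # 0 when inv was never > 0, matching the loop version
    return inv+'_msg'+str(p), auto_value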
# Current month's inv divided by the mean of inv over the last p months
def Cav(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=df[inv+'1']/np.nanmean(df,axis=1)
    return inv+'_cav'+str(p),auto_value

# Current month's inv divided by the minimum of inv over the last p months
def Cmn(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=df[inv+'1']/np.nanmin(df,axis=1)
    return inv+'_cmn'+str(p),auto_value

# Maximum month-over-month increase of inv in the last p months
def Mai(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])
    auto_value=[]
    for i in range(len(arr)):
        df_value=arr[i,:]
        value_lst=[]
        for k in range(len(df_value)-1):
            minus=df_value[k]-df_value[k+1]
            value_lst.append(minus)
        auto_value.append(np.nanmax(value_lst))
    return inv+'_mai'+str(p),auto_value

# Maximum month-over-month decrease of inv in the last p months
def Mad(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])
    auto_value=[]
    for i in range(len(arr)):
        df_value=arr[i,:]
        value_lst=[]
        for k in range(len(df_value)-1):
            minus=df_value[k+1]-df_value[k]
            value_lst.append(minus)
        auto_value.append(np.nanmax(value_lst))
    return inv+'_mad'+str(p),auto_value
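Because column 1 is the most recent month, the month-over-month change is just the difference between adjacent columns, so Mai and Mad can also be written without the inner loops. This is an alternative sketch, not the original code, and it needs p >= 2 (the original also fails for p = 1, as the demo below shows).

# Vectorized sketches of Mai and Mad (alternatives, not the original implementation).
def Mai_vec(inv, p):
    arr = data.loc[:, inv+'1':inv+str(p)].to_numpy()
    growth = arr[:, :-1] - arr[:, 1:]        # increase from month k+1 to month k
    return inv+'_mai'+str(p), np.nanmax(growth, axis=1)

def Mad_vec(inv, p):
    arr = data.loc[:, inv+'1':inv+str(p)].to_numpy()
    drop = arr[:, 1:] - arr[:, :-1]          # decrease from month k+1 to month k
    return inv+'_mad'+str(p), np.nanmax(drop, axis=1)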
# Standard deviation of inv over the last p months
def Std(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanstd(df,axis=1)
    return inv+'_std'+str(p),auto_value

# Coefficient of variation of inv over the last p months
def Cva(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanstd(df,axis=1)/np.nanmean(df,axis=1)
    return inv+'_cva'+str(p),auto_value

# (Current month's inv) - (mean of inv over the last p months)
def Cmm(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=df[inv+'1']-np.nanmean(df,axis=1)
    return inv+'_cmm'+str(p),auto_value

# (Current month's inv) - (minimum of inv over the last p months)
def Cnm(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=df[inv+'1']-np.nanmin(df,axis=1)
    return inv+'_cnm'+str(p),auto_value

# (Current month's inv) - (maximum of inv over the last p months)
def Cxm(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=df[inv+'1']-np.nanmax(df,axis=1)
    return inv+'_cxm'+str(p),auto_value

# ((Current month's inv) - (minimum of inv over the last p months)) / that minimum
# (the original comment said "maximum", but the code uses the minimum; the maximum version is Cmx below)
def Cxp(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    temp=np.nanmin(df,axis=1)
    auto_value=(df[inv+'1']-temp)/temp
    return inv+'_cxp'+str(p),auto_value

# Range (max - min) of inv over the last p months
def Ran(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanmax(df,axis=1)-np.nanmin(df,axis=1)
    return inv+'_ran'+str(p),auto_value
# Number of month-over-month increases of inv in the last min(time on book, p) months
def Nci(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])
    auto_value=[]
    for i in range(len(arr)):
        df_value=arr[i,:]
        value_lst=[]
        for k in range(len(df_value)-1):
            minus=df_value[k]-df_value[k+1]
            value_lst.append(minus)
        value_ng=np.where(np.array(value_lst)>0,1,0).sum()
        auto_value.append(value_ng)
    return inv+'_nci'+str(p),auto_value

# Number of month-over-month decreases of inv in the last min(time on book, p) months
def Ncd(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])
    auto_value=[]
    for i in range(len(arr)):
        df_value=arr[i,:]
        value_lst=[]
        for k in range(len(df_value)-1):
            minus=df_value[k]-df_value[k+1]
            value_lst.append(minus)
        value_ng=np.where(np.array(value_lst)<0,1,0).sum()
        auto_value.append(value_ng)
    return inv+'_ncd'+str(p),auto_value

# Number of adjacent months with equal inv in the last min(time on book, p) months
def Ncn(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])
    auto_value=[]
    for i in range(len(arr)):
        df_value=arr[i,:]
        value_lst=[]
        for k in range(len(df_value)-1):
            minus=df_value[k]-df_value[k+1]
            value_lst.append(minus)
        value_ng=np.where(np.array(value_lst)==0,1,0).sum()
        auto_value.append(value_ng)
    return inv+'_ncn'+str(p),auto_value
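Nci, Ncd and Ncn differ only in the comparison applied to the month-over-month differences, so the same adjacent-column trick works here too. This is again an alternative sketch, not the original code, and it needs p >= 2.

# Vectorized sketch of Nci (an alternative, not the original implementation).
def Nci_vec(inv, p):
    arr = data.loc[:, inv+'1':inv+str(p)].to_numpy()
    diff = arr[:, :-1] - arr[:, 1:]                     # change from the previous month to the next
    return inv+'_nci'+str(p), (diff > 0).sum(axis=1)    # use < 0 for Ncd and == 0 for Ncn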
# flag = 1 if, over the last min(time on book, p) months, inv[i] > inv[i+1] for every month i,
# i.e. inv is strictly increasing toward the current month, else flag = 0
# (the original definition also requires inv > 0, which the code does not check)
def Bup(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])
    auto_value=[]
    for i in range(len(arr)):
        df_value=arr[i,:]
        index=0
        for k in range(len(df_value)-1):
            if df_value[k]<=df_value[k+1]:   # not strictly increasing -> stop counting
                break
            index+=1
        if index==p-1:                       # all p-1 adjacent pairs were strictly increasing
            value=1
        else:
            value=0
        auto_value.append(value)
    return inv+'_bup'+str(p),auto_value

# flag = 1 if, over the last min(time on book, p) months, inv[i] < inv[i+1] for every month i,
# i.e. inv is strictly decreasing toward the current month, else flag = 0
# (the original definition also requires inv > 0, which the code does not check)
def Pdn(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])
    auto_value=[]
    for i in range(len(arr)):
        df_value=arr[i,:]
        index=0
        for k in range(len(df_value)-1):
            if df_value[k]>=df_value[k+1]:   # not strictly decreasing -> stop counting
                break
            index+=1
        if index==p-1:                       # all p-1 adjacent pairs were strictly decreasing
            value=1
        else:
            value=0
        auto_value.append(value)
    return inv+'_pdn'+str(p),auto_value
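A compact way to express the same monotonicity checks is to compare adjacent columns directly and require that every comparison holds. This is my own sketch under the same column layout as above, not the original code; it needs p >= 2.

# Vectorized sketches of Bup and Pdn (alternatives, not the original implementation).
def Bup_vec(inv, p):
    arr = data.loc[:, inv+'1':inv+str(p)].to_numpy()
    strictly_increasing = (arr[:, :-1] > arr[:, 1:]).all(axis=1)   # every month strictly above the previous one
    return inv+'_bup'+str(p), strictly_increasing.astype(int)

def Pdn_vec(inv, p):
    arr = data.loc[:, inv+'1':inv+str(p)].to_numpy()
    strictly_decreasing = (arr[:, :-1] < arr[:, 1:]).all(axis=1)   # every month strictly below the previous one
    return inv+'_pdn'+str(p), strictly_decreasing.astype(int)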
# Trimmed mean of inv (dropping one maximum and one minimum) over the last min(time on book, p) months
def Trm(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=[]
    for i in range(len(df)):
        trm_mean=list(df.loc[i,:])
        trm_mean.remove(np.nanmax(trm_mean))
        trm_mean.remove(np.nanmin(trm_mean))
        temp=np.nanmean(trm_mean)
        auto_value.append(temp)
    return inv+'_trm'+str(p),auto_value
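Trm needs at least three months of data, since it drops one maximum and one minimum, which is why it raises PARSE ERROR for p = 1 and p = 2 in the demo below. An equivalent vectorized sketch, assuming p >= 3 and no missing values, is:

# Vectorized sketch of Trm (an alternative, not the original implementation; assumes p >= 3 and no NaNs).
def Trm_vec(inv, p):
    arr = data.loc[:, inv+'1':inv+str(p)].to_numpy()
    trimmed_sum = arr.sum(axis=1) - arr.max(axis=1) - arr.min(axis=1)
    return inv+'_trm'+str(p), trimmed_sum / (p - 2)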
# (Current month's inv - maximum of inv over the last p months) / that maximum
def Cmx(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=(df[inv+'1']-np.nanmax(df,axis=1))/np.nanmax(df,axis=1)
    return inv+'_cmx'+str(p),auto_value

# (Current month's inv - mean of inv over the last p months) / that mean
def Cmp(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=(df[inv+'1']-np.nanmean(df,axis=1))/np.nanmean(df,axis=1)
    return inv+'_cmp'+str(p),auto_value

# (Current month's inv - minimum of inv over the last p months) / that minimum
def Cnp(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=(df[inv+'1']-np.nanmin(df,axis=1))/np.nanmin(df,axis=1)
    return inv+'_cnp'+str(p),auto_value

# Number of months between the current month and the month with the maximum inv,
# within the last min(time on book, p) months
def Msx(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)].copy()   # copy so the original data is not modified
    df['_max']=np.nanmax(df,axis=1)
    for i in range(1,p+1):
        df[inv+str(i)]=list(df[inv+str(i)]==df['_max'])
    del df['_max']
    df_value=np.where(df==True,1,0)
    auto_value=[]
    for i in range(len(df_value)):
        row_value=df_value[i,:]
        indexs=1
        for j in row_value:
            if j==1:
                break
            indexs+=1
        auto_value.append(indexs)
    return inv+'_msx'+str(p),auto_value
# (Mean of inv over the last p months) / (mean of inv over months p to 2p)
def Rpp(inv,p):
    df1=data.loc[:,inv+'1':inv+str(p)]
    value1=np.nanmean(df1,axis=1)
    df2=data.loc[:,inv+str(p):inv+str(2*p)]
    value2=np.nanmean(df2,axis=1)
    auto_value=value1/value2
    return inv+'_rpp'+str(p),auto_value

# (Mean of inv over the last p months) - (mean of inv over months p to 2p)
def Dpp(inv,p):
    df1=data.loc[:,inv+'1':inv+str(p)]
    value1=np.nanmean(df1,axis=1)
    df2=data.loc[:,inv+str(p):inv+str(2*p)]
    value2=np.nanmean(df2,axis=1)
    auto_value=value1-value2
    return inv+'_dpp'+str(p),auto_value

# (Maximum of inv over the last p months) / (maximum of inv over months p to 2p)
def Mpp(inv,p):
    df1=data.loc[:,inv+'1':inv+str(p)]
    value1=np.nanmax(df1,axis=1)
    df2=data.loc[:,inv+str(p):inv+str(2*p)]
    value2=np.nanmax(df2,axis=1)
    auto_value=value1/value2
    return inv+'_mpp'+str(p),auto_value

# (Minimum of inv over the last p months) / (minimum of inv over months p to 2p)
def Npp(inv,p):
    df1=data.loc[:,inv+'1':inv+str(p)]
    value1=np.nanmin(df1,axis=1)
    df2=data.loc[:,inv+str(p):inv+str(2*p)]
    value2=np.nanmin(df2,axis=1)
    auto_value=value1/value2
    return inv+'_npp'+str(p),auto_value
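Note that the second window in Rpp/Dpp/Mpp/Npp is sliced as inv+str(p):inv+str(2*p), so month p is counted in both windows. If you prefer strictly non-overlapping windows (months 1..p versus months p+1..2p), a variant like the following could be used; this is my own adjustment, not the original definition.

# Rpp with non-overlapping windows (a sketch; the original keeps month p in both windows).
def Rpp_nonoverlap(inv, p):
    df1 = data.loc[:, inv+'1':inv+str(p)]          # months 1..p
    df2 = data.loc[:, inv+str(p+1):inv+str(2*p)]   # months p+1..2p
    return inv+'_rpp'+str(p), np.nanmean(df1, axis=1) / np.nanmean(df2, axis=1)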
Now let's define a single function that wraps all 35 of the functions above.
# Make sure all of the function definitions above have been executed first
# Define a wrapper that calls the two-parameter feature functions in batch
def auto_var2(inv,p):
    #global data_new
    try:
        columns_name,values=Num(inv,p)
        data_new[columns_name]=values
    except:
        print("Num PARSE ERROR",inv,p)
    try:
        columns_name,values=Nmz(inv,p)
        data_new[columns_name]=values
    except:
        print("Nmz PARSE ERROR",inv,p)
    try:
        columns_name,values=Evr(inv,p)
        data_new[columns_name]=values
    except:
        print("Evr PARSE ERROR",inv,p)
    try:
        columns_name,values=Avg(inv,p)
        data_new[columns_name]=values
    except:
        print("Avg PARSE ERROR",inv,p)
    try:
        columns_name,values=Tot(inv,p)
        data_new[columns_name]=values
    except:
        print("Tot PARSE ERROR",inv,p)
    try:
        columns_name,values=Tot2T(inv,p)
        data_new[columns_name]=values
    except:
        print("Tot2T PARSE ERROR",inv,p)
    try:
        columns_name,values=Max(inv,p)
        data_new[columns_name]=values
    except:
        print("Max PARSE ERROR",inv,p)
    try:
        columns_name,values=Min(inv,p)
        data_new[columns_name]=values
    except:
        print("Min PARSE ERROR",inv,p)
    try:
        columns_name,values=Msg(inv,p)
        data_new[columns_name]=values
    except:
        print("Msg PARSE ERROR",inv,p)
    try:
        columns_name,values=Msz(inv,p)
        data_new[columns_name]=values
    except:
        print("Msz PARSE ERROR",inv,p)
    try:
        columns_name,values=Cav(inv,p)
        data_new[columns_name]=values
    except:
        print("Cav PARSE ERROR",inv,p)
    try:
        columns_name,values=Cmn(inv,p)
        data_new[columns_name]=values
    except:
        print("Cmn PARSE ERROR",inv,p)
    try:
        columns_name,values=Mai(inv,p)
        data_new[columns_name]=values
    except:
        print("Mai PARSE ERROR",inv,p)
    try:
        columns_name,values=Mad(inv,p)
        data_new[columns_name]=values
    except:
        print("Mad PARSE ERROR",inv,p)
    try:
        columns_name,values=Std(inv,p)
        data_new[columns_name]=values
    except:
        print("Std PARSE ERROR",inv,p)
    try:
        columns_name,values=Cva(inv,p)
        data_new[columns_name]=values
    except:
        print("Cva PARSE ERROR",inv,p)
    try:
        columns_name,values=Cmm(inv,p)
        data_new[columns_name]=values
    except:
        print("Cmm PARSE ERROR",inv,p)
    try:
        columns_name,values=Cnm(inv,p)
        data_new[columns_name]=values
    except:
        print("Cnm PARSE ERROR",inv,p)
    try:
        columns_name,values=Cxm(inv,p)
        data_new[columns_name]=values
    except:
        print("Cxm PARSE ERROR",inv,p)
    try:
        columns_name,values=Cxp(inv,p)
        data_new[columns_name]=values
    except:
        print("Cxp PARSE ERROR",inv,p)
    try:
        columns_name,values=Ran(inv,p)
        data_new[columns_name]=values
    except:
        print("Ran PARSE ERROR",inv,p)
    try:
        columns_name,values=Nci(inv,p)
        data_new[columns_name]=values
    except:
        print("Nci PARSE ERROR",inv,p)
    try:
        columns_name,values=Ncd(inv,p)
        data_new[columns_name]=values
    except:
        print("Ncd PARSE ERROR",inv,p)
    try:
        columns_name,values=Ncn(inv,p)
        data_new[columns_name]=values
    except:
        print("Ncn PARSE ERROR",inv,p)
    try:
        columns_name,values=Bup(inv,p)
        data_new[columns_name]=values
    except:
        print("Bup PARSE ERROR",inv,p)
    try:
        columns_name,values=Pdn(inv,p)
        data_new[columns_name]=values
    except:
        print("Pdn PARSE ERROR",inv,p)
    try:
        columns_name,values=Trm(inv,p)
        data_new[columns_name]=values
    except:
        print("Trm PARSE ERROR",inv,p)
    try:
        columns_name,values=Cmx(inv,p)
        data_new[columns_name]=values
    except:
        print("Cmx PARSE ERROR",inv,p)
    try:
        columns_name,values=Cmp(inv,p)
        data_new[columns_name]=values
    except:
        print("Cmp PARSE ERROR",inv,p)
    try:
        columns_name,values=Cnp(inv,p)
        data_new[columns_name]=values
    except:
        print("Cnp PARSE ERROR",inv,p)
    try:
        columns_name,values=Msx(inv,p)
        data_new[columns_name]=values
    except:
        print("Msx PARSE ERROR",inv,p)
    try:
        columns_name,values=Rpp(inv,p)
        data_new[columns_name]=values
    except:
        print("Rpp PARSE ERROR",inv,p)
    try:
        columns_name,values=Dpp(inv,p)
        data_new[columns_name]=values
    except:
        print("Dpp PARSE ERROR",inv,p)
    try:
        columns_name,values=Mpp(inv,p)
        data_new[columns_name]=values
    except:
        print("Mpp PARSE ERROR",inv,p)
    try:
        columns_name,values=Npp(inv,p)
        data_new[columns_name]=values
    except:
        print("Npp PARSE ERROR",inv,p)
    return data_new.columns.size
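The 35 try/except blocks above are identical except for the function being called, so the wrapper can also be written as a loop over a list of functions. This is just a more compact equivalent sketch; the verbose version above is what the rest of the post uses.

# A compact equivalent of auto_var2 (a sketch; it assumes the 35 functions above are already defined).
def auto_var2_compact(inv, p):
    funcs = [Num, Nmz, Evr, Avg, Tot, Tot2T, Max, Min, Msg, Msz, Cav, Cmn, Mai, Mad,
             Std, Cva, Cmm, Cnm, Cxm, Cxp, Ran, Nci, Ncd, Ncn, Bup, Pdn, Trm, Cmx,
             Cmp, Cnp, Msx, Rpp, Dpp, Mpp, Npp]
    for func in funcs:
        try:
            columns_name, values = func(inv, p)
            data_new[columns_name] = values
        except Exception:
            print(func.__name__, "PARSE ERROR", inv, p)
    return data_new.columns.size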
Now let's try it out with a small demo.
import pandas as pd
#data is the original dataset containing the features and the label
data = pd.read_excel('/Users/zhucan/Desktop/ Financial risk control practice / Lesson 3 materials /textdata.xlsx')
data
""" ft and gt Represents two variable names 1-12 Indicates correspondence 12 The corresponding value for each month of the month """
'''ft1 refer to The number of refuelling times calculated from the data within one month from the day of application '''
'''gt1 refer to The refueling amount calculated from the data within one month from the day of application '''
data.columns
#Index(['customer_id', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
# 'ft9', 'ft10', 'ft11', 'ft12', 'TOB', 'gt1', 'gt2', 'gt3', 'gt4', 'gt5',
# 'gt6', 'gt7', 'gt8', 'gt9', 'gt10', 'gt11', 'gt12'],
# dtype='object')
data_new = data.copy()

# These defaults are only for stepping through a single call by hand; the loop below overrides p and inv,
# and auto_data is not used further in this demo
p = 4
inv = 'ft'
auto_data = pd.DataFrame()

for p in range(1,13):
    for inv in ['ft','gt']:
        auto_var2(inv,p)

The PARSE ERROR messages below are expected: Mai, Mad and Trm need at least two or three months of data, so they fail for small p, while Tot2T, Rpp, Dpp, Mpp and Npp fail once p+1 or 2*p points past the 12 available monthly columns.
# Mai PARSE ERROR ft 1
# Mad PARSE ERROR ft 1
# Trm PARSE ERROR ft 1
# Mai PARSE ERROR gt 1
# Mad PARSE ERROR gt 1
# Trm PARSE ERROR gt 1
# Trm PARSE ERROR ft 2
# Trm PARSE ERROR gt 2
# Rpp PARSE ERROR ft 7
# Dpp PARSE ERROR ft 7
# Mpp PARSE ERROR ft 7
# Npp PARSE ERROR ft 7
# Rpp PARSE ERROR gt 7
# Dpp PARSE ERROR gt 7
# Mpp PARSE ERROR gt 7
# Npp PARSE ERROR gt 7
# Rpp PARSE ERROR ft 8
# Dpp PARSE ERROR ft 8
# Mpp PARSE ERROR ft 8
# Npp PARSE ERROR ft 8
# Rpp PARSE ERROR gt 8
# Dpp PARSE ERROR gt 8
# Mpp PARSE ERROR gt 8
# Npp PARSE ERROR gt 8
# Rpp PARSE ERROR ft 9
# Dpp PARSE ERROR ft 9
# Mpp PARSE ERROR ft 9
# Npp PARSE ERROR ft 9
# Rpp PARSE ERROR gt 9
# Dpp PARSE ERROR gt 9
# Mpp PARSE ERROR gt 9
# Npp PARSE ERROR gt 9
# Rpp PARSE ERROR ft 10
# Dpp PARSE ERROR ft 10
# Mpp PARSE ERROR ft 10
# Npp PARSE ERROR ft 10
# Rpp PARSE ERROR gt 10
# Dpp PARSE ERROR gt 10
# Mpp PARSE ERROR gt 10
# Npp PARSE ERROR gt 10
# Rpp PARSE ERROR ft 11
# Dpp PARSE ERROR ft 11
# Mpp PARSE ERROR ft 11
# Npp PARSE ERROR ft 11
# Rpp PARSE ERROR gt 11
# Dpp PARSE ERROR gt 11
# Mpp PARSE ERROR gt 11
# Npp PARSE ERROR gt 11
# Tot2T PARSE ERROR ft 12
# Rpp PARSE ERROR ft 12
# Dpp PARSE ERROR ft 12
# Mpp PARSE ERROR ft 12
# Npp PARSE ERROR ft 12
# Tot2T PARSE ERROR gt 12
# Rpp PARSE ERROR gt 12
# Dpp PARSE ERROR gt 12
# Mpp PARSE ERROR gt 12
# Npp PARSE ERROR gt 12
data_new.shape
#(5, 808)
data_new.columns
#Index(['customer_id', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
# 'ft9',
# ...
# 'gt_nci12', 'gt_ncd12', 'gt_ncn12', 'gt_bup12', 'gt_pdn12', 'gt_trm12',
# 'gt_cmx12', 'gt_cmp12', 'gt_cnp12', 'gt_msx12'],
# dtype='object', length=808)
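To spot-check the derived features for one base variable and one statistic, pandas' filter method is handy; this is a usage example, not part of the original post.

# Inspect, for example, all rolling means derived from ft:
data_new.filter(like='ft_avg').head()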