
Financial risk control practice -- feature derivation based on time series

2022-07-05 06:28:00 Grateful_ Dead424

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

I have written 35 functions for you; let's go through them one by one.
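All of these functions read a module-level DataFrame named data, stored in wide format: for a variable inv, column inv+'1' holds the most recent month (the month of application) and inv+str(p) holds the value p months back, exactly like the demo dataset loaded at the end of this post. If you want to try the functions on their own before the demo, a minimal toy data frame can be built like this (a sketch with made-up values, not the real dataset):

# Toy data in the same wide layout as the demo dataset (customer_id, TOB, ft1..ft12, gt1..gt12);
# column 1 is the most recent month. Values are random and purely illustrative.
rng = np.random.default_rng(0)
data = pd.DataFrame({'customer_id': range(5), 'TOB': rng.integers(1, 13, 5)})
for inv in ['ft', 'gt']:
    for m in range(1, 13):
        data[inv + str(m)] = rng.integers(0, 10, 5)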

# Number of months with inv > 0 in the last p months
def Num(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.where(df>0,1,0).sum(axis=1)
    return inv+'_num'+str(p),auto_value

# Number of months with inv == 0 in the last p months
def Nmz(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.where(df==0,1,0).sum(axis=1)
    return inv+'_nmz'+str(p),auto_value

# Whether there is at least one month with inv > 0 in the last p months (1/0 flag)
def Evr(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    arr=np.where(df>0,1,0).sum(axis=1)
    auto_value = np.where(arr,1,0)
    return inv+'_evr'+str(p),auto_value    

# Mean of inv over the last p months
def Avg(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanmean(df,axis = 1 )
    return inv+'_avg'+str(p),auto_value    


# Sum of inv over the last p months
def Tot(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nansum(df,axis = 1)
    return inv+'_tot'+str(p),auto_value  

# Sum of inv over months 2 to p+1 (the last p months excluding the current month)
def Tot2T(inv,p):
    df=data.loc[:,inv+'2':inv+str(p+1)]
    auto_value=df.sum(1)
    return inv+'_tot2t'+str(p),auto_value  


# Maximum of inv over the last p months
def Max(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanmax(df,axis = 1)
    return inv+'_max'+str(p),auto_value 


# Minimum of inv over the last p months
def Min(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanmin(df,axis = 1)
    return inv+'_min'+str(p),auto_value 

# In the last p months, months from the most recent month with inv > 0 to now (current month counted as 1; 0 if there is no such month)

def Msg(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    df_value=np.where(df>0,1,0)
    auto_value=[]
    for i in range(len(df_value)):
        row_value=df_value[i,:]
        if row_value.max()<=0:
            indexs=0
            auto_value.append(indexs)
        else:
            indexs=1
            for j in row_value:
                if j>0:
                    break
                indexs+=1
            auto_value.append(indexs)
    return inv+'_msg'+str(p),auto_value
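To make the counting convention in Msg concrete, here is a self-contained trace on one made-up row (Msz below works the same way, just flagging inv == 0 instead):

# Trace of the Msg logic for a single made-up row [ft1, ft2, ft3] = [0, 3, 0],
# where ft1 is the current month and ft3 is three months back.
row = np.array([0, 3, 0])
flags = np.where(row > 0, 1, 0)   # [0, 1, 0]
position = 1                      # month index, counting the current month as 1
for f in flags:
    if f > 0:
        break
    position += 1
print(position)                   # 2 -> the most recent month with ft > 0 is month 2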
 

# In the last p months, months from the most recent month with inv == 0 to now (current month counted as 1; 0 if there is no such month)
def Msz(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    df_value=np.where(df==0,1,0)
    auto_value=[]
    for i in range(len(df_value)):
        row_value=df_value[i,:]
        if row_value.max()<=0:
            indexs=0
            auto_value.append(indexs)
        else:
            indexs=1
            for j in row_value:
                if j>0:
                    break
                indexs+=1
            auto_value.append(indexs)
    return inv+'_msz'+str(p),auto_value   
    
# Current-month inv / mean of inv over the last p months
def Cav(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value = df[inv+'1']/np.nanmean(df,axis = 1 ) 
    return inv+'_cav'+str(p),auto_value 

# Current-month inv / minimum of inv over the last p months
def Cmn(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value = df[inv+'1']/np.nanmin(df,axis = 1 ) 
    return inv+'_cmn'+str(p),auto_value 

# Maximum month-over-month increase of inv over the last p months
def Mai(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])     
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i,:]
        value_lst = []
        for k in range(len(df_value)-1):
            minus = df_value[k] - df_value[k+1]
            value_lst.append(minus)
        auto_value.append(np.nanmax(value_lst))     
    return inv+'_mai'+str(p),auto_value 

# Maximum month-over-month decrease of inv over the last p months
def Mad(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])      
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i,:]
        value_lst = []
        for k in range(len(df_value)-1):
            minus = df_value[k+1] - df_value[k]
            value_lst.append(minus)
        auto_value.append(np.nanmax(value_lst))     
    return inv+'_mad'+str(p),auto_value 

# Standard deviation of inv over the last p months
def Std(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanstd(df,axis = 1)
    return inv+'_std'+str(p),auto_value 

    
# Coefficient of variation of inv over the last p months
def Cva(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value=np.nanstd(df,axis = 1 )/np.nanmean(df,axis = 1)
    return inv+'_cva'+str(p),auto_value 



# Current-month inv minus the mean of inv over the last p months
def Cmm(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value = df[inv+'1'] - np.nanmean(df,axis = 1 ) 
    return inv+'_cmm'+str(p),auto_value 

# Current-month inv minus the minimum of inv over the last p months
def Cnm(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value = df[inv+'1'] - np.nanmin(df,axis = 1 ) 
    return inv+'_cnm'+str(p),auto_value 


# Current-month inv minus the maximum of inv over the last p months
def Cxm(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value = df[inv+'1'] - np.nanmax(df,axis = 1 ) 
    return inv+'_cxm'+str(p),auto_value 



# (Current-month inv - minimum of inv over the last p months) / that minimum (despite the name, the code uses the minimum, which duplicates Cnp below)
def Cxp(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    temp = np.nanmin(df,axis = 1 )
    auto_value = (df[inv+'1'] - temp )/ temp
    return inv+'_cxp'+str(p),auto_value 

# Range (max - min) of inv over the last p months
def Ran(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value = np.nanmax(df,axis = 1 )  -  np.nanmin(df,axis = 1 ) 
    return inv+'_ran'+str(p),auto_value 


# In the last min(time on book, p) months, number of month-over-month increases of inv
def Nci(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])     
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i,:]
        value_lst = []
        for k in range(len(df_value)-1):
            minus = df_value[k] - df_value[k+1]
            value_lst.append(minus)           
        value_ng = np.where(np.array(value_lst)>0,1,0).sum()
        auto_value.append(value_ng)
    return inv+'_nci'+str(p),auto_value 
   

# In the last min(time on book, p) months, number of month-over-month decreases of inv
def Ncd(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])     
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i,:]
        value_lst = []
        for k in range(len(df_value)-1):
            minus = df_value[k] - df_value[k+1]
            value_lst.append(minus)           
        value_ng = np.where(np.array(value_lst)<0,1,0).sum()
        auto_value.append(value_ng)
    return inv+'_ncd'+str(p),auto_value 
           

# In the last min(time on book, p) months, number of adjacent months with equal inv
def Ncn(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])     
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i,:]
        value_lst = []
        for k in range(len(df_value)-1):
            minus = df_value[k] - df_value[k+1]
            value_lst.append(minus)           
        value_ng = np.where(np.array(value_lst)==0,1,0).sum()
        auto_value.append(value_ng)
    return inv+'_ncn'+str(p),auto_value    
 
# If, over the last min(time on book, p) months, inv[i] > inv[i+1] for every month i
# (strictly increasing towards the present, with inv > 0), then flag = 1, else flag = 0
# (the code below checks only the monotonicity, not the inv > 0 condition)
def Bup(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i,:]
        index = 0
        for k in range(len(df_value)-1):
            # stop counting as soon as the series fails to be strictly increasing
            if df_value[k] <= df_value[k+1]:
                break
            index += 1
        # flag = 1 only if all p-1 adjacent pairs were strictly increasing
        if index == len(df_value)-1:
            value = 1
        else:
            value = 0
        auto_value.append(value)
    return inv+'_bup'+str(p),auto_value

# If, over the last min(time on book, p) months, inv[i] < inv[i+1] for every month i
# (strictly decreasing towards the present, with inv > 0), then flag = 1, else flag = 0
# (the code below checks only the monotonicity, not the inv > 0 condition)
def Pdn(inv,p):
    arr=np.array(data.loc[:,inv+'1':inv+str(p)])
    auto_value = []
    for i in range(len(arr)):
        df_value = arr[i,:]
        index = 0
        for k in range(len(df_value)-1):
            # stop counting as soon as the series fails to be strictly decreasing
            if df_value[k] >= df_value[k+1]:
                break
            index += 1
        # flag = 1 only if all p-1 adjacent pairs were strictly decreasing
        if index == len(df_value)-1:
            value = 1
        else:
            value = 0
        auto_value.append(value)
    return inv+'_pdn'+str(p),auto_value

# Trimmed mean of inv over the last min(time on book, p) months (drops one max and one min, so it requires p >= 3)
def Trm(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value = []
    for i in range(len(df)):
        trm_mean = list(df.loc[i,:])
        trm_mean.remove(np.nanmax(trm_mean))
        trm_mean.remove(np.nanmin(trm_mean))
        temp=np.nanmean(trm_mean) 
        auto_value.append(temp)
    return inv+'_trm'+str(p),auto_value 

# (Current-month inv - maximum of inv over the last p months) / that maximum
def Cmx(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value = (df[inv+'1'] - np.nanmax(df,axis = 1 )) /np.nanmax(df,axis = 1 ) 
    return inv+'_cmx'+str(p),auto_value 

# (Current-month inv - mean of inv over the last p months) / that mean
def Cmp(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value = (df[inv+'1'] - np.nanmean(df,axis = 1 )) /np.nanmean(df,axis = 1 ) 
    return inv+'_cmp'+str(p),auto_value 


# (Current-month inv - minimum of inv over the last p months) / that minimum
def Cnp(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    auto_value = (df[inv+'1'] - np.nanmin(df,axis = 1 )) /np.nanmin(df,axis = 1 ) 
    return inv+'_cnp'+str(p),auto_value 


# In the last min(time on book, p) months, number of months from the month with the maximum inv to the current month (current month counted as 1)
def Msx(inv,p):
    df=data.loc[:,inv+'1':inv+str(p)]
    df['_max'] = np.nanmax(df,axis = 1)
    for i in range(1,p+1):
        df[inv+str(i)] = list(df[inv+str(i)] == df['_max'])
    del df['_max']
    df_value = np.where(df==True,1,0)
    auto_value=[]
    for i in range(len(df_value)):
        row_value=df_value[i,:]
        indexs=1
        for j in row_value:
            if j == 1:
                break
            indexs+=1
        auto_value.append(indexs)
    return inv+'_msx'+str(p),auto_value


# Mean of inv over the last p months / mean of inv over months p to 2p
def Rpp(inv,p):
    df1=data.loc[:,inv+'1':inv+str(p)]
    value1=np.nanmean(df1,axis = 1 )
    df2=data.loc[:,inv+str(p):inv+str(2*p)]
    value2=np.nanmean(df2,axis = 1 )   
    auto_value = value1/value2
    return inv+'_rpp'+str(p),auto_value    


# Mean of inv over the last p months minus the mean of inv over months p to 2p
def Dpp(inv,p):
    df1=data.loc[:,inv+'1':inv+str(p)]
    value1=np.nanmean(df1,axis = 1 )
    df2=data.loc[:,inv+str(p):inv+str(2*p)]
    value2=np.nanmean(df2,axis = 1 )   
    auto_value = value1 - value2
    return inv+'_dpp'+str(p),auto_value   

# Maximum of inv over the last p months / maximum of inv over months p to 2p
def Mpp(inv,p):
    df1=data.loc[:,inv+'1':inv+str(p)]
    value1=np.nanmax(df1,axis = 1 )
    df2=data.loc[:,inv+str(p):inv+str(2*p)]
    value2=np.nanmax(df2,axis = 1 )   
    auto_value = value1/value2
    return inv+'_mpp'+str(p),auto_value  
  
# Minimum of inv over the last p months / minimum of inv over months p to 2p
def Npp(inv,p):
    df1=data.loc[:,inv+'1':inv+str(p)]
    value1=np.nanmin(df1,axis = 1 )
    df2=data.loc[:,inv+str(p):inv+str(2*p)]
    value2=np.nanmin(df2,axis = 1 )   
    auto_value = value1/value2
    return inv+'_npp'+str(p),auto_value  
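Each of these functions returns a (column_name, values) pair, so deriving a single feature by hand looks like this (a minimal sketch, assuming the toy data above or the demo dataset loaded further down):

# Derive one feature manually: mean of ft over the last 3 months
data_new = data.copy()
col_name, values = Avg('ft', 3)
data_new[col_name] = values            # adds a column named 'ft_avg3'
print(data_new[['ft1', 'ft2', 'ft3', col_name]].head())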

Now let's define a function that wraps all 35 functions above and calls them in one batch for a given (inv, p) pair; a more compact version is sketched right after it.

# First, run all of the derivation function definitions above
       
# Define a batch caller that takes the two parameters (inv, p)
def auto_var2(inv,p):
    #global data_new
    try:
        columns_name,values=Num(inv,p)
        data_new[columns_name]=values
    except:
           print("Num PARSE ERROR",inv,p)
    try:
        columns_name,values=Nmz(inv,p)
        data_new[columns_name]=values
    except:
           print("Nmz PARSE ERROR",inv,p)
    try:
        columns_name,values=Evr(inv,p)
        data_new[columns_name]=values
    except:
           print("Evr PARSE ERROR",inv,p)
    try:
        columns_name,values=Avg(inv,p)
        data_new[columns_name]=values
    except:
           print("Avg PARSE ERROR",inv,p)
    try:
        columns_name,values=Tot(inv,p)
        data_new[columns_name]=values
    except:
        print("Tot PARSE ERROR",inv,p) 
    try:
        columns_name,values=Tot2T(inv,p)
        data_new[columns_name]=values
    except:
        print("Tot2T PARSE ERROR",inv,p)        
    try:
        columns_name,values=Max(inv,p)
        data_new[columns_name]=values
    except:
        print("Max PARSE ERROR",inv,p)
    try:
        columns_name,values=Min(inv,p)
        data_new[columns_name]=values
    except:
        print("Min PARSE ERROR",inv,p)
    try:
        columns_name,values=Msg(inv,p)
        data_new[columns_name]=values
    except:
        print("Msg PARSE ERROR",inv,p)
    try:
        columns_name,values=Msz(inv,p)
        data_new[columns_name]=values
    except:
        print("Msz PARSE ERROR",inv,p)
    try:
        columns_name,values=Cav(inv,p)
        data_new[columns_name]=values
    except:
        print("Cav PARSE ERROR",inv,p)
    try:
        columns_name,values=Cmn(inv,p)
        data_new[columns_name]=values
    except:
        print("Cmn PARSE ERROR",inv,p)           
    try:
        columns_name,values=Mai(inv,p)
        data_new[columns_name]=values
    except:
        print("Mai PARSE ERROR",inv,p)    
    try:
        columns_name,values=Mad(inv,p)
        data_new[columns_name]=values
    except:
        print("Mad PARSE ERROR",inv,p)
    try:
        columns_name,values=Std(inv,p)
        data_new[columns_name]=values
    except:
        print("Std PARSE ERROR",inv,p)   
    try:
        columns_name,values=Cva(inv,p)
        data_new[columns_name]=values
    except:
        print("Cva PARSE ERROR",inv,p)   
    try:
        columns_name,values=Cmm(inv,p)
        data_new[columns_name]=values
    except:
        print("Cmm PARSE ERROR",inv,p)  
    try:
        columns_name,values=Cnm(inv,p)
        data_new[columns_name]=values
    except:
        print("Cnm PARSE ERROR",inv,p)         
    try:
        columns_name,values=Cxm(inv,p)
        data_new[columns_name]=values
    except:
        print("Cxm PARSE ERROR",inv,p)          
    try:
        columns_name,values=Cxp(inv,p)
        data_new[columns_name]=values
    except:
        print("Cxp PARSE ERROR",inv,p)
    try:
        columns_name,values=Ran(inv,p)
        data_new[columns_name]=values
    except:
        print("Ran PARSE ERROR",inv,p)
    try:
        columns_name,values=Nci(inv,p)
        data_new[columns_name]=values
    except:
        print("Nci PARSE ERROR",inv,p)
    try:
        columns_name,values=Ncd(inv,p)
        data_new[columns_name]=values
    except:
        print("Ncd PARSE ERROR",inv,p)
    try:
        columns_name,values=Ncn(inv,p)
        data_new[columns_name]=values
    except:
        print("Ncn PARSE ERROR",inv,p)
    try:
        columns_name,values=Bup(inv,p)
        data_new[columns_name]=values
    except:
        print("Bup PARSE ERROR",inv,p)
    try:
        columns_name,values=Pdn(inv,p)
        data_new[columns_name]=values
    except:
        print("Pdn PARSE ERROR",inv,p) 
    try:
        columns_name,values=Trm(inv,p)
        data_new[columns_name]=values
    except:
        print("Trm PARSE ERROR",inv,p)  
    try:
        columns_name,values=Cmx(inv,p)
        data_new[columns_name]=values
    except:
        print("Cmx PARSE ERROR",inv,p)         
    try:
        columns_name,values=Cmp(inv,p)
        data_new[columns_name]=values
    except:
        print("Cmp PARSE ERROR",inv,p)   
    try:
        columns_name,values=Cnp(inv,p)
        data_new[columns_name]=values
    except:
        print("Cnp PARSE ERROR",inv,p) 
    try:
        columns_name,values=Msx(inv,p)
        data_new[columns_name]=values
    except:
        print("Msx PARSE ERROR",inv,p)
    try:
        columns_name,values=Rpp(inv,p)
        data_new[columns_name]=values
    except:
        print("Rpp PARSE ERROR",inv,p)
    try:
        columns_name,values=Dpp(inv,p)
        data_new[columns_name]=values
    except:
        print("Dpp PARSE ERROR",inv,p)
    try:
        columns_name,values=Mpp(inv,p)
        data_new[columns_name]=values
    except:
        print("Mpp PARSE ERROR",inv,p)
    try:
        columns_name,values=Npp(inv,p)
        data_new[columns_name]=values
    except:
        print("Npp PARSE ERROR",inv,p)
    return data_new.columns.size
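As a side note, the 35 nearly identical try/except blocks can be collapsed by looping over the function objects themselves. The sketch below is not the author's original code, just an equivalent, more compact way to write auto_var2 (it assumes the same module-level data_new and the functions defined earlier):

# A compact alternative to auto_var2: iterate over the derivation functions directly.
def auto_var2_compact(inv, p):
    funcs = [Num, Nmz, Evr, Avg, Tot, Tot2T, Max, Min, Msg, Msz, Cav, Cmn,
             Mai, Mad, Std, Cva, Cmm, Cnm, Cxm, Cxp, Ran, Nci, Ncd, Ncn,
             Bup, Pdn, Trm, Cmx, Cmp, Cnp, Msx, Rpp, Dpp, Mpp, Npp]
    for func in funcs:
        try:
            columns_name, values = func(inv, p)
            data_new[columns_name] = values
        except Exception:
            print(func.__name__, "PARSE ERROR", inv, p)
    return data_new.columns.size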

Now let's experiment with a small demo.

import pandas as pd
# data is the raw dataset containing features and labels
data = pd.read_excel('/Users/zhucan/Desktop/ Financial risk control practice / Lesson 3 materials /textdata.xlsx')
data
""" ft  and  gt  Represents two variable names  1-12  Indicates correspondence 12 The corresponding value for each month of the month  """
'''ft1  refer to   The number of refuelling times calculated from the data within one month from the day of application '''
'''gt1  refer to   The refueling amount calculated from the data within one month from the day of application '''

data.columns
#Index(['customer_id', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
#       'ft9', 'ft10', 'ft11', 'ft12', 'TOB', 'gt1', 'gt2', 'gt3', 'gt4', 'gt5',
#       'gt6', 'gt7', 'gt8', 'gt9', 'gt10', 'gt11', 'gt12'],
#      dtype='object')
data_new = data.copy()
p = 4
inv = 'ft'
auto_data = pd.DataFrame()
for p in range(1,13):
    for inv in ['ft','gt']:
        auto_var2(inv,p)
# Mai PARSE ERROR ft 1
# Mad PARSE ERROR ft 1
# Trm PARSE ERROR ft 1
# Mai PARSE ERROR gt 1
# Mad PARSE ERROR gt 1
# Trm PARSE ERROR gt 1
# Trm PARSE ERROR ft 2
# Trm PARSE ERROR gt 2
# Rpp PARSE ERROR ft 7
# Dpp PARSE ERROR ft 7
# Mpp PARSE ERROR ft 7
# Npp PARSE ERROR ft 7
# Rpp PARSE ERROR gt 7
# Dpp PARSE ERROR gt 7
# Mpp PARSE ERROR gt 7
# Npp PARSE ERROR gt 7
# Rpp PARSE ERROR ft 8
# Dpp PARSE ERROR ft 8
# Mpp PARSE ERROR ft 8
# Npp PARSE ERROR ft 8
# Rpp PARSE ERROR gt 8
# Dpp PARSE ERROR gt 8
# Mpp PARSE ERROR gt 8
# Npp PARSE ERROR gt 8
# Rpp PARSE ERROR ft 9
# Dpp PARSE ERROR ft 9
# Mpp PARSE ERROR ft 9
# Npp PARSE ERROR ft 9
# Rpp PARSE ERROR gt 9
# Dpp PARSE ERROR gt 9
# Mpp PARSE ERROR gt 9
# Npp PARSE ERROR gt 9
# Rpp PARSE ERROR ft 10
# Dpp PARSE ERROR ft 10
# Mpp PARSE ERROR ft 10
# Npp PARSE ERROR ft 10
# Rpp PARSE ERROR gt 10
# Dpp PARSE ERROR gt 10
# Mpp PARSE ERROR gt 10
# Npp PARSE ERROR gt 10
# Rpp PARSE ERROR ft 11
# Dpp PARSE ERROR ft 11
# Mpp PARSE ERROR ft 11
# Npp PARSE ERROR ft 11
# Rpp PARSE ERROR gt 11
# Dpp PARSE ERROR gt 11
# Mpp PARSE ERROR gt 11
# Npp PARSE ERROR gt 11
# Tot2T PARSE ERROR ft 12
# Rpp PARSE ERROR ft 12
# Dpp PARSE ERROR ft 12
# Mpp PARSE ERROR ft 12
# Npp PARSE ERROR ft 12
# Tot2T PARSE ERROR gt 12
# Rpp PARSE ERROR gt 12
# Dpp PARSE ERROR gt 12
# Mpp PARSE ERROR gt 12
# Npp PARSE ERROR gt 12
data_new.shape
#(5, 808)
data_new.columns
#Index(['customer_id', 'ft1', 'ft2', 'ft3', 'ft4', 'ft5', 'ft6', 'ft7', 'ft8',
#       'ft9',
#       ...
#       'gt_nci12', 'gt_ncd12', 'gt_ncn12', 'gt_bup12', 'gt_pdn12', 'gt_trm12',
#       'gt_cmx12', 'gt_cmp12', 'gt_cnp12', 'gt_msx12'],
#      dtype='object', length=808)

Copyright notice
This article was written by [Grateful_ Dead424]. Please include a link to the original when reposting. Thank you.
https://yzsam.com/2022/02/202202140605493868.html