2021 National Undergraduate Data Statistics and Analysis Competition
2022-07-28 17:42:00 【Bubble Java】
This article covers the following:
1. Data processing, filtering, and computation with pandas
2. Computation and processing of complex tabular data
3. Text analysis and unsupervised learning
4. Writing calculation results out to spreadsheets
5. Comparison of data characteristics
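All of the programs below follow the same pandas pattern: read an Excel attachment, compute or filter, and write the result back out. As a warm-up, here is a minimal sketch of that pattern; the file and column names are placeholders, not the competition data.

import pandas as pd

# Placeholder file and column names, for illustration only
df = pd.read_excel('scores.xlsx')              # read tabular data
keep = df[df['score'] >= df['score'].mean()]   # filter rows on a computed condition
keep.to_excel('screened.xlsx', index=None)     # write the result back to a spreadsheet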
Competition questions

(The competition problem statements appeared here as images in the original post.)

Ideas

(The solution outline appeared here as images in the original post.)
Code
Export the text data from the Excel file to a txt file
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 13:36:06 2021
@author: MYM
"""
import pandas as pd

# Read the xlsx file
data = pd.read_excel(' The attachment 1.xlsx')
text = open("C:/Users/MYM/My_python_codes/DSA/text_words.txt", 'w', encoding='GB2312', errors='ignore')
text_R = data[['R1', 'R2', 'R3']]
count = 0
# Write the three reviewer-comment columns one after another, one comment per line,
# skipping empty cells and collapsing embedded newlines into commas
for col in ['R1', 'R2', 'R3']:
    for s in text_R[col]:
        if pd.isnull(s) or type(s) == int:
            print('nan')
            count = count + 1
        else:
            s = s.replace('\n', ',')
            text.write(s)
            text.write('\n')
text.close()
Program for problem 1
# -*- coding: utf-8 -*-
"""
Created on Sat May 22 14:48:36 2021
@author: MYM
"""
import pandas as pd
import numpy as np

def get_ave(df1, T_num):
    # Row-wise average of the T_num reviewer columns
    data = df1.values
    ave_data = data.sum(axis=1) / T_num
    return ave_data

# Read the xlsx files
data = pd.read_excel(' The attachment 1.xlsx')
data_get = pd.read_excel(' The attachment 2.xlsx')
Num = len(data)   # number of samples
T_num = 3         # number of reviewing teachers
percent = 5       # screen out the bottom 5%
# Extract specific columns
X = data[['X1', 'X2', 'X3']].copy()
Xk1 = data[['X11', 'X21', 'X31']]
Xk2 = data[['X12', 'X22', 'X32']]
Xk3 = data[['X13', 'X23', 'X33']]
Xk4 = data[['X14', 'X24', 'X34']]
X_ave = get_ave(X, T_num)
Xk1_ave = get_ave(Xk1, T_num)
Xk2_ave = get_ave(Xk2, T_num)
Xk3_ave = get_ave(Xk3, T_num)
Xk4_ave = get_ave(Xk4, T_num)
data_get[' Average score of topic selection and review '] = Xk1_ave
data_get[' Average score of innovation and paper value '] = Xk2_ave
data_get[' Average score of scientific research ability and basic knowledge '] = Xk3_ave
data_get[' The normative average score of the paper '] = Xk4_ave
data_get[' The average of the total score of the thesis '] = X_ave
X['ave'] = X_ave
X['Tag'] = data['Tag']
lose = []
for i in range(1, 14):
    if i == 6 or i == 11:   # tags 6 and 11 have no papers
        print('empty')
    else:
        Tag = X.loc[X['Tag'] == i]
        percent_val = np.percentile(Tag['ave'], percent)
        # Flag papers whose average score falls below the 5th percentile of their discipline;
        # this assumes the attachment's rows are grouped by Tag, so the concatenated flags
        # line up with the original row order
        lose += list(Tag['ave'] < percent_val)
data_get[' Whether to eliminate '] = lose
data_get.to_excel('Pro_ The attachment 2.xlsx', index=None)
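The elimination rule hinges on np.percentile: within each discipline, a paper is flagged when its average total score falls below the 5th percentile. A tiny standalone check of the cutoff, with made-up scores:

import numpy as np

scores = np.array([55, 60, 62, 70, 75, 80, 85, 88, 90, 95])
cutoff = np.percentile(scores, 5)   # 5th percentile, linearly interpolated
print(cutoff)                       # 57.25
print(scores < cutoff)              # only the score 55 falls below the cutoff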
Program for problem 2
# -*- coding: utf-8 -*-
"""
Created on Sat May 22 16:37:22 2021
@author: MYM
"""
import pandas as pd
import numpy as np

def get_ave(df1, T_num):
    # Row-wise average of the T_num reviewer columns
    data = df1.values
    ave_data = data.sum(axis=1) / T_num
    return ave_data

# Read the xlsx files
data = pd.read_excel(' The attachment 1.xlsx')
data_get = pd.read_excel(' The attachment 2.xlsx')
Num = len(data)   # number of samples
T_num = 3         # number of reviewing teachers
# Extract the total-score columns
X = data[['Tag', 'X1', 'X2', 'X3']].copy()
X_ave = get_ave(X[['X1', 'X2', 'X3']], T_num)   # average the score columns only, not 'Tag'
X['X_ave'] = X_ave
# Split the samples by discipline tag
Sub_dict = dict()
for i in range(1, 14):
    Tag = X.loc[X['Tag'] == i]
    Sub_dict.update({'Tag' + str(i): Tag})
Tag_std_dict = dict()
# Mean and spread, per discipline, of each paper's standard deviation across the three total scores
Tag_std_mean = pd.DataFrame(index=['mean', 'std'],
                            columns=['Tag' + str(i) for i in range(1, 14)])
for i in range(1, 14):
    Tag_val = Sub_dict.get('Tag' + str(i))[['X1', 'X2', 'X3']]
    Tag_std = Tag_val.values.std(axis=1)   # per-paper std across the three reviewers (ddof=0)
    Tag_std_dict.update({'Tag' + str(i): Tag_std})
    Tag_std_mean.loc['mean', 'Tag' + str(i)] = Tag_std.mean()
    Tag_std_mean.loc['std', 'Tag' + str(i)] = Tag_std.std()
Tag_std_mean.to_csv('Total scores.csv')
# Average level of each sub-item score and of the total score within each discipline
# Drop the comment columns
Tag_all = data.drop(columns=['R1', 'R2', 'R3'])
Tag_all_dict = dict()
for i in range(1, 14):
    Tag_temp = Tag_all.loc[Tag_all['Tag'] == i]
    Tag_all_dict.update({'Tag' + str(i): Tag_temp})
Tag_all_mean_dict = dict()
for i in range(1, 14):
    Tag_xk1 = Tag_all_dict.get('Tag' + str(i))[['X11', 'X21', 'X31']]
    Tag_xk2 = Tag_all_dict.get('Tag' + str(i))[['X12', 'X22', 'X32']]
    Tag_xk3 = Tag_all_dict.get('Tag' + str(i))[['X13', 'X23', 'X33']]
    Tag_xk4 = Tag_all_dict.get('Tag' + str(i))[['X14', 'X24', 'X34']]
    Tag_x = Tag_all_dict.get('Tag' + str(i))[['X1', 'X2', 'X3']]
    df1 = pd.DataFrame(index=['mean', 'std'], columns=['Xk1', 'Xk2', 'Xk3', 'Xk4', 'X'])
    df1['Xk1'] = [Tag_xk1.values.mean(), Tag_xk1.values.std()]
    df1['Xk2'] = [Tag_xk2.values.mean(), Tag_xk2.values.std()]
    df1['Xk3'] = [Tag_xk3.values.mean(), Tag_xk3.values.std()]
    df1['Xk4'] = [Tag_xk4.values.mean(), Tag_xk4.values.std()]
    df1['X'] = [Tag_x.values.mean(), Tag_x.values.std()]
    # Mean and std of each sub-item and of the total, per discipline (not split by teacher)
    Tag_all_mean_dict.update({'Tag' + str(i): df1})
for i in range(1, 14):
    if i == 6 or i == 11:   # tags 6 and 11 have no papers
        print('skip')
    else:
        ex = Tag_all_mean_dict.get('Tag' + str(i))
        ex.to_csv('Tag' + str(i) + '.csv')
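One detail worth noting: Tag_val.values.std(axis=1) goes through NumPy and therefore computes the population standard deviation (ddof=0), while calling .std() on the DataFrame itself would use the sample version (ddof=1). With only three scores per paper the two differ noticeably:

import pandas as pd

row = pd.DataFrame({'X1': [80], 'X2': [85], 'X3': [90]})
print(row.values.std(axis=1))   # about 4.08, population std (ddof=0), as in the program
print(row.std(axis=1).values)   # 5.0, sample std (ddof=1)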
Program for problem 3
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 10:47:17 2021
@author: MYM
"""
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

def get_custom_stopwords(stop_words_file):
    # Load a custom stop-word list, one word per line
    with open(stop_words_file, encoding='utf-8') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    return custom_stopwords_list

# Load the custom user dictionary
jieba.load_userdict("C:/Users/MYM/My_python_codes/DSA/user_dict.txt")
# Open the files; adjust the paths to your own machine
f1 = open("C:/Users/MYM/My_python_codes/DSA/text_words.txt", "r", encoding='GB2312', errors='ignore')
f2 = open("C:/Users/MYM/My_python_codes/DSA/text_words_token.txt", 'w', encoding='GB2312', errors='ignore')
# Tokenize each comment with jieba, separating words with spaces
for line in f1:
    seg_list = jieba.cut(line, cut_all=False)
    f2.write(" ".join(seg_list))
f1.close()
f2.close()
# Read back the tokenized comments; titles is a list with one tokenized comment per
# element -- check its length to make sure nothing went wrong
titles = open("C:/Users/MYM/My_python_codes/DSA/text_words_token.txt",
              encoding='GB2312', errors='ignore').read().split('\n')
# Load the stop words
stop_words_file = "C:/Users/MYM/My_python_codes/DSA/CNstopwords.txt"
stopwords = get_custom_stopwords(stop_words_file)
# Build the word-count matrix, i.e. turn the tokenized texts into a form KMeans accepts
count_vec = CountVectorizer(stop_words=stopwords)
km_matrix = count_vec.fit_transform(titles)
print(km_matrix.shape)
# Inspect the word vectors if needed
# print(km_matrix.toarray())
# Run the clustering
num_clusters = 8   # eight clusters; change as needed
km = KMeans(n_clusters=num_clusters)
km.fit(km_matrix)
clusters = km.labels_.tolist()
# clusters is a list of labels; its length should equal the number of titles
# len(clusters)
# Finally, write the cluster labels to a new txt file, one per line
f3 = open("C:/Users/MYM/My_python_codes/DSA/cluster.txt", 'w', encoding='GB2312', errors='ignore')
for i in clusters:
    f3.write(str(i))
    f3.write("\n")
f3.close()
# Optionally, write each comment prefixed with its cluster label:
# f1 = open("C:/Users/MYM/My_python_codes/DSA/text_words.txt", "r", encoding='GB2312', errors='ignore')
# f2 = open("C:/Users/MYM/My_python_codes/DSA/text_words_label.txt", 'w', encoding='GB2312', errors='ignore')
# counts = 0
# for line in f1:
#     f2.write(str(clusters[counts]))
#     f2.write(' ')
#     counts = counts + 1
#     f2.write(line)
# f1.close()
# f2.close()
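One caveat: KMeans initializes its centroids randomly, so the numeric label attached to each cluster can change between runs, while the supplementary program below assumes a fixed meaning for each label. Passing a seed makes the assignment repeatable (random_state is a standard scikit-learn parameter):

from sklearn.cluster import KMeans

# Fixing random_state makes the cluster labels reproducible across runs
km = KMeans(n_clusters=8, random_state=42)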
Supplementary program for problem 3
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 10:07:12 2021
@author: MYM
"""
import pandas as pd

data = pd.read_excel(' The attachment 1.xlsx')
# Read the clustering results, one label per line
clusters = []
f1 = open("C:/Users/MYM/My_python_codes/DSA/cluster.txt", 'r', encoding='GB2312', errors='ignore')
for line in f1:
    clusters.append(int(line))
f1.close()
# Read the per-discipline means and stds produced by the problem 2 program
tag_dict = dict()
tag = (1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13)
for i in tag:
    temp = pd.read_csv('Tag' + str(i) + '.csv')
    tag_dict.update({'Tag' + str(i): temp})
# Size of each cluster
num = list()
for i in range(8):
    clusters_temp = [s == i for s in clusters]
    num.append(sum(clusters_temp))
right = 0
# Encode each paper as a three-bit pattern by comparing its sub-scores with the
# discipline means, then count how often the comment's cluster label matches the
# label assumed for that pattern (note: clusters is indexed by row number here,
# which assumes one label per paper in row order)
for i in range(1246):
    Tag = data.loc[i, 'Tag']
    mean = tag_dict.get('Tag' + str(Tag))
    mean_x1 = mean.loc[0, 'Xk1']
    mean_x23 = (mean.loc[0, 'Xk2'] + mean.loc[0, 'Xk3']) / 2
    mean_x4 = mean.loc[0, 'Xk4']
    s = data.loc[i, 'R1']
    if pd.isnull(s) or type(s) == int:
        print('nan')
    else:
        if data.loc[i, 'X11'] >= mean_x1:
            if (data.loc[i, 'X12'] + data.loc[i, 'X13']) / 2 >= mean_x23:
                if data.loc[i, 'X14'] >= mean_x4:
                    if clusters[i] == 7:   # pattern 111
                        right += 1
                else:
                    if clusters[i] == 6:   # pattern 110
                        right += 1
            else:
                if data.loc[i, 'X14'] >= mean_x4:
                    if clusters[i] == 5:   # pattern 101
                        right += 1
                else:
                    if clusters[i] == 0:   # pattern 100
                        right += 1
        else:
            if (data.loc[i, 'X12'] + data.loc[i, 'X13']) / 2 >= mean_x23:
                if data.loc[i, 'X14'] >= mean_x4:
                    if clusters[i] == 4:   # pattern 011
                        right += 1
                else:
                    if clusters[i] == 3:   # pattern 010
                        right += 1
            else:
                if data.loc[i, 'X14'] >= mean_x4:
                    if clusters[i] == 2:   # pattern 001
                        right += 1
                else:
                    if clusters[i] == 1:   # pattern 000
                        right += 1
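The nested ifs above encode each paper as a three-bit pattern (topic-selection score vs. the discipline mean, the innovation/ability average vs. its mean, the standardization score vs. its mean) and compare the comment's cluster label with the label assumed for that pattern. An equivalent, more compact formulation, assuming the same pattern-to-label table as the code above:

# Assumed pattern-to-label table, the same one used by the nested ifs above
PATTERN_TO_CLUSTER = {(1, 1, 1): 7, (1, 1, 0): 6, (1, 0, 1): 5, (1, 0, 0): 0,
                      (0, 1, 1): 4, (0, 1, 0): 3, (0, 0, 1): 2, (0, 0, 0): 1}

def expected_cluster(x11, x1213_avg, x14, mean_x1, mean_x23, mean_x4):
    # Each bit records whether a sub-score is at or above its discipline mean
    pattern = (int(x11 >= mean_x1), int(x1213_avg >= mean_x23), int(x14 >= mean_x4))
    return PATTERN_TO_CLUSTER[pattern]

print(expected_cluster(9, 8, 7, 8, 8, 8))   # pattern (1, 1, 0) -> cluster 6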
Program for problem 4
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 12:20:33 2021
@author: MYM
"""
import pandas as pd

# Mean score and standard deviation of each discipline
tag_dict = dict()
tag = (1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13)
for i in tag:
    temp = pd.read_csv('Tag' + str(i) + '.csv')
    tag_dict.update({'Tag' + str(i): temp})
# Read the clustering results
clusters = []
f1 = open("C:/Users/MYM/My_python_codes/DSA/cluster_q4.txt", 'r', encoding='GB2312', errors='ignore')
for line in f1:
    clusters.append(int(line))
f1.close()
# Read attachment 1
data = pd.read_excel(' The attachment 1.xlsx')
all_score = data[['X1', 'X2', 'X3']].copy()
count = 0
# Adjust each reviewer's total score by half a standard deviation, in the direction
# indicated by that reviewer's comment cluster: label 0 raises a below-average score,
# any other label lowers an above-average score
for r_col, x_col in (('R1', 'X1'), ('R2', 'X2'), ('R3', 'X3')):
    for i in range(1246):
        Tag = data.loc[i, 'Tag']
        mean = tag_dict.get('Tag' + str(Tag))
        mean_x = mean.loc[0, 'X']
        std_x = mean.loc[1, 'X']
        s = data.loc[i, r_col]
        if pd.isnull(s) or type(s) == int:
            print('nan')
        else:
            if clusters[count] == 0:
                if all_score.loc[i, x_col] <= mean_x:
                    all_score.loc[i, x_col] = all_score.loc[i, x_col] + std_x / 2
            else:
                if all_score.loc[i, x_col] >= mean_x:
                    all_score.loc[i, x_col] = all_score.loc[i, x_col] - std_x / 2
            count = count + 1   # advance through the labels of non-empty comments
f_score = all_score.sum(axis=1) / 3
f_data = pd.read_excel('Pro_ The attachment 2.xlsx')
f_data[' Comprehensive score '] = f_score
f_data.to_excel('Pro_ The attachment 2.xlsx', index=None)
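The adjustment rule is easier to see in isolation. A small sketch of the same half-standard-deviation nudge with made-up numbers (adjust is a hypothetical helper, not part of the original script):

def adjust(score, mean_x, std_x, label):
    # Same rule as the loop above: cluster label 0 pulls a below-average score up by
    # half a standard deviation; any other label pulls an above-average score down
    if label == 0 and score <= mean_x:
        return score + std_x / 2
    if label != 0 and score >= mean_x:
        return score - std_x / 2
    return score

print(adjust(70, 75, 6, 0))   # 73.0: below-average score raised
print(adjust(80, 75, 3, 2))   # 78.5: above-average score lowered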
Program for problem 5
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 14:31:00 2021
@author: MYM
"""
import numpy as np
import pandas as pd

# Read the processed attachment 2 data
data_get = pd.read_excel('Pro_ The attachment 2.xlsx')
# Extract the eliminated papers
lose_paper = data_get.loc[data_get[' Whether to eliminate '] == True]
# Extract the excellent papers (top 10% by comprehensive score)
percent = 90
percent_val = np.percentile(data_get[' Comprehensive score '], percent)
win_paper = data_get.loc[data_get[' Comprehensive score '] > percent_val]
# Restrict the comparison to discipline 8
lose_paper = lose_paper[lose_paper['Tag'] == 8]
win_paper = win_paper[win_paper['Tag'] == 8]
cols = [' Average score of topic selection and review ', ' Average score of innovation and paper value ',
        ' Average score of scientific research ability and basic knowledge ',
        ' The normative average score of the paper ', ' The average of the total score of the thesis ',
        ' Comprehensive score ']
lose_paper_val = lose_paper[cols]
win_paper_val = win_paper[cols]
# Mean and standard deviation of each score column for the two groups
lose_mean = lose_paper_val.sum(axis=0) / len(lose_paper_val)
lose_std = lose_paper_val.std(axis=0)
win_mean = win_paper_val.sum(axis=0) / len(win_paper_val)
win_std = win_paper_val.std(axis=0)
print(lose_mean)
print(lose_std)
print(win_mean)
print(win_std)
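For the final comparison, the four printed Series can also be lined up in a single table; pd.concat with keys does this directly. A small convenience appended to the script above, not part of the original output:

summary = pd.concat([win_mean, win_std, lose_mean, lose_std], axis=1,
                    keys=['win_mean', 'win_std', 'lose_mean', 'lose_std'])
print(summary)   # one row per score column, one column per statistic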
Code and paper: https://github.com/xiaolingwei/DSA
Feel free to follow me on GitHub and CSDN.
This article is original; please credit the source when reposting.