2021 National Undergraduate Data Statistics and Analysis Competition
2022-07-28 17:42:00 【Bubble Java】
This article covers the following:
1. Data processing, filtering, and computation with pandas
2. Computation and processing of complex tabular data
3. Text analysis and unsupervised learning
4. Writing calculation results out to spreadsheets
5. Comparison of data characteristics
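All of the programs below follow the same pandas pattern: read an Excel attachment, compute or filter, and write the result back out. As a warm-up, here is a minimal sketch of that pattern; the file and column names are placeholders, not the competition data.

import pandas as pd

# Placeholder file and column names, for illustration only
df = pd.read_excel('scores.xlsx')              # read tabular data
keep = df[df['score'] >= df['score'].mean()]   # filter rows on a computed condition
keep.to_excel('screened.xlsx', index=None)     # write the result back to a spreadsheet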
Competition questions

(The competition problem statements appeared here as images in the original post.)

Ideas

(The solution outline appeared here as images in the original post.)
Code
Export the text data from the Excel file to a txt file
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 13:36:06 2021
@author: MYM
"""
import pandas as pd

# Read the xlsx file
data = pd.read_excel(' The attachment 1.xlsx')
text = open("C:/Users/MYM/My_python_codes/DSA/text_words.txt", 'w', encoding='GB2312', errors='ignore')
text_R = data[['R1', 'R2', 'R3']]
count = 0
# Write the three reviewer-comment columns one after another, one comment per line,
# skipping empty cells and collapsing embedded newlines into commas
for col in ['R1', 'R2', 'R3']:
    for s in text_R[col]:
        if pd.isnull(s) or type(s) == int:
            print('nan')
            count = count + 1
        else:
            s = s.replace('\n', ',')
            text.write(s)
            text.write('\n')
text.close()
Program for problem 1
# -*- coding: utf-8 -*-
"""
Created on Sat May 22 14:48:36 2021
@author: MYM
"""
import pandas as pd
import numpy as np

def get_ave(df1, T_num):
    # Row-wise average of the T_num reviewer columns
    data = df1.values
    ave_data = data.sum(axis=1) / T_num
    return ave_data

# Read the xlsx files
data = pd.read_excel(' The attachment 1.xlsx')
data_get = pd.read_excel(' The attachment 2.xlsx')
Num = len(data)   # number of samples
T_num = 3         # number of reviewing teachers
percent = 5       # screen out the bottom 5%
# Extract specific columns
X = data[['X1', 'X2', 'X3']].copy()
Xk1 = data[['X11', 'X21', 'X31']]
Xk2 = data[['X12', 'X22', 'X32']]
Xk3 = data[['X13', 'X23', 'X33']]
Xk4 = data[['X14', 'X24', 'X34']]
X_ave = get_ave(X, T_num)
Xk1_ave = get_ave(Xk1, T_num)
Xk2_ave = get_ave(Xk2, T_num)
Xk3_ave = get_ave(Xk3, T_num)
Xk4_ave = get_ave(Xk4, T_num)
data_get[' Average score of topic selection and review '] = Xk1_ave
data_get[' Average score of innovation and paper value '] = Xk2_ave
data_get[' Average score of scientific research ability and basic knowledge '] = Xk3_ave
data_get[' The normative average score of the paper '] = Xk4_ave
data_get[' The average of the total score of the thesis '] = X_ave
X['ave'] = X_ave
X['Tag'] = data['Tag']
lose = []
for i in range(1, 14):
    if i == 6 or i == 11:   # tags 6 and 11 have no papers
        print('empty')
    else:
        Tag = X.loc[X['Tag'] == i]
        percent_val = np.percentile(Tag['ave'], percent)
        # Flag papers whose average score falls below the 5th percentile of their discipline;
        # this assumes the attachment's rows are grouped by Tag, so the concatenated flags
        # line up with the original row order
        lose += list(Tag['ave'] < percent_val)
data_get[' Whether to eliminate '] = lose
data_get.to_excel('Pro_ The attachment 2.xlsx', index=None)
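The elimination rule hinges on np.percentile: within each discipline, a paper is flagged when its average total score falls below the 5th percentile. A tiny standalone check of the cutoff, with made-up scores:

import numpy as np

scores = np.array([55, 60, 62, 70, 75, 80, 85, 88, 90, 95])
cutoff = np.percentile(scores, 5)   # 5th percentile, linearly interpolated
print(cutoff)                       # 57.25
print(scores < cutoff)              # only the score 55 falls below the cutoff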
Program for problem 2
# -*- coding: utf-8 -*-
"""
Created on Sat May 22 16:37:22 2021
@author: MYM
"""
import pandas as pd
import numpy as np

def get_ave(df1, T_num):
    # Row-wise average of the T_num reviewer columns
    data = df1.values
    ave_data = data.sum(axis=1) / T_num
    return ave_data

# Read the xlsx files
data = pd.read_excel(' The attachment 1.xlsx')
data_get = pd.read_excel(' The attachment 2.xlsx')
Num = len(data)   # number of samples
T_num = 3         # number of reviewing teachers
# Extract the total-score columns
X = data[['Tag', 'X1', 'X2', 'X3']].copy()
X_ave = get_ave(X[['X1', 'X2', 'X3']], T_num)   # average the score columns only, not 'Tag'
X['X_ave'] = X_ave
# Split the samples by discipline tag
Sub_dict = dict()
for i in range(1, 14):
    Tag = X.loc[X['Tag'] == i]
    Sub_dict.update({'Tag' + str(i): Tag})
Tag_std_dict = dict()
# Mean and spread, per discipline, of each paper's standard deviation across the three total scores
Tag_std_mean = pd.DataFrame(index=['mean', 'std'],
                            columns=['Tag' + str(i) for i in range(1, 14)])
for i in range(1, 14):
    Tag_val = Sub_dict.get('Tag' + str(i))[['X1', 'X2', 'X3']]
    Tag_std = Tag_val.values.std(axis=1)   # per-paper std across the three reviewers (ddof=0)
    Tag_std_dict.update({'Tag' + str(i): Tag_std})
    Tag_std_mean.loc['mean', 'Tag' + str(i)] = Tag_std.mean()
    Tag_std_mean.loc['std', 'Tag' + str(i)] = Tag_std.std()
Tag_std_mean.to_csv('Total scores.csv')
# Average level of each sub-item score and of the total score within each discipline
# Drop the comment columns
Tag_all = data.drop(columns=['R1', 'R2', 'R3'])
Tag_all_dict = dict()
for i in range(1, 14):
    Tag_temp = Tag_all.loc[Tag_all['Tag'] == i]
    Tag_all_dict.update({'Tag' + str(i): Tag_temp})
Tag_all_mean_dict = dict()
for i in range(1, 14):
    Tag_xk1 = Tag_all_dict.get('Tag' + str(i))[['X11', 'X21', 'X31']]
    Tag_xk2 = Tag_all_dict.get('Tag' + str(i))[['X12', 'X22', 'X32']]
    Tag_xk3 = Tag_all_dict.get('Tag' + str(i))[['X13', 'X23', 'X33']]
    Tag_xk4 = Tag_all_dict.get('Tag' + str(i))[['X14', 'X24', 'X34']]
    Tag_x = Tag_all_dict.get('Tag' + str(i))[['X1', 'X2', 'X3']]
    df1 = pd.DataFrame(index=['mean', 'std'], columns=['Xk1', 'Xk2', 'Xk3', 'Xk4', 'X'])
    df1['Xk1'] = [Tag_xk1.values.mean(), Tag_xk1.values.std()]
    df1['Xk2'] = [Tag_xk2.values.mean(), Tag_xk2.values.std()]
    df1['Xk3'] = [Tag_xk3.values.mean(), Tag_xk3.values.std()]
    df1['Xk4'] = [Tag_xk4.values.mean(), Tag_xk4.values.std()]
    df1['X'] = [Tag_x.values.mean(), Tag_x.values.std()]
    # Mean and std of each sub-item and of the total, per discipline (not split by teacher)
    Tag_all_mean_dict.update({'Tag' + str(i): df1})
for i in range(1, 14):
    if i == 6 or i == 11:   # tags 6 and 11 have no papers
        print('skip')
    else:
        ex = Tag_all_mean_dict.get('Tag' + str(i))
        ex.to_csv('Tag' + str(i) + '.csv')
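One detail worth noting: Tag_val.values.std(axis=1) goes through NumPy and therefore computes the population standard deviation (ddof=0), while calling .std() on the DataFrame itself would use the sample version (ddof=1). With only three scores per paper the two differ noticeably:

import pandas as pd

row = pd.DataFrame({'X1': [80], 'X2': [85], 'X3': [90]})
print(row.values.std(axis=1))   # about 4.08, population std (ddof=0), as in the program
print(row.std(axis=1).values)   # 5.0, sample std (ddof=1)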
Program for problem 3
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 10:47:17 2021
@author: MYM
"""
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

def get_custom_stopwords(stop_words_file):
    # Load a custom stop-word list, one word per line
    with open(stop_words_file, encoding='utf-8') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    return custom_stopwords_list

# Load the custom user dictionary
jieba.load_userdict("C:/Users/MYM/My_python_codes/DSA/user_dict.txt")
# Open the files; adjust the paths to your own machine
f1 = open("C:/Users/MYM/My_python_codes/DSA/text_words.txt", "r", encoding='GB2312', errors='ignore')
f2 = open("C:/Users/MYM/My_python_codes/DSA/text_words_token.txt", 'w', encoding='GB2312', errors='ignore')
# Tokenize each comment with jieba, separating words with spaces
for line in f1:
    seg_list = jieba.cut(line, cut_all=False)
    f2.write(" ".join(seg_list))
f1.close()
f2.close()
# Read back the tokenized comments; titles is a list with one tokenized comment per
# element -- check its length to make sure nothing went wrong
titles = open("C:/Users/MYM/My_python_codes/DSA/text_words_token.txt",
              encoding='GB2312', errors='ignore').read().split('\n')
# Load the stop words
stop_words_file = "C:/Users/MYM/My_python_codes/DSA/CNstopwords.txt"
stopwords = get_custom_stopwords(stop_words_file)
# Build the word-count matrix, i.e. turn the tokenized texts into a form KMeans accepts
count_vec = CountVectorizer(stop_words=stopwords)
km_matrix = count_vec.fit_transform(titles)
print(km_matrix.shape)
# Inspect the word vectors if needed
# print(km_matrix.toarray())
# Run the clustering
num_clusters = 8   # eight clusters; change as needed
km = KMeans(n_clusters=num_clusters)
km.fit(km_matrix)
clusters = km.labels_.tolist()
# clusters is a list of labels; its length should equal the number of titles
# len(clusters)
# Finally, write the cluster labels to a new txt file, one per line
f3 = open("C:/Users/MYM/My_python_codes/DSA/cluster.txt", 'w', encoding='GB2312', errors='ignore')
for i in clusters:
    f3.write(str(i))
    f3.write("\n")
f3.close()
# Optionally, write each comment prefixed with its cluster label:
# f1 = open("C:/Users/MYM/My_python_codes/DSA/text_words.txt", "r", encoding='GB2312', errors='ignore')
# f2 = open("C:/Users/MYM/My_python_codes/DSA/text_words_label.txt", 'w', encoding='GB2312', errors='ignore')
# counts = 0
# for line in f1:
#     f2.write(str(clusters[counts]))
#     f2.write(' ')
#     counts = counts + 1
#     f2.write(line)
# f1.close()
# f2.close()
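One caveat: KMeans initializes its centroids randomly, so the numeric label attached to each cluster can change between runs, while the supplementary program below assumes a fixed meaning for each label. Passing a seed makes the assignment repeatable (random_state is a standard scikit-learn parameter):

from sklearn.cluster import KMeans

# Fixing random_state makes the cluster labels reproducible across runs
km = KMeans(n_clusters=8, random_state=42)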
Supplementary program for problem 3
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 10:07:12 2021
@author: MYM
"""
import pandas as pd

data = pd.read_excel(' The attachment 1.xlsx')
# Read the clustering results, one label per line
clusters = []
f1 = open("C:/Users/MYM/My_python_codes/DSA/cluster.txt", 'r', encoding='GB2312', errors='ignore')
for line in f1:
    clusters.append(int(line))
f1.close()
# Read the per-discipline means and stds produced by the problem 2 program
tag_dict = dict()
tag = (1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13)
for i in tag:
    temp = pd.read_csv('Tag' + str(i) + '.csv')
    tag_dict.update({'Tag' + str(i): temp})
# Size of each cluster
num = list()
for i in range(8):
    clusters_temp = [s == i for s in clusters]
    num.append(sum(clusters_temp))
right = 0
# Encode each paper as a three-bit pattern by comparing its sub-scores with the
# discipline means, then count how often the comment's cluster label matches the
# label assumed for that pattern (note: clusters is indexed by row number here,
# which assumes one label per paper in row order)
for i in range(1246):
    Tag = data.loc[i, 'Tag']
    mean = tag_dict.get('Tag' + str(Tag))
    mean_x1 = mean.loc[0, 'Xk1']
    mean_x23 = (mean.loc[0, 'Xk2'] + mean.loc[0, 'Xk3']) / 2
    mean_x4 = mean.loc[0, 'Xk4']
    s = data.loc[i, 'R1']
    if pd.isnull(s) or type(s) == int:
        print('nan')
    else:
        if data.loc[i, 'X11'] >= mean_x1:
            if (data.loc[i, 'X12'] + data.loc[i, 'X13']) / 2 >= mean_x23:
                if data.loc[i, 'X14'] >= mean_x4:
                    if clusters[i] == 7:   # pattern 111
                        right += 1
                else:
                    if clusters[i] == 6:   # pattern 110
                        right += 1
            else:
                if data.loc[i, 'X14'] >= mean_x4:
                    if clusters[i] == 5:   # pattern 101
                        right += 1
                else:
                    if clusters[i] == 0:   # pattern 100
                        right += 1
        else:
            if (data.loc[i, 'X12'] + data.loc[i, 'X13']) / 2 >= mean_x23:
                if data.loc[i, 'X14'] >= mean_x4:
                    if clusters[i] == 4:   # pattern 011
                        right += 1
                else:
                    if clusters[i] == 3:   # pattern 010
                        right += 1
            else:
                if data.loc[i, 'X14'] >= mean_x4:
                    if clusters[i] == 2:   # pattern 001
                        right += 1
                else:
                    if clusters[i] == 1:   # pattern 000
                        right += 1
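The nested ifs above encode each paper as a three-bit pattern (topic-selection score vs. the discipline mean, the innovation/ability average vs. its mean, the standardization score vs. its mean) and compare the comment's cluster label with the label assumed for that pattern. An equivalent, more compact formulation, assuming the same pattern-to-label table as the code above:

# Assumed pattern-to-label table, the same one used by the nested ifs above
PATTERN_TO_CLUSTER = {(1, 1, 1): 7, (1, 1, 0): 6, (1, 0, 1): 5, (1, 0, 0): 0,
                      (0, 1, 1): 4, (0, 1, 0): 3, (0, 0, 1): 2, (0, 0, 0): 1}

def expected_cluster(x11, x1213_avg, x14, mean_x1, mean_x23, mean_x4):
    # Each bit records whether a sub-score is at or above its discipline mean
    pattern = (int(x11 >= mean_x1), int(x1213_avg >= mean_x23), int(x14 >= mean_x4))
    return PATTERN_TO_CLUSTER[pattern]

print(expected_cluster(9, 8, 7, 8, 8, 8))   # pattern (1, 1, 0) -> cluster 6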
Program for problem 4
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 12:20:33 2021
@author: MYM
"""
import pandas as pd

# Mean score and standard deviation of each discipline
tag_dict = dict()
tag = (1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13)
for i in tag:
    temp = pd.read_csv('Tag' + str(i) + '.csv')
    tag_dict.update({'Tag' + str(i): temp})
# Read the clustering results
clusters = []
f1 = open("C:/Users/MYM/My_python_codes/DSA/cluster_q4.txt", 'r', encoding='GB2312', errors='ignore')
for line in f1:
    clusters.append(int(line))
f1.close()
# Read attachment 1
data = pd.read_excel(' The attachment 1.xlsx')
all_score = data[['X1', 'X2', 'X3']].copy()
count = 0
# Adjust each reviewer's total score by half a standard deviation, in the direction
# indicated by that reviewer's comment cluster: label 0 raises a below-average score,
# any other label lowers an above-average score
for r_col, x_col in (('R1', 'X1'), ('R2', 'X2'), ('R3', 'X3')):
    for i in range(1246):
        Tag = data.loc[i, 'Tag']
        mean = tag_dict.get('Tag' + str(Tag))
        mean_x = mean.loc[0, 'X']
        std_x = mean.loc[1, 'X']
        s = data.loc[i, r_col]
        if pd.isnull(s) or type(s) == int:
            print('nan')
        else:
            if clusters[count] == 0:
                if all_score.loc[i, x_col] <= mean_x:
                    all_score.loc[i, x_col] = all_score.loc[i, x_col] + std_x / 2
            else:
                if all_score.loc[i, x_col] >= mean_x:
                    all_score.loc[i, x_col] = all_score.loc[i, x_col] - std_x / 2
            count = count + 1   # advance through the labels of non-empty comments
f_score = all_score.sum(axis=1) / 3
f_data = pd.read_excel('Pro_ The attachment 2.xlsx')
f_data[' Comprehensive score '] = f_score
f_data.to_excel('Pro_ The attachment 2.xlsx', index=None)
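The adjustment rule is easier to see in isolation. A small sketch of the same half-standard-deviation nudge with made-up numbers (adjust is a hypothetical helper, not part of the original script):

def adjust(score, mean_x, std_x, label):
    # Same rule as the loop above: cluster label 0 pulls a below-average score up by
    # half a standard deviation; any other label pulls an above-average score down
    if label == 0 and score <= mean_x:
        return score + std_x / 2
    if label != 0 and score >= mean_x:
        return score - std_x / 2
    return score

print(adjust(70, 75, 6, 0))   # 73.0: below-average score raised
print(adjust(80, 75, 3, 2))   # 78.5: above-average score lowered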
Program for problem 5
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 14:31:00 2021
@author: MYM
"""
import numpy as np
import pandas as pd

# Read the processed attachment 2 data
data_get = pd.read_excel('Pro_ The attachment 2.xlsx')
# Extract the eliminated papers
lose_paper = data_get.loc[data_get[' Whether to eliminate '] == True]
# Extract the excellent papers (top 10% by comprehensive score)
percent = 90
percent_val = np.percentile(data_get[' Comprehensive score '], percent)
win_paper = data_get.loc[data_get[' Comprehensive score '] > percent_val]
# Restrict the comparison to discipline 8
lose_paper = lose_paper[lose_paper['Tag'] == 8]
win_paper = win_paper[win_paper['Tag'] == 8]
cols = [' Average score of topic selection and review ', ' Average score of innovation and paper value ',
        ' Average score of scientific research ability and basic knowledge ',
        ' The normative average score of the paper ', ' The average of the total score of the thesis ',
        ' Comprehensive score ']
lose_paper_val = lose_paper[cols]
win_paper_val = win_paper[cols]
# Mean and standard deviation of each score column for the two groups
lose_mean = lose_paper_val.sum(axis=0) / len(lose_paper_val)
lose_std = lose_paper_val.std(axis=0)
win_mean = win_paper_val.sum(axis=0) / len(win_paper_val)
win_std = win_paper_val.std(axis=0)
print(lose_mean)
print(lose_std)
print(win_mean)
print(win_std)
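For the final comparison, the four printed Series can also be lined up in a single table; pd.concat with keys does this directly. A small convenience appended to the script above, not part of the original output:

summary = pd.concat([win_mean, win_std, lose_mean, lose_std], axis=1,
                    keys=['win_mean', 'win_std', 'lose_mean', 'lose_std'])
print(summary)   # one row per score column, one column per statistic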
Code and paper: https://github.com/xiaolingwei/DSA
Feel free to follow me on GitHub and CSDN.
This article is original; please credit the source when reposting.