当前位置:网站首页>2021 年全国大学生数据统计与分析竞赛
2021 年全国大学生数据统计与分析竞赛
2022-07-28 16:25:00 【泡泡Java】
本文涉及以下内容
1. pandas数据处理、筛选、计算
2. 复杂的表格数据计算与处理
3. 文本分析与无监督学习
4. 将计算结果输出表格
5. 数据特征对比
赛题



思路













代码
将excel文件中的文本数据转入txt文件
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 13:36:06 2021
@author: MYM
"""
import numpy as np
import pandas as pd
# read xlsx
# Dump the reviewer comments of 附件1.xlsx into a plain-text file, one comment
# per line: all of column R1 first, then R2, then R3.
data = pd.read_excel('附件1.xlsx')
text_R = data[['R1', 'R2', 'R3']]

# Number of cells skipped because they were missing or held a bare integer.
count = 0

# errors='ignore' silently drops any character GB2312 cannot encode.
with open("C:/Users/MYM/My_python_codes/DSA/text_words.txt", 'w',
          encoding='GB2312', errors='ignore') as text:
    # The original repeated the R1 loop three times; the commented-out
    # variant that accompanied it iterated R1, R2 and R3, which is what
    # this loop does.
    for column in ('R1', 'R2', 'R3'):
        for s in text_R[column]:
            if pd.isnull(s) or isinstance(s, int):
                # No usable text in this cell: report it and write nothing.
                print('nan')
                count = count + 1
                continue
            # Flatten embedded line breaks so each comment stays on one line.
            text.write(s.replace("\n", ','))
            text.write('\n')
问题1的程序
# -*- coding: utf-8 -*-
"""
Created on Sat May 22 14:48:36 2021
@author: MYM
"""
import pandas as pd
import numpy as np
def get_ave(df1, T_num):
    """Return the row-wise sum of ``df1`` divided by ``T_num``.

    Args:
        df1: DataFrame whose columns are summed for each row.
        T_num: Divisor applied to every row sum (the callers pass the
            number of reviewers).

    Returns:
        A one-dimensional NumPy array with one value per row of ``df1``.
    """
    return df1.values.sum(axis=1) / T_num
# Load the raw review scores (附件1) and the result sheet to be filled (附件2).
data = pd.read_excel('附件1.xlsx')
data_get = pd.read_excel('附件2.xlsx')
Num = len(data)  # number of samples
T_num = 3  # number of reviewers (teachers)
percent = 5  # flag the bottom 5% within each Tag

# Pick the three per-reviewer columns for the total score and each sub-score.
# X is copied because two helper columns are added to it further down.
X = data[['X1', 'X2', 'X3']].copy()
Xk1 = data[['X11', 'X21', 'X31']]
Xk2 = data[['X12', 'X22', 'X32']]
Xk3 = data[['X13', 'X23', 'X33']]
Xk4 = data[['X14', 'X24', 'X34']]

# Average each score group over the reviewers.
X_ave = get_ave(X, T_num)
Xk1_ave = get_ave(Xk1, T_num)
Xk2_ave = get_ave(Xk2, T_num)
Xk3_ave = get_ave(Xk3, T_num)
Xk4_ave = get_ave(Xk4, T_num)

data_get['选题与综述平均分'] = Xk1_ave
data_get['创新性及论文价值平均分'] = Xk2_ave
data_get['科研能力与基础知识平均分'] = Xk3_ave
data_get['论文规范性平均分'] = Xk4_ave
data_get['论文总分平均分'] = X_ave

X['ave'] = X_ave
X['Tag'] = data['Tag']

# Flag rows whose average total falls below the `percent`-th percentile of
# their own Tag group. Results are stored by row index, so the flags stay
# aligned with the rows even when the sheet is not sorted by Tag.
lose = pd.Series(False, index=X.index)
for i in range(1, 14):
    Tag = X.loc[X['Tag'] == i]
    if Tag.empty:
        # np.percentile cannot handle an empty group; nothing to flag.
        print('empty')
        continue
    percent_val = np.percentile(Tag['ave'], percent)
    lose.loc[Tag.index] = Tag['ave'] < percent_val

# Assigned positionally: 附件2 is assumed to list the same rows in the same
# order as 附件1 -- TODO confirm.
data_get['是否淘汰'] = lose.to_numpy()
data_get.to_excel('Pro_附件2.xlsx', index=False)
问题2的程序
# -*- coding: utf-8 -*-
"""
Created on Sat May 22 16:37:22 2021
@author: MYM
"""
import pandas as pd
import numpy as np
def get_ave(df1, T_num):
    """Return the row-wise sum of ``df1`` divided by ``T_num``.

    Args:
        df1: DataFrame whose columns are summed for each row. Every column
            takes part in the sum, so pass score columns only.
        T_num: Divisor applied to every row sum (the callers pass the
            number of reviewers).

    Returns:
        A one-dimensional NumPy array with one value per row of ``df1``.
    """
    return df1.values.sum(axis=1) / T_num
# Load the raw review scores; 附件2 is read as well but not used below.
data = pd.read_excel('附件1.xlsx')
data_get = pd.read_excel('附件2.xlsx')
Num = len(data)  # number of samples
T_num = 3  # number of reviewers (teachers)

tag_keys = ['Tag' + str(i) for i in range(1, 14)]
total_cols = ['X1', 'X2', 'X3']

# Total-score columns plus the Tag. The average is taken over the three score
# columns only; passing the whole frame would add the Tag value into the sum.
X = data[['Tag', 'X1', 'X2', 'X3']].copy()
X_ave = get_ave(X[total_cols], T_num)
X['X_ave'] = X_ave

# Rows of X split by Tag value 1..13.
Sub_dict = dict()
for i in range(1, 14):
    Sub_dict['Tag' + str(i)] = X.loc[X['Tag'] == i]

# For every Tag: the std (population, ddof=0) of the three total scores is
# computed per row, then the mean and the std of those per-row values are kept.
Tag_std_dict = dict()
Tag_std_mean = pd.DataFrame(index=['mean', 'std'], columns=tag_keys)
for key in tag_keys:
    Tag_std = Sub_dict[key][total_cols].values.std(axis=1)
    Tag_std_dict[key] = Tag_std
    Tag_std_mean.loc['mean', key] = Tag_std.mean()
    Tag_std_mean.loc['std', key] = Tag_std.std()
Tag_std_mean.to_csv('Total scores.csv')

# Per-Tag level of every sub-score and of the total score.
# Work on all columns except the free-text comments.
Tag_all = data.drop(columns=['R1', 'R2', 'R3'])
Tag_all_dict = dict()
for i in range(1, 14):
    Tag_all_dict['Tag' + str(i)] = Tag_all.loc[Tag_all['Tag'] == i]

# Output column -> the three per-reviewer source columns pooled together.
score_groups = {
    'Xk1': ['X11', 'X21', 'X31'],
    'Xk2': ['X12', 'X22', 'X32'],
    'Xk3': ['X13', 'X23', 'X33'],
    'Xk4': ['X14', 'X24', 'X34'],
    'X': ['X1', 'X2', 'X3'],
}

# Mean and std of each score group per Tag, pooling all reviewers.
Tag_all_mean_dict = dict()
for key in tag_keys:
    rows = Tag_all_dict[key]
    df1 = pd.DataFrame(index=['mean', 'std'], columns=list(score_groups))
    for name, cols in score_groups.items():
        values = rows[cols].values
        df1[name] = [values.mean(), values.std()]
    Tag_all_mean_dict[key] = df1

# Write one CSV per Tag; tags 6 and 11 are skipped and get no file.
for i in range(1, 14):
    if i == 6 or i == 11:
        print('skip')
        continue
    Tag_all_mean_dict['Tag' + str(i)].to_csv('Tag' + str(i) + '.csv')
问题3的程序
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 10:47:17 2021
@author: MYM
"""
import numpy as np
import pandas as pd
import jieba
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
def get_custom_stopwords(stop_words_file):
    """Read a UTF-8 stop-word file and return its lines as a list.

    Args:
        stop_words_file: Path of a text file with one stop word per line.

    Returns:
        The file content split on ``'\\n'``. A file that ends with a
        newline therefore yields a trailing empty string.
    """
    with open(stop_words_file, encoding='utf-8') as f:
        stopwords = f.read()
    return stopwords.split('\n')
# Load the user dictionary of custom words for jieba.
jieba.load_userdict("C:/Users/MYM/My_python_codes/DSA/user_dict.txt")

# Segment every comment line with jieba and write the tokens, separated by
# spaces, to the token file. Adjust the paths to your own machine.
with open("C:/Users/MYM/My_python_codes/DSA/text_words.txt", "r",
          encoding='GB2312', errors='ignore') as f1, \
        open("C:/Users/MYM/My_python_codes/DSA/text_words_token.txt", 'w',
             encoding='GB2312', errors='ignore') as f2:
    for line in f1:
        seg_list = jieba.cut(line, cut_all=False)
        # NOTE(review): as written this replace() swaps a space for a space
        # and changes nothing; the intended arguments may have been lost
        # when the code was published -- confirm against the repository.
        f2.write((" ".join(seg_list)).replace(" ", " "))

# Read the tokenised comments back, one document per line.
with open("C:/Users/MYM/My_python_codes/DSA/text_words_token.txt",
          encoding='GB2312', errors='ignore') as f_tokens:
    titles = f_tokens.read().split('\n')
# split('\n') leaves an empty string after the final newline; drop it so no
# phantom empty document is vectorised and clustered.
if titles and titles[-1] == '':
    titles.pop()

# Load the stop-word list.
stop_words_file = "C:/Users/MYM/My_python_codes/DSA/CNstopwords.txt"
stopwords = get_custom_stopwords(stop_words_file)

# Build the bag-of-words matrix (stop words removed) that KMeans consumes.
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer(stop_words=stopwords)
km_matrix = count_vec.fit_transform(titles)
print(km_matrix.shape)
# Inspect the matrix if needed:
# print(km_matrix.toarray())

# Cluster the documents.
from sklearn.cluster import KMeans
num_clusters = 8  # eight clusters; change as needed
# NOTE(review): no random_state is given, so the label numbering may differ
# from run to run -- confirm before giving fixed meanings to label values.
km = KMeans(n_clusters=num_clusters)
km.fit(km_matrix)
clusters = km.labels_.tolist()
# `clusters` holds one label per entry of `titles`.

# Write the labels to a text file, one per line.
with open("C:/Users/MYM/My_python_codes/DSA/cluster.txt", 'w',
          encoding='GB2312', errors='ignore') as f3:
    for i in clusters:
        f3.write(str(i))
        f3.write("\n")
# f1 = open("C:/Users/MYM/My_python_codes/DSA/text_words.txt","r",encoding='GB2312',errors='ignore')
# f2 = open("C:/Users/MYM/My_python_codes/DSA/text_words_label.txt",'w',encoding='GB2312',errors='ignore')
# counts = 0
# for line in f1:
# f2.write(str(clusters[counts]))
# f2.write(' ')
# counts = counts + 1
# f2.write(line)
# f1.close()
# f2.close()
问题3的补充程序
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 10:07:12 2021
@author: MYM
"""
import numpy as np
import pandas as pd
data = pd.read_excel('附件1.xlsx')

# Read the cluster labels, one integer per line. int() replaces eval() so a
# line can never be executed as code; blank lines are ignored.
with open("C:/Users/MYM/My_python_codes/DSA/cluster.txt", 'r',
          encoding='GB2312', errors='ignore') as f1:
    clusters = [int(line) for line in f1 if line.strip()]

# Per-Tag statistics tables. Row 0 is read as the mean below; these CSVs are
# written with index ['mean', 'std'] by the problem-2 script.
tag_dict = dict()
tag = (1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13)
for i in tag:
    tag_dict['Tag' + str(i)] = pd.read_csv('Tag' + str(i) + '.csv')

# Size of each of the eight clusters.
num = [clusters.count(i) for i in range(8)]

# Expected cluster label for each pattern of
# (X11 >= mean Xk1, mean(X12, X13) >= mean of Xk2/Xk3, X14 >= mean Xk4).
expected_label = {
    (True, True, True): 7,     # 111
    (True, True, False): 6,    # 110
    (True, False, True): 5,    # 101
    (True, False, False): 0,   # 100
    (False, True, True): 4,    # 011
    (False, True, False): 3,   # 010
    (False, False, True): 2,   # 001
    (False, False, False): 1,  # 000
}

right = 0  # rows whose cluster label matches the expected one
# Position of the current comment inside `clusters`. The text export writes
# no line for rows it skips, so labels are consumed only for rows that have
# a comment rather than being indexed by row number.
label_pos = 0
for i in range(len(data)):
    s = data.loc[i, 'R1']
    if pd.isnull(s) or isinstance(s, int):
        print('nan')
        continue
    label = clusters[label_pos]
    label_pos += 1

    Tag = data.loc[i, 'Tag']
    mean = tag_dict.get('Tag' + str(Tag))
    mean_x1 = mean.loc[0, 'Xk1']
    mean_x23 = (mean.loc[0, 'Xk2'] + mean.loc[0, 'Xk3']) / 2
    mean_x4 = mean.loc[0, 'Xk4']

    pattern = (
        bool(data.loc[i, 'X11'] >= mean_x1),
        bool((data.loc[i, 'X12'] + data.loc[i, 'X13']) / 2 >= mean_x23),
        bool(data.loc[i, 'X14'] >= mean_x4),
    )
    if label == expected_label[pattern]:
        right += 1
问题4的程序
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 12:20:33 2021
@author: MYM
"""
import numpy as np
import pandas as pd
# Per-Tag mean and standard deviation tables. Row 0 is read as the mean and
# row 1 as the std below; these CSVs are written with index ['mean', 'std']
# by the problem-2 script.
tag_dict = dict()
tag = (1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13)
for i in tag:
    tag_dict['Tag' + str(i)] = pd.read_csv('Tag' + str(i) + '.csv')

# Read the cluster labels, one integer per line. int() replaces eval() so a
# line can never be executed as code; blank lines are ignored.
with open("C:/Users/MYM/My_python_codes/DSA/cluster_q4.txt", 'r',
          encoding='GB2312', errors='ignore') as f1:
    clusters = [int(line) for line in f1 if line.strip()]

# Load 附件1. The scores are copied as floats so the adjusted values can be
# written back without modifying `data`.
data = pd.read_excel('附件1.xlsx')
all_score = data[['X1', 'X2', 'X3']].astype(float)

# Position of the current comment inside `clusters`. It advances once for
# every comment that has text, across R1, then R2, then R3; the original
# never advanced it, so every row was judged by clusters[0].
# NOTE(review): assumes cluster_q4.txt lists labels in that same order with
# no line for skipped comments -- confirm against the clustering run.
count = 0
for comment_col, score_col in (('R1', 'X1'), ('R2', 'X2'), ('R3', 'X3')):
    for i in range(len(data)):
        s = data.loc[i, comment_col]
        if pd.isnull(s) or isinstance(s, int):
            print('nan')
            continue
        label = clusters[count]
        count += 1

        Tag = data.loc[i, 'Tag']
        mean = tag_dict.get('Tag' + str(Tag))
        mean_x = mean.loc[0, 'X']
        std_x = mean.loc[1, 'X']

        score = all_score.loc[i, score_col]
        if label == 0:
            # Cluster 0: raise a score at or below the Tag mean by std/2.
            if score <= mean_x:
                all_score.loc[i, score_col] = score + std_x / 2
        else:
            # Any other cluster: lower a score at or above the mean by std/2.
            if score >= mean_x:
                all_score.loc[i, score_col] = score - std_x / 2

# Combined score: the mean of the three (possibly adjusted) reviewer totals.
f_score = all_score.sum(axis=1) / 3
f_data = pd.read_excel('Pro_附件2.xlsx')
f_data['综合得分'] = f_score
# index=False keeps the row index from being written as an extra column.
f_data.to_excel('Pro_附件2.xlsx', index=False)
问题5的程序
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 14:31:00 2021
@author: MYM
"""
import numpy as np
import pandas as pd
# Load the processed result sheet.
data_get = pd.read_excel('Pro_附件2.xlsx')

# Threshold for "excellent" theses: strictly above the 90th percentile of the
# combined score over all rows.
percent = 90
percent_val = np.percentile(data_get['综合得分'], percent)

# Flagged (eliminated) theses and excellent theses, each then narrowed to
# Tag 8.
lose_paper = data_get.loc[data_get['是否淘汰'].eq(True)]
win_paper = data_get.loc[data_get['综合得分'].gt(percent_val)]
lose_paper = lose_paper[lose_paper['Tag'].eq(8)]
win_paper = win_paper[win_paper['Tag'].eq(8)]

# The score columns that are compared between the two groups.
value_columns = [
    '选题与综述平均分',
    '创新性及论文价值平均分',
    '科研能力与基础知识平均分',
    '论文规范性平均分',
    '论文总分平均分',
    '综合得分',
]
lose_paper_val = lose_paper[value_columns]
win_paper_val = win_paper[value_columns]

# Column-wise mean (sum divided by row count) and standard deviation.
lose_mean = lose_paper_val.sum(axis=0) / len(lose_paper_val)
lose_std = lose_paper_val.std(axis=0)
win_mean = win_paper_val.sum(axis=0) / len(win_paper_val)
win_std = win_paper_val.std(axis=0)

# Print the four summaries in the order: lose mean, lose std, win mean,
# win std.
for summary in (lose_mean, lose_std, win_mean, win_std):
    print(summary)
代码与论文地址:https://github.com/xiaolingwei/DSA
欢迎关注我的github与csdn。
本文原创,转载请注明出处。
边栏推荐
- Kubernetes service and ingress you need to master
- Redis源码剖析,狠狠地拿捏了,赶紧码住
- Verilog daily question (vl27 settable counter)
- MySQL PgSQL realizes the merging of multiple lines of records into one line, grouping and merging, and dividing with specified characters
- Modeling Semantics with Gated Graph Neural Networks for KBQA
- DGL Chapter 1 (official tutorial) personal notes
- 我为什么选择使用Go语言?
- AMQP协议详解
- Introduction to the VS Code interface
- influxdb2的使用
猜你喜欢

C# 导入Excel文件数据的几种方法

高速电路中电阻的选择

Verilog daily question (VL2 asynchronous reset Series T trigger - Niuke network)

Message Passing for Complex Question Answering over Knowledge Graphs

Verilog 每日一题(VL2 异步复位的串联T触发器--牛客网)

线性代数及矩阵论(七)

net框架

Goweb开发之Beego框架实战:第四节 数据库配置及连接

Goweb开发之Beego框架实战:第一节 Beego框架介绍

异步FIFO基本原理(基于Verilog的简单实现)
随机推荐
Verilog daily question (vl27 settable counter)
【presto】presto 常用的命令
High speed circuit design practice -- Overview
influxdb2的使用
配置V530交换机步骤
Verilog daily question (VL2 asynchronous reset Series T trigger - Niuke network)
Shell编程之Sed
Use Alibaba cloud's free SSL certificate
DGL Chapter 1 (official tutorial) personal notes
Net framework
Some attention code explanations
Reasoning Over Semantic-Level Graph for Fact Checking
C#基础面试题(附答案)
Use of influxdb2
【impala】【报错解决】 Impala cannot read or execute the parent directory of dfs.domain.socket.path的解决方法
Handling .sqlite files encountered during Android development
MySQL数据库增删改查(基础操作命令详解)
How to protect image security during construction
格雷码和二进制的转换及典型例题(4bits格雷码计数器)
Visual Studio 2012/2015发布Web应用连同.cs源码一起发布