当前位置:网站首页>决策树预测红酒品质
决策树预测红酒品质
2022-06-28 13:16:00 【刺猬宝宝马超】
% Import the Excel data.
filename="D:\Matlab\bin\作业\B题 附件1.xlsx";
% Names of the 11 feature columns (the script later treats column 12 as the quality label).
Title=["非挥发性酸含量","挥发性酸含量","柠檬酸","糖含量","氯化物","游离二氧化硫","总二氧化硫","密度","PH值","硫酸盐","酒精"];
% Write the feature names into the workbook; no sheet or range is specified here.
% NOTE(review): this modifies the input workbook on every run - confirm that is intended.
xlswrite(filename,Title);
% Read the workbook without naming a sheet: numeric data, text cells and raw cells.
% NOTE(review): the output variable `all` shadows MATLAB's built-in all() for the rest of the script.
[data,top,all]=xlsread(filename);
% Read the sheet named '待预测数据集' (the records whose quality is to be predicted).
[data1,top1,all1]=xlsread(filename,'待预测数据集');
%------------------------------------------------------------------------------------------------------------------------------------
% Preprocess the data sets.
% Remove entries with missing values using rmmissing.
list=rmmissing(data);
list1=rmmissing(data1);
% z-score standardisation of the cleaned data (applied to every column of list).
R = zscore(list);
%-------------------------------------------------------------------------------------------------------------------------------------
% Randomly split the cleaned data into a training set and a held-out test set.
% The row count is taken from the data rather than hard-coded, because
% rmmissing may have removed rows; a fixed randperm(3867) would then either
% index past the end of list or silently ignore the extra rows.
nRows = size(list,1);
% Keep the original 3000-row training set when enough data is available,
% otherwise fall back to an 80/20 split so the test set is never empty.
nTrain = min(3000, floor(0.8*nRows));
P=randperm(nRows);
Train =list(P(1:nTrain),:);
rest=list(P(nTrain+1:end),:);
% Training data: columns 1-11 are the predictors, column 12 is the quality label.
P_Train=Train(:,1:11);
T_Train=Train(:,12);
% Test data, same column layout.
P_rest=rest(:,1:11);
T_rest=rest(:,12);
% Fit the decision-tree classifier.
ctree=ClassificationTree.fit(P_Train,T_Train);
% Show the tree, first as text and then in the graphical viewer.
view(ctree);
view(ctree,'mode','graph');
% Predict the quality of the held-out samples.
T_sim=predict(ctree,P_rest);
% V. Result analysis
% Tally, for each quality grade 3..9, how many samples appear in the full
% data set, the training set and the test set, and how many test samples
% were predicted correctly. All set sizes are measured from the actual
% vectors; the previous hard-coded 3867 / 3700 / 167 did not match the
% 3000-row training split performed above.
grades = 3:9;
nGrades = numel(grades);
nAll = size(list,1);
nTrainSet = numel(T_Train);
nTestSet = numel(T_rest);
total = zeros(1,nGrades);      % per-grade count in the full data set
count = zeros(1,nGrades);      % per-grade count in the training set
number = zeros(1,nGrades);     % per-grade count in the test set
number_sim = zeros(1,nGrades); % per-grade count of correct test predictions
for g = 1:nGrades
    q = grades(g);
    total(g) = sum(list(:,12) == q);
    count(g) = sum(T_Train == q);
    number(g) = sum(T_rest == q);
    number_sim(g) = sum(T_sim == q & T_rest == q);
end
% Share of each grade within the training set.
rate = count / nTrainSet;
fprintf('红酒测量总数:%d\n',nAll);
for g = 1:nGrades
    fprintf('品质为%d:%d\n',grades(g),total(g));
end
fprintf('训练集红酒测量总数:%d\n',nTrainSet);
for g = 1:nGrades
    fprintf('品质为%d:%d\n',grades(g),count(g));
end
fprintf('测试集红酒测量总数:%d\n',nTestSet);
for g = 1:nGrades
    fprintf('品质为%d:%d\n',grades(g),number(g));
end
% Overall accuracy on the test set, restricted to the tallied grades.
nCorrect = sum(number_sim);
nTested = sum(number);
fprintf('品质预测正确数:%d\n',nCorrect);
fprintf('错误数%d\n',nTested-nCorrect);
fprintf('准确率p:%f%%\n',nCorrect/nTested*100);
% Effect of the minimum number of samples per leaf on tree performance.
% Ten candidate leaf sizes, logarithmically spaced between 10 and 100.
leafs = logspace(1,2,10);
N = numel(leafs);
% Cross-validated loss of a tree grown with a given minimum leaf size.
cvLossForLeaf = @(minLeaf) kfoldLoss( ...
    ClassificationTree.fit(P_Train,T_Train,'crossval','on','minleaf',minLeaf));
% Evaluate the candidates in order; the result is an N-by-1 column.
err = arrayfun(cvLossForLeaf, leafs(:));
plot(leafs,err);
xlabel('叶子节点含有的最小样本数');
ylabel('交叉验证误差');
title('叶子节点含有的最小样本数对决策树性能的影响');
% Grow the tuned tree with a minimum leaf size of 10.
OptimalTree = ClassificationTree.fit(P_Train,T_Train,'minleaf',10);
view(OptimalTree,'mode','graph');
% 1. Resubstitution and cross-validation error of the tuned tree.
resubOpt = resubLoss(OptimalTree);
cvOptimal = crossval(OptimalTree);
lossOpt = kfoldLoss(cvOptimal);
% 2. The same two errors for the original, untuned tree.
resubDefault = resubLoss(ctree);
cvDefault = crossval(ctree);
lossDefault = kfoldLoss(cvDefault);
%% Pruning
% Ask cvLoss for the best pruning level (smallest tree size), then prune to it.
[~,~,~,bestlevel] = cvLoss(ctree,'subtrees','all','treesize','min');
cptree = prune(ctree,'Level',bestlevel);
view(cptree,'mode','graph');
% 1. Resubstitution and cross-validation error of the pruned tree.
resubPrune = resubLoss(cptree);
cvPruned = crossval(cptree);
lossPrune = kfoldLoss(cvPruned);
%------------------------------------------------------------------------------------------------------------------------------------
% Table 2 data: train a tree on a random sample of the labelled data and
% use it to predict the records of the to-be-predicted sheet.
% The original fixed sizes (pool of 2000 rows, 1000 training rows, 1000
% prediction rows) are kept as upper bounds but clamped to the rows that
% actually exist, so a smaller data set no longer causes an index error.
nPool = min(2000, size(list,1));
nTrain1 = min(1000, nPool);
nPredict = min(1000, size(list1,1));
P1=randperm(nPool);
Train1 =list(P1(1:nTrain1),:);
rest1=list1(1:nPredict,:);
% Training data: columns 1-11 are the predictors, column 12 is the quality label.
P_Train1=Train1(:,1:11);
T_Train1=Train1(:,12);
% Records to predict: predictors only.
P_rest1=rest1(:,1:11);
% Fit the decision-tree classifier.
ctree1=ClassificationTree.fit(P_Train1,T_Train1);
% Show the tree, first as text and then in the graphical viewer.
view(ctree1);
view(ctree1,'mode','graph');
% Predicted quality for the records of the to-be-predicted sheet.
T_sim1=predict(ctree1,P_rest1);
%------------------------------------------------------------------------------------------------------------------------------------
% TOPSIS-style scoring of the evaluation objects (rows of list):
%   1) positivise the user-selected indicator columns,
%   2) standardise the positivised indicators,
%   3) score every object by its weighted distance to the best and worst
%      value of each indicator, and rank the objects by that score.
[a,b]=size(list);
disp(['共有' num2str(a) '个评价对象, ' num2str(b-1) '个评价指标']);
position = input('请输入需要正向化处理的指标所在的列,例如[2,4,5]:');
disp('请输入需要处理的列指标类型(1:极小型,2:中间型, 3:区间型)')
type = input('例如第2列是极小型,第4列是区间型,第5列是中间型,输入[1,3,2]: ');
% position and type are paired element by element, so they must have the same length.
if numel(position) ~= numel(type)
    error('TOPSIS:sizeMismatch','position 和 type 的元素个数必须相同');
end
X=list;
% Positivise each selected column in place.
for k = 1 : numel(position)
    col = position(k);
    X(:,col) = Positivization(X(:,col),type(k),col);
end
disp('正向化后的矩阵 X = ')
disp(X)
weigh=[0.15;0.15;0.1;0.2;0.05;0.05;0.05;0.05;0.01;0.01;0.18];% indicator weights
nInd = numel(weigh);
% Standardise the POSITIVISED indicators. The previous code scored the
% z-score of the raw data, so the positivisation above had no effect.
R = zscore(X(:,1:nInd));
% Best and worst value of every indicator, taken across the objects (dim 1).
bestVal = max(R,[],1);
worstVal = min(R,[],1);
W = repmat(weigh',a,1);
% Weighted distances, summed across the indicators (dim 2), giving one value
% per evaluation object. The previous code reduced along the other
% dimension and produced one value per indicator instead.
D_P = sqrt(sum(((R-repmat(bestVal,a,1)) .^2) .* W ,2)); % D+ distance to the best values
D_N = sqrt(sum(((R-repmat(worstVal,a,1)) .^2) .* W ,2)); % D- distance to the worst values
S = D_N ./ (D_P+D_N); % unnormalised score
disp('最后的得分为:');
stand_S = S / sum(S);
% Rank the objects from highest to lowest score and show the row order.
[sorted_S,index] = sort(stand_S ,'descend');
disp(index);
引用函数Positivization.m
% function [输出变量] = 函数名称(输入变量)
function [posit_x] = Positivization(x,type,i)
% POSITIVIZATION Convert one indicator column to a "larger is better" form.
%
% Inputs:
%   x    - original column vector of the indicator to be positivised
%   type - indicator type (1: minimal type, 2: middle type, 3: interval type)
%   i    - index of the column in the original matrix; only used in the
%          progress messages
% Output:
%   posit_x - the positivised column vector
%
% Types 2 and 3 prompt interactively for the best value / the interval
% bounds. Any other type raises the error 'Positivization:invalidType'.
if type == 1 % minimal type: smaller is better
    disp(['第' num2str(i) '列是极小型,正在正向化'] )
    posit_x = max(x) - x;
    disp(['第' num2str(i) '列极小型正向化处理完成'] )
    disp('--------------------分界线--------------------')
elseif type == 2 % middle type: closer to a best value is better
    disp(['第' num2str(i) '列是中间型'] )
    best = input('请输入最佳的那一个值: ');
    M = max(abs(best-x));
    if M == 0
        % Every value already equals the best value; avoid 0/0.
        posit_x = ones(size(x));
    else
        posit_x = 1-abs(best-x)/M;
    end
    disp(['第' num2str(i) '列中间型正向化处理完成'] )
    disp('--------------------分界线--------------------')
elseif type == 3 % interval type: inside [lower, upper] is best
    disp(['第' num2str(i) '列是区间型'] )
    lower = input('请输入区间的下界: ');
    upper = input('请输入区间的上界: ');
    M = max([lower-min(x),max(x)-upper]);
    % Values inside the interval score 1; values outside lose score in
    % proportion to their distance from the interval. The masks replace the
    % former loop, whose index variable overwrote the column number i used
    % in the message below. M is positive whenever a mask selects anything.
    posit_x = ones(size(x));
    below = x < lower;
    above = x > upper;
    posit_x(below) = 1-(lower-x(below))/M;
    posit_x(above) = 1-(x(above)-upper)/M;
    disp(['第' num2str(i) '列区间型正向化处理完成'] )
    disp('--------------------分界线--------------------')
else
    % Fail with an explicit message instead of returning with posit_x unset.
    error('Positivization:invalidType', ...
        '没有这种类型的指标,请检查Type向量中是否有除了1、2、3之外的其他值');
end
end
边栏推荐
- Class structure in C language - dot
- 投资98万美元的Saas项目失败了
- Complete backpack beginner chapter "suggestions collection"
- How to find opportunities in a bear market?
- MySQL多表联合查询
- The $980000 SaaS project failed
- Google Earth Engine(GEE)——联合国粮农组织全球有机土壤面积(1992-2018年度)
- 我呕血收集融合了来自各路经典shell书籍的脚本教学,作为小白的你快点来吧
- How to solve the data inconsistency between redis and MySQL?
- scratch旅行相册 电子学会图形化编程scratch等级考试一级真题和答案解析2022年6月
猜你喜欢

thinkphp6 多级控制器目录访问解决方法

Mysql database literacy, do you really know what a database is

Commonly used "redmine" for # test bug

Forecast and Analysis on market scale and development trend of China's operation and maintenance security products in 2022

How to set auto format after saving code in vscade

Hubble database x a joint-stock commercial bank: upgrade the number management system of Guanzi, so that every RMB has an "ID card"

Tencent has confirmed that QQ has stolen numbers on a large scale, iphone14 has no chance of type-C, and 5g, the fourth largest operator, has officially released numbers. Today, more big news is here

新品体验:阿里云新一代本地SSD实例i4开放公测

一文抄 10 篇!韩国发表的顶级会议论文被曝抄袭,第一作者是“原罪”?

An idea plug-in that automatically generates unit tests, which improves the development efficiency by more than 70%!
随机推荐
Stackoverflow 2022 database annual survey
为什么越来越多的用户放弃 Swagger,选择Apifox
How about stock online account opening and account opening process? Is it safe to open a mobile account?
华泰证券开户怎么开 怎么办理开户最安全
微服务稳定性保障
Solution to directory access of thinkphp6 multi-level controller
matlab plotyy 坐标轴设置,[转载]Matlab plotyy画双纵坐标图实例[通俗易懂]
php获取数字的个位数并替换为指定的尾数
词云的可视化设计教程
Resume template Baidu online disk
单元测试 CI/CD
Centos7: switch MySQL users and log in to MySQL
Commonly used "redmine" for # test bug
Hang Seng Electronics: lightdb, a financial distributed database, has passed a number of evaluations by China Academy of communications technology
我呕血收集融合了来自各路经典shell书籍的脚本教学,作为小白的你快点来吧
Centos7——安装mysql5.7
After failing in the college entrance examination, he entered Harbin Institute of technology, but stayed in the university after graduation to be an "Explorer". Ding Xiao: scientific research is accum
电子元器件分销10亿俱乐部[通俗易懂]
Flutter series part: detailed explanation of GridView layout commonly used in flutter
Centos7:切换mysql用户并登录mysql