当前位置:网站首页>决策树预测红酒品质
决策树预测红酒品质
2022-06-28 13:16:00 【刺猬宝宝马超】
% Import the Excel data.
% NOTE(review): the workbook location is an absolute, machine-specific path.
filename="D:\Matlab\bin\作业\B题 附件1.xlsx";
% Eleven column labels (string array) that are written into the workbook below.
Title=["非挥发性酸含量","挥发性酸含量","柠檬酸","糖含量","氯化物","游离二氧化硫","总二氧化硫","密度","PH值","硫酸盐","酒精"];
% Write the labels into the workbook before reading it back.
% NOTE(review): no sheet/range is given, so this presumably overwrites the
% first cells of the default sheet of the input file itself - confirm intended.
xlswrite(filename,Title);
% Read the default sheet; the three outputs are the numeric, text and raw parts.
% NOTE(review): the third output is named `all`, which shadows MATLAB's
% built-in all() for the remainder of the script.
[data,top,all]=xlsread(filename);
% Read the sheet named '待预测数据集' in the same way.
[data1,top1,all1]=xlsread(filename,'待预测数据集');
%------------------------------------------------------------------------------------------------------------------------------------
% Pre-process the data sets.
% rmmissing removes the rows that contain missing values.
list=rmmissing(data);
list1=rmmissing(data1);
% z-score standardisation of every column of the cleaned data.
% NOTE(review): this is applied to all columns of `list`, including column 12,
% which the later sections treat as the class label.
R = zscore(list);
%-------------------------------------------------------------------------------------------------------------------------------------
% Randomly split the cleaned data into a training set and a test set.
% The number of rows is taken from `list` itself instead of a literal, so the
% split stays valid when rmmissing removes a different number of rows.
numSamples = size(list,1);
numTrain = 3000;   % number of rows used for training; the rest is the test set
if numSamples <= numTrain
    error('WineTree:notEnoughSamples', ...
        'Need more than %d rows for the train/test split, but only %d are available.', ...
        numTrain, numSamples);
end
P=randperm(numSamples);
Train =list(P(1:numTrain),:);
rest=list(P(numTrain+1:end),:);
% Training data: columns 1-11 are the predictors, column 12 is the label.
P_Train=Train(:,1:11);
T_Train=Train(:,12);
% Test data, same column layout.
P_rest=rest(:,1:11);
T_rest=rest(:,12);
% Build the decision-tree classifier.
ctree=ClassificationTree.fit(P_Train,T_Train);
% Show the tree, first as text and then in the graphical viewer.
view(ctree);
view(ctree,'mode','graph');
% Predict the labels of the test rows.
T_sim=predict(ctree,P_rest);
% V. Result analysis
% Every total is derived from the data that was actually used, so the report
% always agrees with the real train/test split (hard-coded totals such as
% 3700 and 167 do not match a 3000-row training set).
grades = 3:9;                      % quality grades covered by the report
nGrades = numel(grades);
nAllSamples   = size(list,1);      % rows in the cleaned data set
nTrainSamples = numel(T_Train);    % rows in the training set
nTestSamples  = numel(T_rest);     % rows in the test set
total_by_grade   = zeros(1,nGrades);   % per grade: rows in the whole data set
count_by_grade   = zeros(1,nGrades);   % per grade: rows in the training set
number_by_grade  = zeros(1,nGrades);   % per grade: rows in the test set
correct_by_grade = zeros(1,nGrades);   % per grade: test rows predicted correctly
for k = 1:nGrades
    g = grades(k);
    total_by_grade(k)   = sum(list(:,12) == g);
    count_by_grade(k)   = sum(T_Train == g);
    number_by_grade(k)  = sum(T_rest == g);
    correct_by_grade(k) = sum(T_sim == g & T_rest == g);
end
rate_by_grade = count_by_grade / nTrainSamples;   % share of each grade in the training set
nTested  = sum(number_by_grade);
nCorrect = sum(correct_by_grade);
% fprintf cycles its format over the 2-by-N matrix column by column, which
% prints one "grade: count" line per grade.
fprintf('红酒测量总数:%d\n',nAllSamples);
fprintf('品质为%d:%d\n',[grades; total_by_grade]);
fprintf('训练集红酒测量总数:%d\n',nTrainSamples);
fprintf('品质为%d:%d\n',[grades; count_by_grade]);
fprintf('测试集红酒测量总数:%d\n',nTestSamples);
fprintf('品质为%d:%d\n',[grades; number_by_grade]);
fprintf('品质预测正确数:%d\n',nCorrect);
fprintf('错误数%d\n',nTested-nCorrect);
fprintf('准确率p:%f%%\n',nCorrect/nTested*100);
% Effect of the minimum number of samples per leaf on tree performance.
% Ten logarithmically spaced candidate leaf sizes between 10 and 100.
leafs= logspace(1,2,10);
N= numel(leafs);
err= zeros(N,1);
% For every candidate leaf size, fit a cross-validated tree on the training
% data and store its k-fold loss.
for n= 1:N
t= ClassificationTree.fit(P_Train,T_Train,'crossval','on','minleaf',leafs(n));
err(n)= kfoldLoss(t);
end
% Plot cross-validation error against leaf size (the labels are runtime text).
plot(leafs,err);
xlabel('叶子节点含有的最小样本数');
ylabel('交叉验证误差');
title('叶子节点含有的最小样本数对决策树性能的影响');
% Fit the "optimised" tree with minleaf fixed to 10 and show it graphically.
OptimalTree=ClassificationTree.fit(P_Train,T_Train,'minleaf',10);
view(OptimalTree,'mode','graph');
% 1. Resubstitution error and cross-validation error of the optimised tree.
resubOpt= resubLoss(OptimalTree);
lossOpt= kfoldLoss(crossval(OptimalTree));
% 2. Resubstitution error and cross-validation error of the default tree.
resubDefault= resubLoss(ctree);
lossDefault= kfoldLoss(crossval(ctree));
%% Pruning
% Take the best pruning level reported by cvLoss (its fourth output) when all
% subtrees are evaluated with 'treesize' set to 'min', then prune to it.
[~,~,~,bestlevel]=cvLoss(ctree,'subtrees','all','treesize','min');
cptree= prune(ctree,'Level',bestlevel);
view(cptree,'mode','graph');
% 1. Resubstitution error and cross-validation error of the pruned tree.
resubPrune= resubLoss(cptree);
lossPrune= kfoldLoss(crossval(cptree));
%------------------------------------------------------------------------------------------------------------------------------------
% Table 2 data: train a tree on a random subset of the labelled data and
% predict the rows read from the '待预测数据集' sheet.
% The sizes are clamped to the rows that really exist, so smaller sheets no
% longer cause an index-out-of-range error; with enough data the result is
% the same as with the fixed sizes 2000 / 1000 / 1000.
% NOTE(review): the training rows are drawn only from the first 2000 rows of
% `list` - confirm that restricting the pool like this is intended.
poolSize1    = min(2000, size(list,1));    % rows of `list` eligible for sampling
trainSize1   = min(1000, poolSize1);       % rows used for training
predictSize1 = min(1000, size(list1,1));   % rows to predict
P1=randperm(poolSize1);
Train1 =list(P1(1:trainSize1),:);
rest1=list1(1:predictSize1,:);
% Training data: columns 1-11 are the predictors, column 12 is the label.
P_Train1=Train1(:,1:11);
T_Train1=Train1(:,12);
% Data to predict: predictor columns only.
P_rest1=rest1(:,1:11);
% Build the decision-tree classifier.
ctree1=ClassificationTree.fit(P_Train1,T_Train1);
% Show the tree, first as text and then in the graphical viewer.
view(ctree1);
view(ctree1,'mode','graph');
% Predict the labels of the rows in rest1.
T_sim1=predict(ctree1,P_rest1);
%------------------------------------------------------------------------------------------------------------------------------------
% Positivization followed by weighted TOPSIS scoring of every row of `list`.
[a,b]=size(list);
disp(['共有' num2str(a) '个评价对象, ' num2str(b-1) '个评价指标']);
position = input('请输入需要正向化处理的指标所在的列,例如[2,4,5]:');
disp('请输入需要处理的列指标类型(1:极小型,2:中间型, 3:区间型)')
type = input('例如第2列是极小型,第4列是区间型,第5列是中间型,输入[1,3,2]: ');
% position and type must have one entry per column that is processed.
if numel(position) ~= numel(type)
    error('WineTopsis:sizeMismatch', ...
        'position has %d entries but type has %d; they must match.', ...
        numel(position), numel(type));
end
X=list;
% numel (not size(...,2)) so that row- and column-vector input both work.
for i = 1 : numel(position)
    X(:,position(i)) = Positivization(X(:,position(i)),type(i),position(i));
end
disp('正向化后的矩阵 X = ')
disp(X)
weigh=[0.15;0.15;0.1;0.2;0.05;0.05;0.05;0.05;0.01;0.01;0.18]; % one weight per indicator
if numel(weigh) ~= b-1
    error('WineTopsis:weightMismatch', ...
        'Expected %d indicator weights but %d are defined.', b-1, numel(weigh));
end
% Standardise the POSITIVIZED indicators (label column excluded); scoring the
% raw data instead would make the positivization step above have no effect.
R = zscore(X(:,1:b-1));
r=R';                              % (b-1)-by-a: one row per indicator, one column per object
idealBest  = max(r,[],2);          % best value of every indicator over all objects
idealWorst = min(r,[],2);          % worst value of every indicator over all objects
W = repmat(weigh,1,a);
% Weighted Euclidean distance of every object to the ideal points: the sum
% runs over the indicators (dimension 1), giving one distance per object.
D_P = (sum(((r-repmat(idealBest ,1,a)).^2).*W,1).^0.5)';  % D+ : distance to the best point
D_N = (sum(((r-repmat(idealWorst,1,a)).^2).*W,1).^0.5)';  % D- : distance to the worst point
S = D_N ./ (D_P+D_N); % un-normalised score, a-by-1
disp('最后的得分为:');
stand_S = S / sum(S);
% index lists the evaluation objects (rows of list) from best to worst score.
[sorted_S,index] = sort(stand_S ,'descend');
disp(index);
上面脚本中调用的函数 Positivization.m(需单独保存为 Positivization.m 文件):
% function [输出变量] = 函数名称(输入变量)
function [posit_x] = Positivization(x,type,i)
% POSITIVIZATION Convert one indicator column to a "larger is better" scale.
%   posit_x = Positivization(x,type,i)
%
%   Inputs:
%     x    - original column vector of the indicator to convert
%     type - indicator type: 1 = smaller is better,
%                            2 = closer to a best value is better,
%                            3 = inside an interval is best
%     i    - column of the original matrix being processed (used only in the
%            progress messages)
%   Output:
%     posit_x - converted column vector, same size as x
%
%   Types 2 and 3 ask on the console for the best value / the interval bounds.
%   Any other type raises the error 'Positivization:invalidType'.
if type == 1 % smaller is better
    disp(['第' num2str(i) '列是极小型,正在正向化'] )
    posit_x = max(x) - x;
    disp(['第' num2str(i) '列极小型正向化处理完成'] )
    disp('--------------------分界线--------------------')
elseif type == 2 % closer to the best value is better
    disp(['第' num2str(i) '列是中间型'] )
    best = input('请输入最佳的那一个值: ');
    M = max(abs(best-x));
    if M == 0
        % Every value already equals the best value; avoid 0/0 = NaN.
        posit_x = ones(size(x));
    else
        posit_x = 1-abs(best-x)/M;
    end
    disp(['第' num2str(i) '列中间型正向化处理完成'] )
    disp('--------------------分界线--------------------')
elseif type == 3 % inside [a,b] is best
    disp(['第' num2str(i) '列是区间型'] )
    a = input('请输入区间的下界: ');
    b = input('请输入区间的上界: ');
    M = max([a-min(x),max(x)-b]);
    % Values inside the interval score 1; values outside lose score in
    % proportion to their distance from the nearer bound. Logical indexing
    % replaces the element loop, so the column number i is no longer
    % overwritten before the final message is printed.
    posit_x = ones(size(x));
    below = x < a;
    above = x > b;
    posit_x(below) = 1-(a-x(below))/M;
    posit_x(above) = 1-(x(above)-b)/M;
    disp(['第' num2str(i) '列区间型正向化处理完成'] )
    disp('--------------------分界线--------------------')
else
    % Fail with a clear message instead of returning with the output unset.
    error('Positivization:invalidType','%s', ...
        '没有这种类型的指标,请检查Type向量中是否有除了1、2、3之外的其他值');
end
end
边栏推荐
- [today in history] June 28: musk was born; Microsoft launches office 365; The inventor of Chua's circuit was born
- 5A同步整流芯片 20V转12V2A/5V4.5A大电流 24W大功率同步整流芯片 大电流降压IC FS2462
- Arduino-ESP32闪存文件插件程序搭建和上传
- StackOverflow 2022数据库年度调查
- Rk3399 platform development series explanation (use part) pinctrl subsystem introduction - Video Introduction
- PHP抓取网页获取特定信息
- plt. Usage of savefig() and save path
- 泛海微FH511单片机IC方案小家电LED照明MCU丝印FH511
- 895. longest ascending subsequence
- Hang Seng Electronics: lightdb, a financial distributed database, has passed a number of evaluations by China Academy of communications technology
猜你喜欢

Fastposter v2.8.4 release e-commerce poster generator

为什么越来越多的用户放弃 Swagger,选择Apifox

Fh511+tp4333 form an outdoor mobile power lighting camping lamp scheme.

Solution to directory access of thinkphp6 multi-level controller

Stackoverflow 2022 database annual survey

STM32F1与STM32CubeIDE编程实例-矩阵键盘驱动

Forecast and Analysis on market scale and development trend of China's operation and maintenance security products in 2022

基于SSM实现水果蔬菜商城管理系统

Understand leveldb write operation

中二青年付杰的逆袭故事:从二本生到 ICLR 杰出论文奖,我用了20年
随机推荐
Watermaker of the Flink core
List集合转数组
Tiantian mathematics serial 53: February 22
An idea plug-in that automatically generates unit tests, which improves the development efficiency by more than 70%!
FH511+TP4333组成一个户外移动电源照明野营灯方案。
简历模板百度网盘自取
一文抄 10 篇!韩国发表的顶级会议论文被曝抄袭,第一作者是“原罪”?
go template with... End traversal usage
MySQL 8.0 launched histogram, which greatly improved the performance!
You must configure either the server or JDBC driver (via the ‘serverTimezone‘ configuration property
如何在熊市中寻找机会?
How to set auto format after saving code in VS Code
Customize MySQL connection pool
Mobile web training day-2
php获取数字的个位数并替换为指定的尾数
新品体验:阿里云新一代本地SSD实例i4开放公测
Mysql database literacy, do you really know what a database is
Rk3399 platform development series explanation (use part) pinctrl subsystem introduction - Video Introduction
Professional English calendar questions
在线JSON转PlainText工具