当前位置:网站首页>Chapter 7 XGBoost
Chapter 7 XGBoost
2022-07-29 16:14:00 【桑之未落0208】
1 XGBoost简介
- XGBoost是使用梯度提升框架实现的高效、灵活、可移植的机器学习库,全称是eXtreme Gradient Boosting,是GBDT(GBM)的一个C++实现。它将树的生成并行完成,从而提高学习速度。
- 一般地说,XGBoost的速度和性能优于sklearn.ensemble.GradientBoostingClassifier类。
- XGBoost的作者为华盛顿大学陈天奇,并封装了Python接口,随着在机器学习竞赛中的优异表现,其他学者封装完成了R/Julia等接口。
- 在实际的工作中,效果不错。
代码:
数据

训练
预测

2 Kaggle简介
Kaggle是一个数据分析的竞赛平台。
3 代码实现
泰坦尼克号的生死预测
数据说明:
pclass:舱位
sex:性别。使用0表示女性、1表示男性。
age:年龄,存在部分缺失值。使用决策树or随机森林or根据舱位,将舱位相同的均值作为该舱位存在缺失值的人的年龄等等。
fare:票价,与舱位有直接的联系。
cabin房间号:缺失值严重。
embarked:起始城市,有三个C、Q、S,外加缺失值Unknown,所以人为将embarked变成四列,
S——1000 C——0100 Q——0010

数据预处理:
代码:
使用logistic回归/随机森林/XGBoost
import xgboost as xgb
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import csv
def show_accuracy(a, b, tip):
    """Print and return the element-wise match rate of `a` vs `b`, in percent.

    a, b: array-likes of equal size; both are flattened before comparison.
    tip:  label prepended to the printed message.
    Returns the accuracy as a float in [0, 100].
    """
    acc = a.ravel() == b.ravel()
    acc_rate = 100 * float(acc.sum()) / a.size
    # Fixed: original used the Python 2 print statement, a SyntaxError on Python 3.
    print('%s正确率:%.3f%%' % (tip, acc_rate))
    return acc_rate
def _impute_age_with_rf(data, feature_cols):
    """Fill missing 'Age' values in `data` in place with a RandomForestRegressor.

    feature_cols must start with 'Age'; the remaining columns are used as
    predictors. The model is trained on rows where Age is known and used to
    predict the rows where it is missing.
    """
    data_for_age = data[feature_cols]
    age_exist = data_for_age.loc[data.Age.notnull()]  # rows with known age
    age_null = data_for_age.loc[data.Age.isnull()]    # rows to predict
    x = age_exist.values[:, 1:]
    y = age_exist.values[:, 0]
    rfr = RandomForestRegressor(n_estimators=1000)
    rfr.fit(x, y)
    age_hat = rfr.predict(age_null.values[:, 1:])
    data.loc[data.Age.isnull(), 'Age'] = age_hat


def load_data(file_name, is_train):
    """Load a Titanic CSV and return model-ready arrays.

    file_name: path of the CSV to read.
    is_train:  True for the training file (has a 'Survived' column).

    Returns (x, y) when is_train is True, otherwise (x, passenger_ids).
    Side effect: writes the preprocessed frame to 'New_Data.csv' and prints
    progress/diagnostic output.
    """
    data = pd.read_csv(file_name)
    # Encode sex: female -> 0, male -> 1.
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)
    # Fill missing fares with the median fare of the same passenger class.
    if len(data.Fare[data.Fare.isnull()]) > 0:
        fare = np.zeros(3)
        for f in range(0, 3):
            fare[f] = data[data.Pclass == f + 1]['Fare'].dropna().median()
        for f in range(0, 3):
            data.loc[(data.Fare.isnull()) & (data.Pclass == f + 1), 'Fare'] = fare[f]
    # Impute missing ages with a random forest. The training file may also use
    # 'Survived' as a predictor; the test file has no such column.
    if is_train:
        print('随机森林预测缺失年龄:--start--')
        _impute_age_with_rf(data, ['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass'])
        print('随机森林预测缺失年龄:--over--')
    else:
        print('随机森林预测缺失年龄2:--start--')
        _impute_age_with_rf(data, ['Age', 'Fare', 'Parch', 'SibSp', 'Pclass'])
        print('随机森林预测缺失年龄2:--over--')
    # Missing embarkation port: default to Southampton, then one-hot encode.
    data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'
    embarked_data = pd.get_dummies(data.Embarked)
    print(embarked_data)
    embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
    data = pd.concat([data, embarked_data], axis=1)
    print(data.describe())
    data.to_csv('New_Data.csv')
    x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
    y = None
    if 'Survived' in data:
        y = data['Survived']
    x = np.array(x)
    y = np.array(y)
    # NOTE(review): tiling repeats every sample 5 times; combined with a later
    # random train/test split this leaks duplicates across the split and
    # inflates the measured accuracy (this is the blog's "思考" point).
    x = np.tile(x, (5, 1))
    y = np.tile(y, (5, ))
    if is_train:
        return x, y
    return x, data['PassengerId']
def write_result(c, c_type):
    """Predict on 'Titanic.test.csv' with classifier `c` and write Prediction_<c_type>.csv.

    c:      fitted model exposing .predict().
    c_type: 1 = logistic regression, 2 = random forest, 3 = XGBoost booster
            (a booster needs a DMatrix and returns probabilities).
    """
    file_name = 'Titanic.test.csv'
    x, passenger_id = load_data(file_name, False)
    # BUG FIX: the original tested `type == 3` (the builtin `type`, always
    # False), so the XGBoost branch crashed on a raw ndarray.
    if c_type == 3:
        x = xgb.DMatrix(x)
    y = c.predict(x)
    # Threshold probabilities at 0.5 to get 0/1 survival labels.
    y[y > 0.5] = 1
    y[~(y > 0.5)] = 0
    # 'w' with newline='' is the correct csv mode on Python 3 (the original
    # 'wb' raises TypeError there); the context manager guarantees the close.
    with open("Prediction_%d.csv" % c_type, "w", newline='') as predictions_file:
        writer = csv.writer(predictions_file)
        writer.writerow(["PassengerId", "Survived"])
        writer.writerows(zip(passenger_id, y))
if __name__ == "__main__":
    # Compare logistic regression, random forest and XGBoost on the same split.
    x, y = load_data('Titanic.train.csv', True)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

    # Logistic regression baseline.
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train)
    y_hat = lr.predict(x_test)
    lr_acc = accuracy_score(y_test, y_hat)
    # write_result(lr, 1)

    # Random forest.
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(x_train, y_train)
    y_hat = rfc.predict(x_test)
    rfc_acc = accuracy_score(y_test, y_hat)
    # write_result(rfc, 2)

    # XGBoost: train with an eval watch list on the held-out fold.
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    # NOTE(review): 'silent' is deprecated in newer xgboost releases
    # ('verbosity' replaces it) — confirm against the installed version.
    param = {'max_depth': 6, 'eta': 0.8, 'silent': 1, 'objective': 'binary:logistic'}
    bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)
    y_hat = bst.predict(data_test)  # binary:logistic predicts probabilities
    # write_result(bst, 3)
    y_hat[y_hat > 0.5] = 1
    y_hat[~(y_hat > 0.5)] = 0
    xgb_acc = accuracy_score(y_test, y_hat)

    # BUG FIX: accuracy_score returns a fraction in [0, 1]; the original
    # printed it with '%%' unscaled, e.g. "0.800%" instead of "80.000%".
    print('Logistic回归:%.3f%%' % (100 * lr_acc))
    print('随机森林:%.3f%%' % (100 * rfc_acc))
    print('XGBoost:%.3f%%' % (100 * xgb_acc))
猜你喜欢

win10 校验sha256

SSM整合案例分析(详解)

Twin all things digital visual | join the real world and the digital space

大规模线上应用TiDB会遇到的坑,本文都帮你排除好了

This article penetrates the architecture design and cluster construction of the distributed storage system Ceph (hands-on)

Groeb - "gramm, explicit and complete n -" gramm mask language model, implements the explicit n - "gramm semantic unit modeling knowledge.

CRM如何帮助企业营销获客

Chicken and rabbit in the same cage

【PCL学习笔记】点云处理常用的库和API(PCL库+Eigen)

不堆概念、换个角度聊多线程并发编程
随机推荐
[WeChat Mini Program] Component usage and attribute reference
【PCL学习笔记】点云处理常用的库和API(PCL库+Eigen)
uni-app深入学习之模板运用
sorting and searching 二分查找法
特殊的类——集合与泛型(C#)
2020年Mobileye收入近10亿美元,EyeQ芯片出货1930万颗
干货!如何使用仪表构造SRv6-TE性能测试环境
Tess4J 图片文字识别
大数阶乘计算
How CRM Helps Enterprise Marketing Acquire Customers
新建和编辑共用一个表单,编辑之后新建,form表单resetFields失效
木棒
leetcode:1901. 寻找峰值 II【二分找矩阵局部最大】
Turbine聚合监控
Automated win training script log
Store Information Management System
兆易创新2021年将从长鑫存储采购3亿美元DRAM产品
如何写好设计文档
店铺信息管理系统
This article penetrates the architecture design and cluster construction of the distributed storage system Ceph (hands-on)