当前位置:网站首页>【笔记】电商订单数据分析实战
【笔记】电商订单数据分析实战
2022-07-01 05:41:00 【Sprite.Nym】
一、电商订单数据分析实战
1.1 数据清洗
# %load prep.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
ec_df = pd.read_csv('datas/ecomm/某电商平台2021年订单数据.csv', index_col=0)
# 转换orderTime和payTime为datetime
ec_df['orderTime'] = pd.to_datetime(ec_df['orderTime'])
ec_df['payTime'] = pd.to_datetime(ec_df['payTime'])
ec_df
# 提取出orderTime在2021年的数据
ec_df = ec_df[ec_df['orderTime'].dt.year == 2021]
ec_df
# 提取出payTime不晚于orderTime 30分钟的数据、支付金额和订单金额大于0的数据
ec_df = ec_df[(ec_df['payTime'] - ec_df['orderTime']).dt.total_seconds() <= 1800]
ec_df = ec_df[ec_df['payTime'] >= ec_df['orderTime']]
ec_df = ec_df[(ec_df['payment'] >= 0) & (ec_df['orderAmount'] >= 0)]
ec_df
# 填充chanelID列空值
ec_df['chanelID'] = ec_df['chanelID'].fillna(ec_df['chanelID'].mode()[0])
ec_df.info()
# 给渠道列和平台列改名
ec_df = ec_df.rename(columns={
'chanelID': 'channelID', 'platfromType': 'platformType'})
ec_df
# 改平台类型的错字
ser = ec_df['platformType'].replace(r'[\s·]', '', regex=True)
ser = ser.str.title()
ec_df['platformType'] = ser.replace(['薇信', 'Vx', '网页', '网站'],
['微信', '微信', 'Web', 'Web'])
# 将payment大于orderAmount的数据改过来
temp_df = ec_df[ec_df['payment'] <= ec_df['orderAmount']]
mean_discount = (temp_df['payment'] / temp_df['orderAmount']).mean()
ec_df['payment'] = np.round(
ec_df['payment'].where(
ec_df['payment'] <= ec_df['orderAmount'],
ec_df['orderAmount'] * mean_discount
)
)
1.2 数据分析
1.2.1 计算总体指标
# Headline metrics computed from the cleaned order frame ec_df.

# GMV: sum of the order amounts.
GMV = ec_df['orderAmount'].sum()
print(GMV)
# Total sales: sum of all payments, charged-back orders included.
all_payment = ec_df['payment'].sum()
print(all_payment)
# Actual sales: payments of the orders that were not charged back.
true_payment = ec_df.loc[ec_df['chargeback'] == False, 'payment'].sum()
print(true_payment)
# Return (chargeback) rate, formatted as a percentage with two decimals.
# The value is computed first and the f-string kept on a single line: a
# replacement field broken across lines inside a single-quoted f-string is a
# SyntaxError on Python versions before 3.12.
chargeback_pct = 100 * len(ec_df[ec_df['chargeback']]) / len(ec_df)
chargeback_ratio = f"{chargeback_pct:.2f}%"
print(chargeback_ratio)
# Average spend per customer: actual sales divided by the distinct user count.
ARPU = true_payment / ec_df['userID'].nunique()
print(ARPU)
1.2.2 计算每月GMV及趋势分析
# Monthly GMV and actual sales, plotted as a two-series line chart.

# Add a month column taken from the order timestamp.
ec_df['month'] = ec_df['orderTime'].dt.month
ec_df
# Add the "actual payment" column: the payment for orders that were not
# charged back, 0 for charged-back orders.
# NOTE: temp_df is an alias of ec_df (no copy is made), so every column added
# through temp_df also appears on ec_df.
temp_df = ec_df
temp_df['true_payment'] = temp_df['payment'].where(ec_df['chargeback'] == False, 0)
# Pivot: per-month sums of order amount (GMV) and actual payment.
monthly_gmv = pd.pivot_table(
    temp_df,
    index='month',
    values=['orderAmount', 'true_payment'],
    aggfunc='sum',
)
# Express the sums in units of 10,000, rounded to two decimals. Vectorised
# arithmetic replaces DataFrame.applymap, which is deprecated in recent pandas.
monthly_gmv = (monthly_gmv / 10000).round(2)
# Build the chart.
import pyecharts.options as opts
from pyecharts.charts import Line
gmv_line = (
    Line()
    # x axis: month numbers
    .add_xaxis(monthly_gmv.index.to_list())
    # series 1: GMV
    .add_yaxis('GMV', monthly_gmv['orderAmount'].to_list())
    # series 2: actual (net) sales
    .add_yaxis('实际销售额', monthly_gmv['true_payment'].to_list())
    # chart title and axis names
    .set_global_opts(
        title_opts=opts.TitleOpts(title="2021年按月销售额图"),
        yaxis_opts=opts.AxisOpts(name='GMV(单位:万元)'),
        xaxis_opts=opts.AxisOpts(name='时间(单位:月)'),
    )
)
gmv_line.render_notebook()

1.2.3 流量渠道来源拆解GMV占比
# Share of GMV contributed by each traffic channel, drawn as a ring chart.

# Pivot: total order amount per channelID.
channel_gmv = pd.pivot_table(temp_df, index='channelID', values=['orderAmount'], aggfunc='sum')
# Express the totals in units of 10,000. Plain vectorised division replaces
# DataFrame.applymap, which is deprecated in recent pandas.
channel_gmv = channel_gmv / 10000
channel_gmv
# Build the chart.
from pyecharts.charts import Pie
channel_pie = (
    Pie()
    # data as [channelID, value] pairs; inner/outer radius give the ring shape
    .add(
        '',
        channel_gmv.reset_index().values.tolist(),
        radius=["50%", "75%"],
    )
    # title plus a vertical legend placed near the top-left corner
    .set_global_opts(
        title_opts=opts.TitleOpts(title="各渠道GMV占比"),
        legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="2%"),
    )
    # label each slice as "<name>: <percentage>%"
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
)
channel_pie.render_notebook()

1.2.4 按星期几统计下单量
# Number of distinct orders per day of the week, drawn as a bar chart.

# Label each order with its weekday. dt.weekday is 0 for Monday, so the labels
# run from '星期1' (Monday) to '星期7' (Sunday).
# The f-string is kept on a single line: a replacement field broken across
# lines inside a single-quoted f-string is a SyntaxError before Python 3.12.
temp_df['weekday'] = temp_df['orderTime'].dt.weekday.map(
    lambda day_idx: f'星期{day_idx + 1}'
)
temp_df
# Pivot: count of unique orderID values per weekday label.
weekday_count = pd.pivot_table(temp_df, index='weekday', values='orderID', aggfunc='nunique')
weekday_count
# Build the chart.
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType
weekday_bar = (
    Bar({"theme": ThemeType.LIGHT})
    # x axis: weekday labels
    .add_xaxis(weekday_count.index.tolist())
    # y axis: order counts
    .add_yaxis("下单量", weekday_count['orderID'].tolist())
    # chart title
    .set_global_opts(title_opts=opts.TitleOpts(title="各星期数下单量"))
    # show value labels and mark lines at the minimum and maximum
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=True),
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="min", name="最小值"),
                opts.MarkLineItem(type_="max", name="最大值"),
            ]
        ),
    )
)
weekday_bar.render_notebook()

1.2.5 根据下单时段统计下单量(30分钟一段)
# Number of distinct orders per 30-minute slot of the day, drawn as a bar chart.

# Floor each order time to its 30-minute slot and keep it as an 'HH:MM' label.
# '30min' is used instead of the '30T' alias, which newer pandas releases
# deprecate; both denote the same 30-minute frequency.
temp_df['hour'] = temp_df['orderTime'].dt.floor('30min').dt.strftime('%H:%M')
temp_df
# Pivot: count of unique orderID values per time slot.
time_count = pd.pivot_table(temp_df, index='hour', values='orderID', aggfunc='nunique')
time_count
# Build the chart.
# NOTE: the name weekday_bar is reused from the weekday chart even though this
# chart is per time slot; the name is kept so existing references still work.
weekday_bar = (
    Bar({"theme": ThemeType.ESSOS})
    # x axis: time-slot labels
    .add_xaxis(time_count.index.tolist())
    # y axis: order counts
    .add_yaxis("下单量", time_count['orderID'].tolist())
    # chart title plus a data-zoom control
    .set_global_opts(
        title_opts=opts.TitleOpts(title="各时段下单量"),
        datazoom_opts=opts.DataZoomOpts(),
    )
    # show value labels and mark lines at the minimum and maximum
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=True),
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="min", name="最小值"),
                opts.MarkLineItem(type_="max", name="最大值"),
            ]
        ),
    )
)
weekday_bar.render_notebook()

1.2.6 按月统计复购率
# Monthly repurchase rate: among the users who ordered in a month, the
# percentage who placed more than one distinct order in that month.

# Pivot: number of distinct orderID values per user (rows) and month (columns);
# a cell is NaN when the user placed no order in that month.
multiple_bought = pd.pivot_table(
    temp_df,
    index='userID',
    columns='month',
    values='orderID',
    aggfunc='nunique',
)
# Turn the counts into flags: 1 for more than one order, 0 for exactly one,
# NaN kept where the user did not buy. The vectorised comparison replaces the
# per-cell DataFrame.applymap lambda, which is deprecated in recent pandas.
bought_in_month = multiple_bought.notna()
multiple_bought = multiple_bought.gt(1).astype(int).where(bought_in_month)
# sum() counts the repeat buyers and count() the active buyers of each month,
# because both skip NaN cells.
np.round(100 * multiple_bought.sum() / multiple_bought.count(), 2)
# No chart for this one.
边栏推荐
- 葫芦儿 APP 使用帮助
- 我从技术到产品经理的几点体会
- Chapitre d'apprentissage mongodb: Introduction à la première leçon après l'installation
- 了解 JVM 中几个相关问题 — JVM 内存布局、类加载机制、垃圾回收
- HCM 初学 ( 一 ) - 简介
- Speed regulation and stroke control based on Ti drv8424 driving stepper motor
- 从底层结构开始学习FPGA----RAM IP的定制与测试
- Simple implementation of database connection pool
- Unity项目心得总结
- Numeric amount plus comma; JS two methods of adding three digits and a comma to numbers; JS data formatting
猜你喜欢

Actual combat: basic use of Redux

Leetcode top 100 question 2 Add two numbers

Advanced cross platform application development (II): uni app practice

为了保护自己的数据,他奋斗了一天一夜

Cockroachdb: the resistant geo distributed SQL database paper reading notes

从诺奖知“边缘计算”的未来!

El cascade echo failed; El cascader does not echo

为什么用葫芦儿派盘取代U盘?

Deeply understand the underlying implementation principle of countdownlatch in concurrent programming

【医学分割】u2net
随机推荐
2/15 (awk, awk conditions, awk processing design can perform additional tasks, and use awk array +for loop to realize advanced search)
Detailed explanation of set
为什么用葫芦儿派盘取代U盘?
导数的左右极限和左右导数的辨析
Fiber Bragg grating (FBG) notes [1]: waveguide structure and Bragg wavelength derivation
Flowable source code comment (XXXIX) task listener
What is the at instruction set often used in the development of IOT devices?
Cockroachdb: the resistant geo distributed SQL database paper reading notes
穿越派·派盘 + 思源笔记 = 私人笔记本
ssm+mysql二手交易网站(论文+源码获取链接)
Set set detailed explanation
C语言初阶——实现扫雷游戏
Trust guessing numbers game
Codeforces Round #803 (Div. 2)vp
College community management system based on boot+jsp (with source code download link)
输入一个表达式(用字符串表示),求这个表达式的值。
excel高级绘图技巧100讲(一)-用甘特图来展示项目进度情况
Multi table operation - foreign key cascade operation
2022.6.30-----leetcode.1175
【考研高数 自用】高数第一章基础阶段思维导图