PyTorch: Distributed Model Training
2022-08-01 13:52:00 [CyrusMay]
1. Data Parallelism
1.1 Single Machine, Single GPU
import torch
from torch import nn
import torch.nn.functional as F
import os

# restrict this process to a single GPU; must be set before the first CUDA call
# (can also be set in the shell: CUDA_VISIBLE_DEVICES="0")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

model = nn.Sequential(nn.Linear(in_features=10, out_features=20),
                      nn.ReLU(),
                      nn.Linear(in_features=20, out_features=2),
                      nn.Sigmoid())
data = torch.rand([100, 10])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(torch.cuda.is_available())

# select the device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# copy the model to the device
model.to(device)
# copy the data to the device
data = data.to(device)

# save the model
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict()}, "./model")

# load the model
checkpoint = torch.load("./model", map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
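The snippet above only covers device placement and checkpointing. A minimal training step built on the same objects might look like the following; the random targets and the MSE loss are placeholder assumptions for illustration, since the original code defines no labels:

# continues from the snippet above (uses model, data, optimizer, device, F)
targets = torch.rand([100, 2]).to(device)  # hypothetical labels for illustration

for epoch in range(100):
    optimizer.zero_grad()
    output = model(data)                # forward pass on the selected device
    loss = F.mse_loss(output, targets)  # placeholder loss
    loss.backward()                     # backpropagation
    optimizer.step()                    # parameter update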
1.2 Single Machine, Multiple GPUs
Code
import torch
import torch.nn.functional as F
from torch import nn
import os

# torchrun sets LOCAL_RANK for each process it spawns
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
device = torch.device("cuda", local_rank)

# number of visible GPUs
n_gpus = torch.cuda.device_count()

# initialize the process group; the backend is the communication library
# (nccl is the standard choice for GPU training)
torch.distributed.init_process_group(backend="nccl", init_method="env://")

dataset = torch.rand([1000, 10])
model = nn.Sequential(
    nn.Linear(in_features=10, out_features=20),
    nn.ReLU(),
    nn.Linear(in_features=20, out_features=2),
    nn.Sigmoid()
)
# copy the model to this process's GPU, then wrap it in DistributedDataParallel
model.to(device)
model = torch.nn.parallel.DistributedDataParallel(model,
                                                  device_ids=[local_rank],
                                                  output_device=local_rank)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# distributed sampler: each process sees its own shard of the dataset
sampler = torch.utils.data.distributed.DistributedSampler(dataset)

# build the dataloader
BATCH_SIZE = 128
dataloader = torch.utils.data.DataLoader(dataset=dataset,
                                         batch_size=BATCH_SIZE,
                                         num_workers=8,
                                         sampler=sampler)

for epoch in range(1000):
    sampler.set_epoch(epoch)  # gives each epoch a different shuffle
    for x in dataloader:
        x = x.to(device)
        output = model(x)
        # ... compute the loss, loss.backward(), optimizer.step() here ...

# save the model (only on rank 0, so the file is written once);
# note that the state dict is taken from model.module, the unwrapped model
if local_rank == 0:
    torch.save({
        "model_state_dict": model.module.state_dict()
    }, "./model")

# load the model, mapping tensors onto this process's device
checkpoint = torch.load("./model", map_location=device)
model.module.load_state_dict(checkpoint["model_state_dict"])
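Note that DistributedDataParallel averages gradients across all processes during backward(), so although every process trains on its own shard of the data, all model replicas stay synchronized.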
Launching the job from the terminal (replace n_gpus with the number of GPUs on the machine):
torchrun --nproc_per_node=n_gpus train.py
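torchrun spawns one process per GPU and sets the environment variables LOCAL_RANK, RANK, and WORLD_SIZE for each process, which is why the script reads os.environ["LOCAL_RANK"] instead of hard-coding a device. On a machine with 4 GPUs, for example:

torchrun --nproc_per_node=4 train.py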
1.3 Multiple Machines, Multiple GPUs
Code
The training script is identical to the single-machine multi-GPU script in section 1.2; only the way the job is launched differs.
Launching the job from the terminal
Run the command once on every node, setting --node_rank to that node's index (0 on the master node):
torchrun --nproc_per_node=n_gpus --nnodes=2 --node_rank=0 --master_addr="master node IP" --master_port="master node port" train.py
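For example, with two nodes of 4 GPUs each (the address and port below are placeholders):

# on node 0 (the master node)
torchrun --nproc_per_node=4 --nnodes=2 --node_rank=0 --master_addr=192.168.1.1 --master_port=29500 train.py
# on node 1
torchrun --nproc_per_node=4 --nnodes=2 --node_rank=1 --master_addr=192.168.1.1 --master_port=29500 train.py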
2. Model Parallelism
Omitted.
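As a minimal sketch of the idea, assuming a machine with two GPUs: the layers of a single model are placed on different devices, and activations are copied between them during the forward pass. The class name and layer sizes below are illustrative assumptions, not the author's code:

import torch
from torch import nn

class TwoGPUModel(nn.Module):  # hypothetical example, not from the original post
    def __init__(self):
        super().__init__()
        # first half of the network lives on GPU 0, second half on GPU 1
        self.part1 = nn.Sequential(nn.Linear(10, 20), nn.ReLU()).to("cuda:0")
        self.part2 = nn.Sequential(nn.Linear(20, 2), nn.Sigmoid()).to("cuda:1")

    def forward(self, x):
        x = self.part1(x.to("cuda:0"))
        return self.part2(x.to("cuda:1"))  # move activations to the second GPU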
by CyrusMay 2022 07 29