Multi card server usage
2022-06-30 02:41:00 【MallocLu】
Reference article: https://blog.csdn.net/qq_42255269/article/details/123427094?spm=1001.2014.3001.5506
The sample program
The simplest usage: a VGG16 model performing 10-class classification on the CIFAR10 dataset.
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # Parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    args = parser.parse_args()

    device = 'cuda'

    # Prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size)

    # Create the network model
    net = MyNet().to(device=device)
    # Loss function
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            imgs = imgs.to(device=device)
            targets = targets.to(device=device)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # Optimize the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))
Single-card machine, single-card run
python main.py --batch_size 16
Multi-card machine, single-card run
# Run on card 0
CUDA_VISIBLE_DEVICES=0 python main.py --batch_size 16
# Run on card 1
CUDA_VISIBLE_DEVICES=1 python main.py --batch_size 16
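As a hedged alternative (my addition, not from the reference article), the card can also be chosen inside the script by editing the device in the sample program, instead of masking GPUs with CUDA_VISIBLE_DEVICES:

# Select physical GPU 1 from inside the script.
# Note: do not combine this with CUDA_VISIBLE_DEVICES=1, which remaps that card to cuda:0.
device = 'cuda:1'
torch.cuda.set_device(1)  # optional: also make it the default CUDA device
net = MyNet().to(device=device)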
Multi-card machine, multi-card run
nn.DataParallel (not recommended)
Advantage: easy to use; only minor code changes are needed (3 modifications).
Disadvantage: it only improves training speed, not the usable batch_size (each batch must first be loaded onto the master GPU before being scattered to the other cards, so the master GPU's memory still caps the batch_size).
# batch_size should be an integer multiple of the number of GPUs
python main.py --batch_size 512
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # Parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    args = parser.parse_args()

    # Modification 1: list the GPUs to use; data is gathered on the first one
    gpus = [0, 1]
    device = 'cuda:{}'.format(gpus[0])

    # Prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size)

    # Modification 2: wrap the model with nn.DataParallel
    net = nn.DataParallel(MyNet().to(device=device), device_ids=gpus, output_device=gpus[0])
    # Loss function
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            # Modification 3: move the batch to the master GPU (non_blocking speeds up the copy)
            imgs = imgs.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # Optimize the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))
torch.distributed (recommended)
Advantage: improves both training speed and the usable batch_size.
Disadvantage: slightly more involved to use, and the DataLoader's shuffle option cannot be used (sampling is delegated to DistributedSampler; see the note after the new-version code below).
The new version: torch.distributed.run
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.run --nproc_per_node=2 main.py --batch_size 512
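On recent PyTorch versions the same launcher is also exposed as the torchrun command, so the following should be equivalent:

CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 main.py --batch_size 512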
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch
import torch.distributed as dist
import os


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # Parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    args = parser.parse_args()

    # Modification 1: the launcher sets LOCAL_RANK for each spawned process
    local_rank = int(os.environ["LOCAL_RANK"])
    # Modification 2: initialize the process group; NCCL is the backend used for communication between GPUs
    dist.init_process_group(backend='nccl')
    # Modification 3: each process drives its own GPU
    device = 'cuda:{}'.format(local_rank)

    # Prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    # Modification 4: use DistributedSampler; do not set shuffle=True on the DataLoader
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size, sampler=train_sampler)

    # Modification 5: wrap the model with DistributedDataParallel
    net = nn.parallel.DistributedDataParallel(MyNet().to(device), device_ids=[local_rank])
    # Loss function
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            # Modification 6: move the batch to this process's GPU
            imgs = imgs.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # Optimize the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))
The old version: torch.distributed.launch
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 main.py --batch_size 512
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch
import torch.distributed as dist


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # Parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    # Modification 1: torch.distributed.launch passes --local_rank to each process
    parser.add_argument('--local_rank', default=-1, type=int,
                        help='node rank for distributed training')
    args = parser.parse_args()

    # Modification 2: initialize the process group; NCCL is the backend used for communication between GPUs
    dist.init_process_group(backend='nccl')
    # Modification 3: each process drives its own GPU
    device = 'cuda:{}'.format(args.local_rank)

    # Prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    # Modification 4: use DistributedSampler; do not set shuffle=True on the DataLoader
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size, sampler=train_sampler)

    # Modification 5: wrap the model with DistributedDataParallel
    net = nn.parallel.DistributedDataParallel(MyNet().to(device), device_ids=[args.local_rank])
    # Loss function
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            # Modification 6: move the batch to this process's GPU
            imgs = imgs.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # Optimize the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))