Multi-GPU server usage
2022-06-30 02:41:00 【MallocLu】
Reference article: https://blog.csdn.net/qq_42255269/article/details/123427094?spm=1001.2014.3001.5506
Sample program
The simplest example: use a VGG16 model to perform 10-class classification on the CIFAR10 dataset.
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    args = parser.parse_args()

    device = 'cuda'

    # prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size)

    # build the model
    net = MyNet().to(device=device)
    # loss function
    loss_fn = nn.CrossEntropyLoss()
    # optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            imgs = imgs.to(device=device)
            targets = targets.to(device=device)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # back-propagate and update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))
Single-GPU machine, single-GPU run
python main.py --batch_size 16
Multi-GPU machine, single-GPU run
# run on card 0
CUDA_VISIBLE_DEVICES=0 python main.py --batch_size 16
# run on card 1
CUDA_VISIBLE_DEVICES=1 python main.py --batch_size 16
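A quick optional check that only the selected card is visible to PyTorch (torch.cuda.device_count only counts the devices left visible by CUDA_VISIBLE_DEVICES):
CUDA_VISIBLE_DEVICES=1 python -c "import torch; print(torch.cuda.device_count())"  # prints 1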
Multi-GPU machine, multi-GPU run
nn.DataParallel (not recommended)
Advantage: easy to use; only minor code changes are needed (3 modifications).
Shortcoming: it only improves training speed, not the maximum batch_size (each batch is first loaded onto the master GPU and then scattered to the other cards, so the master GPU's memory still limits the batch_size).
# batch_size should be an integer multiple of the number of GPUs
python main.py --batch_size 512
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    args = parser.parse_args()

    # modification 1: choose the GPUs to use; the first one is the master GPU
    gpus = [0, 1]
    device = 'cuda:{}'.format(gpus[0])

    # prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size)

    # modification 2: wrap the model with nn.DataParallel
    net = nn.DataParallel(MyNet().to(device=device), device_ids=gpus, output_device=gpus[0])
    # loss function
    loss_fn = nn.CrossEntropyLoss()
    # optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            # modification 3: move the data to the master GPU (non_blocking speeds up the host-to-device copy)
            imgs = imgs.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # back-propagate and update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))
torch.distributed (recommended)
Advantage: improves both training speed and the achievable batch_size.
Shortcoming: slightly more complicated to use; shuffle must not be set on the DataLoader, shuffling is handled by the DistributedSampler instead (see the sketch after the code below).
New version: torch.distributed.run
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.run --nproc_per_node=2 main.py --batch_size 512
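In recent PyTorch releases the same launcher is also exposed as the torchrun command, so the following should be equivalent:
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 main.py --batch_size 512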
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch
import torch.distributed as dist
import os


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    args = parser.parse_args()

    # modification 1: read the local rank that torch.distributed.run sets for each process
    local_rank = int(os.environ["LOCAL_RANK"])
    # modification 2: initialize the process group (backend used for inter-GPU communication)
    dist.init_process_group(backend='nccl')
    # modification 3: each process works on its own GPU
    device = 'cuda:{}'.format(local_rank)

    # prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    # modification 4: use a DistributedSampler; do not set shuffle=True on the DataLoader
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size, sampler=train_sampler)

    # modification 5: wrap the model with DistributedDataParallel
    net = nn.parallel.DistributedDataParallel(MyNet().to(device), device_ids=[local_rank])
    # loss function
    loss_fn = nn.CrossEntropyLoss()
    # optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            # modification 6: move the data to this process's GPU
            imgs = imgs.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # back-propagate and update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))
Old version: torch.distributed.launch
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 main.py --batch_size 512
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch
import torch.distributed as dist


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    # modification 1: torch.distributed.launch passes --local_rank to every process
    parser.add_argument('--local_rank', default=-1, type=int,
                        help='node rank for distributed training')
    args = parser.parse_args()

    # modification 2: initialize the process group (backend used for inter-GPU communication)
    dist.init_process_group(backend='nccl')
    # modification 3: each process works on its own GPU
    device = 'cuda:{}'.format(args.local_rank)

    # prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    # modification 4: use a DistributedSampler; do not set shuffle=True on the DataLoader
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size, sampler=train_sampler)

    # modification 5: wrap the model with DistributedDataParallel
    net = nn.parallel.DistributedDataParallel(MyNet().to(device), device_ids=[args.local_rank])
    # loss function
    loss_fn = nn.CrossEntropyLoss()
    # optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            # modification 6: move the data to this process's GPU
            imgs = imgs.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # back-propagate and update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))