Multi-GPU server usage
2022-06-30 02:41:00 【MallocLu】
Reference: https://blog.csdn.net/qq_42255269/article/details/123427094?spm=1001.2014.3001.5506
Sample program
The simplest example: use a VGG16 model to perform 10-class classification on the CIFAR10 dataset.
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # Parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    args = parser.parse_args()

    device = 'cuda'

    # Prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size)

    # Build the model
    net = MyNet().to(device=device)
    # Loss function
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            imgs = imgs.to(device=device)
            targets = targets.to(device=device)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # Backpropagate and update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))
Single-GPU machine, running on a single GPU

python main.py --batch_size 16
Multi-GPU machine, running on a single GPU

# Run on card 0
CUDA_VISIBLE_DEVICES=0 python main.py --batch_size 16
# Run on card 1
CUDA_VISIBLE_DEVICES=1 python main.py --batch_size 16
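
As a quick sanity check, you can confirm which cards a process actually sees after setting CUDA_VISIBLE_DEVICES. This is a minimal sketch of my own (not from the reference article), assuming only that PyTorch and a CUDA driver are installed; note that the visible cards are renumbered starting from 0 inside the process.

import torch

print(torch.cuda.is_available())      # True if at least one GPU is visible
print(torch.cuda.device_count())      # number of GPUs visible to this process (1 in the examples above)
print(torch.cuda.get_device_name(0))  # name of visible device 0, i.e. the card selected by CUDA_VISIBLE_DEVICES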
Multi-GPU machine, running on multiple GPUs

nn.DataParallel (not recommended)
Advantages: easy to use; only minor code changes (3 modifications, marked in the code below).
Disadvantages: it mainly improves training speed rather than the achievable batch_size (the whole batch is first loaded onto the master GPU and scattered from there, so the master GPU's memory still limits the batch_size).

# batch_size should be an integer multiple of the number of GPUs
python main.py --batch_size 512
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # Parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    args = parser.parse_args()

    # Modification 1: choose the GPUs to use; the first one acts as the master GPU
    gpus = [0, 1]
    device = 'cuda:{}'.format(gpus[0])

    # Prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size)

    # Modification 2: wrap the model with nn.DataParallel
    net = nn.DataParallel(MyNet().to(device=device), device_ids=gpus, output_device=gpus[0])
    # Loss function
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            # Modification 3: move the data to the master GPU
            imgs = imgs.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # Backpropagate and update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))
torch.distributed (recommended)
Advantages: improves both training speed and the achievable batch_size.
Disadvantages: slightly more complicated to use, and the DataLoader's shuffle option cannot be used (shuffling is handled by the DistributedSampler instead).

New version (torch.distributed.run)

CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.run --nproc_per_node=2 main.py --batch_size 512
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch
import torch.distributed as dist
import os


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # Parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    args = parser.parse_args()

    # Modification 1: read the local rank assigned by the launcher
    local_rank = int(os.environ["LOCAL_RANK"])
    # Modification 2: initialize the process group (NCCL backend for GPU-to-GPU communication)
    dist.init_process_group(backend='nccl')
    # Modification 3: each process uses its own GPU
    device = 'cuda:{}'.format(local_rank)

    # Prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    # Modification 4: use a DistributedSampler; do not set shuffle=True on the DataLoader
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size, sampler=train_sampler)

    # Modification 5: wrap the model with DistributedDataParallel
    net = nn.parallel.DistributedDataParallel(MyNet().to(device), device_ids=[local_rank])
    # Loss function
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            # Modification 6: move the data to this process's GPU
            imgs = imgs.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # Backpropagate and update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))
Old version: torch.distributed.launch (it passes --local_rank to the script as a command-line argument; newer PyTorch releases deprecate it in favor of torchrun / torch.distributed.run)

CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 main.py --batch_size 512
from torch import nn
import torchvision
import argparse
from torch.utils.data import DataLoader
import torch
import torch.distributed as dist


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.vgg16 = torchvision.models.vgg16()
        self.fc = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.vgg16(x)
        out = self.fc(out)
        return out


if __name__ == '__main__':
    # Parse the batch_size argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int)
    # Modification 1: torch.distributed.launch passes --local_rank to every process
    parser.add_argument('--local_rank', default=-1, type=int,
                        help='local process rank for distributed training')
    args = parser.parse_args()

    # Modification 2: initialize the process group (NCCL backend for GPU-to-GPU communication)
    dist.init_process_group(backend='nccl')
    # Modification 3: each process uses its own GPU
    device = 'cuda:{}'.format(args.local_rank)

    # Prepare the dataset
    train_data = torchvision.datasets.CIFAR10(root="data", train=True, transform=torchvision.transforms.ToTensor(), download=True)
    # Modification 4: use a DistributedSampler; do not set shuffle=True on the DataLoader
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size, sampler=train_sampler)

    # Modification 5: wrap the model with DistributedDataParallel
    net = nn.parallel.DistributedDataParallel(MyNet().to(device), device_ids=[args.local_rank])
    # Loss function
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

    for i in range(1000):
        for step, data in enumerate(train_dataloader):
            imgs, targets = data
            # Modification 6: move the data to this process's GPU
            imgs = imgs.to(device=device, non_blocking=True)
            targets = targets.to(device=device, non_blocking=True)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)
            # Backpropagate and update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("epoch:{} step:{} loss:{}".format(i, step, loss.item()))