http://note.youdao.com/noteshare?id=3d48448b36fedd9c08d1f9f6193e94bc
torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

Setting pin_memory=True makes the DataLoader return each batch in page-locked (pinned) host memory rather than ordinary pageable RAM. This removes the extra staging copy from pageable memory into a pinned buffer that CUDA would otherwise perform before every host-to-GPU transfer, and it lets the copy run asynchronously, shaving off some data-transfer time. (The batch is not mapped directly into GPU memory; it still has to be copied, just faster.)
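A minimal sketch of how a pinned loader is typically consumed; the toy dataset is a stand-in for image_datasets[x], and non_blocking=True is an addition here (it makes the host-to-GPU copy asynchronous, which only pays off when the source batch is pinned):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Toy dataset standing in for image_datasets[x].
    dataset = TensorDataset(torch.randn(1024, 3, 32, 32),
                            torch.randint(0, 10, (1024,)))
    loader = DataLoader(dataset, batch_size=64, shuffle=True,
                        num_workers=8, pin_memory=True)

    device = torch.device('cuda:0')
    for images, labels in loader:
        # The copy can overlap with GPU compute because the batch is pinned.
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        # ... forward / backward as usual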
Data parallelism (torch.nn.DataParallel)

Every GPU holds an identical replica of the model but trains on a different slice of each batch. The data-parallel workflow is:

1. Designate a controller device, usually cuda:0.
2. Copy (broadcast) the model from the controller to every specified GPU.
3. Split the input batch evenly (scatter) and feed each chunk to the corresponding replica as its input.
4. Collect (gather) the replica outputs back on the controller device; the per-replica gradients are then combined by summation (reduced add). Per the PyTorch docs: "During the backwards pass, gradients from each replica are summed into the original module."
5. Update the parameters on the controller device (the replicas on the other devices are updated as well, via the broadcast on the next forward pass).

    net = torch.nn.DataParallel(model, device_ids=[0, 1, 2])
    output = net(input_var)  # input_var can be on any device, including CPU
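For reference, a self-contained sketch of one full DataParallel training step; the model, data, and hyperparameters below are illustrative stand-ins, not from the note (it assumes three visible GPUs):

    import torch
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(10, 64), nn.ReLU(), nn.Linear(64, 2)).to('cuda:0')
    net = torch.nn.DataParallel(model, device_ids=[0, 1, 2])
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.01)

    inputs = torch.randn(96, 10)                 # scattered into 3 chunks of 32
    targets = torch.randint(0, 2, (96,)).to('cuda:0')

    outputs = net(inputs)                        # broadcast -> scatter -> gather
    loss = criterion(outputs, targets)           # outputs land on cuda:0
    optimizer.zero_grad()
    loss.backward()                              # replica grads summed into model
    optimizer.step()                             # update on the controller device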
    # Pseudocode for one data-parallel training step
    def train_batch(data, k):
        split data into k parts
        for i = 1, ..., k:  # run in parallel
            compute grad_i w.r.t. weight_i using data_i on the i-th GPU
        grad = grad_1 + ... + grad_k
        for i = 1, ..., k:  # run in parallel
            copy grad to i-th GPU
            update weight_i by using grad
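The pseudocode maps almost one-to-one onto the primitives DataParallel itself is built from. A sketch under the assumption of two visible GPUs; because replicate keeps each replica tied to the original module's parameters through autograd, backward through the summed loss performs the grad = grad_1 + ... + grad_k step automatically:

    import torch
    import torch.nn as nn
    from torch.nn.parallel import replicate, scatter, parallel_apply

    devices = [0, 1]                               # k = 2 GPUs
    model = nn.Linear(10, 2).to('cuda:0')          # weights live on the controller
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    data = torch.randn(64, 10)
    target = torch.randint(0, 2, (64,))

    replicas = replicate(model, devices)           # broadcast the model to each GPU
    inputs = scatter(data, devices)                # split data into k parts
    targets = scatter(target, devices)
    outputs = parallel_apply(replicas, inputs)     # forward passes run in parallel
    loss = sum(criterion(o, t).to('cuda:0')        # gather + reduced add
               for o, t in zip(outputs, targets))

    optimizer.zero_grad()
    loss.backward()                                # grad_i summed into model's weights
    optimizer.step()                               # single update on the controller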
Model parallelism

Place different parts of the model on different GPUs and pass the intermediate activations from one GPU to the next; every part works on the same data. This addresses the case where a single model is too large to fit on one GPU. Note, however, that a naive split like this is slower than running on a single GPU: while one GPU computes its stage the other sits idle, and the activations must be copied between devices.

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.features_1 = nn.Sequential(
                nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3),
                nn.BatchNorm2d(16),
                nn.ReLU(inplace=True),  # 30
                ......
                nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3),
                nn.BatchNorm2d(128),
                nn.ReLU(inplace=True),  # 12
            ).to('cuda:0')
            self.features_2 = nn.Sequential(
                nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2),
                nn.BatchNorm2d(256),
                nn.ReLU(inplace=True),  # 5
                ......
            ).to('cuda:1')  # 1
            self.classifier = nn.Sequential(
                nn.Dropout(),
                ......
                nn.Linear(1024, class_num),
            ).to('cuda:1')

        def forward(self, x):
            out = self.features_1(x.to('cuda:0'))
            out = self.features_2(out.to('cuda:1'))
            out = out.view(-1, 384)
            out = self.classifier(out)
            out = F.softmax(out, dim=1)
            return out

    # With every sub-module placed explicitly, model = model.cuda() is no longer needed.
    net = Net()
    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001)

    for data in trainloader:
        images, labels = data
        images = images.to('cuda:0')   # input enters on the first GPU
        labels = labels.to('cuda:1')   # must be on the same GPU as the output
        optimizer.zero_grad()
        outputs = net(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
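The usual way to recover some of the lost speed is pipelining: split each batch into micro-batches so cuda:0 can already process the next chunk while cuda:1 is still working on the previous one. A sketch against the Net above (split_size and the flatten shape are illustrative assumptions; CUDA ops are asynchronous, so the two stages overlap):

    def pipelined_forward(net, x, split_size=32):
        splits = iter(x.split(split_size, dim=0))
        s_next = next(splits)
        # Prime the pipeline: run the first micro-batch through stage 1.
        s_prev = net.features_1(s_next.to('cuda:0')).to('cuda:1')
        outputs = []
        for s_next in splits:
            # Stage 2 on cuda:1 works on the previous chunk ...
            out = net.features_2(s_prev)
            outputs.append(net.classifier(out.view(out.size(0), -1)))
            # ... while stage 1 on cuda:0 starts on the next chunk.
            s_prev = net.features_1(s_next.to('cuda:0')).to('cuda:1')
        out = net.features_2(s_prev)  # drain the last chunk
        outputs.append(net.classifier(out.view(out.size(0), -1)))
        return torch.cat(outputs)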
Code: https://github.com/gongel/ML/blob/master/ex8/ex8_2.py