PyTorch - Using more GPUs and increasing batch size makes training slower with DistributedDataParallel
I am trying to implement StyleGAN2. My code works well when I train on a single GPU, and I would like to speed up training by utilizing 8 GPUs with DistributedDataParallel. However, I noticed that using more GPUs does not speed up training at all; instead, it makes training slower.
I also tried modifying the batch size and noticed that batch_size = 8 trains the model fastest; increasing the batch size makes training significantly slower. (Note that the batch size I pass to each DataLoader is per process, so with 8 GPUs the global batch is 8 × batch_size.)
I measured the time for each epoch and found that every fourth epoch takes significantly longer (which happens to coincide with my lazy_gradient_penalty_interval of 4):
EP0_elapsed_time: 3.3021082878112793 sec
EP1_elapsed_time: 0.8542821407318115 sec
EP2_elapsed_time: 0.7720010280609131 sec
EP3_elapsed_time: 7.11009407043457 sec
EP4_elapsed_time: 0.7670211791992188 sec
EP5_elapsed_time: 0.7623276710510254 sec
EP6_elapsed_time: 0.7690849304199219 sec
EP7_elapsed_time: 7.0614259243011475 sec
EP8_elapsed_time: 0.7806422710418701 sec
EP9_elapsed_time: 0.7751979827880859 sec
EP10_elapsed_time: 0.7685496807098389 sec
EP11_elapsed_time: 7.09734845161438 sec
EP12_elapsed_time: 0.7923364639282227 sec
EP13_elapsed_time: 0.7789566516876221 sec
EP14_elapsed_time: 0.7974681854248047 sec
EP15_elapsed_time: 7.120237350463867 sec
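As an aside on how these numbers were taken: when timing individual ranks, pending GPU kernels and rank skew can both distort the measurement. A minimal sketch of the pattern, assuming an already-initialized process group (the helper name timed is my own, not part of my training code):

import time

import torch
import torch.distributed as dist

# Sketch: time a callable so that rank skew and pending GPU work
# are both accounted for. Assumes dist.init_process_group() was called.
def timed(fn):
    dist.barrier()              # align all ranks before starting the clock
    torch.cuda.synchronize()    # flush kernels already queued on this device
    start = time.time()
    result = fn()
    torch.cuda.synchronize()    # wait for this rank's GPU work to finish
    dist.barrier()              # wait for the slowest rank
    return result, time.time() - start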
I found a similar post, which has not been solved: No speedup doing multi GPU training with DistributedDataParallel vs. single GPU
How can I solve this issue?
main()
import argparse
import os

import torch.multiprocessing as mp


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=1, type=int, help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int, help='rank of this node among the nodes')
    parser.add_argument('--epochs', default=400, type=int, metavar='N', help='number of total epochs to run')
    parser.add_argument('--model_dir', default='stylegan2ada_002', type=str, help='model dir name')
    parser.add_argument('--train_img_dir_path', default='./GAN/clean_2', type=str, help='training images dir path')
    parser.add_argument('--img_size', default=64, type=int, help='target image size')
    parser.add_argument('--batch_size', default=32, type=int, help='batch size per process')
    parser.add_argument('--g_latent_dim', default=512, type=int, help='dim of generator noise z and w')
    parser.add_argument('--mn_num_layers', default=8, type=int, help='number of layers in the mapping network (8 according to the paper)')
    parser.add_argument('--g_lr', default=1e-3, type=float, help='generator learning rate')
    parser.add_argument('--d_lr', default=1e-3, type=float, help='discriminator learning rate')
    parser.add_argument('--mn_lr', default=1e-5, type=float, help='mapping network learning rate')
    # type=tuple would split a command-line value into single characters;
    # nargs=2 with type=float parses the two betas the Adam optimizers expect
    parser.add_argument('--adam_betas', default=(0.0, 0.99), type=float, nargs=2, help='betas of the Adam optimizers')
    parser.add_argument('--gradient_accumulate_steps', default=1, type=int, help='gradient accumulation steps')
    parser.add_argument('--lazy_gradient_penalty_interval', default=4, type=int, help='lazy gradient penalty interval')
    parser.add_argument('--lazy_path_penalty_after', default=5000, type=int, help='step after which the lazy path penalty is applied')
    parser.add_argument('--lazy_path_penalty_interval', default=32, type=int, help='lazy path penalty interval')
    parser.add_argument('--gradient_penalty_coefficient', default=10., type=float, help='gradient penalty coefficient')
    parser.add_argument('--style_mixing_prob', default=0.9, type=float, help='style mixing probability')
    parser.add_argument('--generate_img_interval', default=100, type=int, help='generate images every x epochs')
    parser.add_argument('--generate_img_after_percent', default=0.4, type=float, help='generate images after this fraction of the total epochs')
    args = parser.parse_args()
    args.world_size = args.gpus * args.nodes
    args.distributed = True
    args.dist_backend = 'nccl'
    args.dist_url = 'env://'
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '7788'
    # spawn one training process per GPU on this node
    mp.spawn(train, nprocs=args.gpus, args=(args,))
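(With these arguments, a single-node 8-GPU run is launched as something like python train.py -g 8 --batch_size 8, where train.py is a placeholder name for the script containing main().)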
train()
import time
from math import log2

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn

# ImageDataset, Discriminator, Generator, MappingNetwork, the losses,
# the penalties and step() come from my own project modules


def train(gpu, args):
    # global rank = node index * gpus per node + local gpu index
    rank = args.nr * args.gpus + gpu
    torch.cuda.set_device(gpu)
    dist.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=rank,
    )
    dist.barrier()
    # Measure time
    start_init = time.time()
    # Make dirs if they do not exist yet (exist_ok avoids a race between the spawned processes)
    model_dir_path = f'./{args.model_dir}'
    os.makedirs(model_dir_path, exist_ok=True)
    img_dir_path = f'./{args.model_dir}/images'
    os.makedirs(img_dir_path, exist_ok=True)
    ckpt_dir_path = f'./{args.model_dir}/checkpoints'
    os.makedirs(ckpt_dir_path, exist_ok=True)
    # Dataset and Dataloader
    ### Create the dataset
    dataset = ImageDataset(path=args.train_img_dir_path, image_size=args.img_size)
    ### Create the distributed sampler to shard the dataset across processes
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset,
        num_replicas=args.world_size,
        rank=rank,
    )
    ### Create the dataloader (shuffle=False because the sampler shuffles)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=0,
        shuffle=False,
        drop_last=True,
        pin_memory=True,
        sampler=sampler,
    )
    # Initialization
    ### Setup gpu device (local index; equals the global rank on a single node)
    device = torch.device('cuda', gpu)
    ### Get log2 of the target image size
    log_resolution = log2(args.img_size)
    ### Create discriminator
    discriminator = Discriminator(log_resolution)
    ### Put the discriminator on the device
    discriminator.to(device)
    ### Wrap with DDP
    discriminator = nn.parallel.DistributedDataParallel(discriminator, device_ids=[gpu])
    ### Create discriminator loss
    discriminator_loss = DiscriminatorLoss().to(device)
    ### Create discriminator optimizer
    discriminator_optimizer = torch.optim.Adam(
        discriminator.parameters(),
        lr=args.d_lr,
        betas=args.adam_betas,
    )
    ### Create gradient penalty (gp) loss
    gradient_penalty = GradientPenalty()
    ### Create generator
    generator = Generator(device, log_resolution, args.g_latent_dim, args.style_mixing_prob)
    ### Put the generator on the device
    generator.to(device)
    ### Wrap with DDP
    generator = nn.parallel.DistributedDataParallel(generator, device_ids=[gpu])
    ### Create generator loss
    generator_loss = GeneratorLoss().to(device)
    ### Create generator optimizer
    generator_optimizer = torch.optim.Adam(
        generator.parameters(),
        lr=args.g_lr,
        betas=args.adam_betas,
    )
    ### Create path length penalty (PLP) loss
    path_length_penalty = PathLengthPenalty(0.99).to(device)
    ### Create mapping network
    mapping_network = MappingNetwork(args.g_latent_dim, args.mn_num_layers)
    ### Put the mapping network on the device
    mapping_network.to(device)
    ### Wrap with DDP
    mapping_network = nn.parallel.DistributedDataParallel(mapping_network, device_ids=[gpu])
    ### Create mapping network optimizer
    mapping_network_optimizer = torch.optim.Adam(
        mapping_network.parameters(),
        lr=args.mn_lr,
        betas=args.adam_betas,
    )
    generate_img_after = int(args.epochs * args.generate_img_after_percent)
    # Measure time
    torch.cuda.synchronize()
    end_init = time.time()
    init_time = end_init - start_init
    print(f'Init_time: {init_time} sec')
    # Training steps and losses tracking
    disc_loss_y = []
    gen_loss_y = []
    # Measure time
    times = []
    for i in range(args.epochs):
        sampler.set_epoch(i)  # let the sampler reshuffle with a different seed each epoch
        start_epoch = time.time()
        disc_loss, gen_loss = step(
            i,
            device,
            args.batch_size,
            dataloader,
            args.gradient_accumulate_steps,
            args.style_mixing_prob,
            discriminator,
            discriminator_loss,
            discriminator_optimizer,
            gradient_penalty,
            args.gradient_penalty_coefficient,
            args.lazy_gradient_penalty_interval,
            generator,
            generator_loss,
            generator_optimizer,
            path_length_penalty,
            args.g_latent_dim,
            args.lazy_path_penalty_after,
            args.lazy_path_penalty_interval,
            mapping_network,
            mapping_network_optimizer,
            args.model_dir,
            args.generate_img_interval,
            generate_img_after,
        )
        # Measure time (synchronize so pending GPU work is included in the timing)
        torch.cuda.synchronize()
        end_epoch = time.time()
        elapsed = end_epoch - start_epoch
        times.append(elapsed)
        print(f'EP{i}_elapsed_time: {elapsed} sec')
        ### Append the losses of each step to the lists
        disc_loss_y.append(disc_loss)
        gen_loss_y.append(gen_loss)
    # Measure time
    avg_time = sum(times) / args.epochs
    print(f'avg_time: {avg_time} sec')
    ### Plot the losses
    epoch_x = np.linspace(1, args.epochs, args.epochs).astype(int)
    plt.plot(epoch_x, disc_loss_y, label='disc_loss')
    plt.plot(epoch_x, gen_loss_y, label='gen_loss')
    plt.legend()
    plt.savefig(f'{img_dir_path}/loss.png')
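One thing I have not ruled out is input loading: with num_workers=0, every rank decodes its images on the training process itself, so data loading is serialized with the GPU work. A variant of the loader with background workers that I could try (the worker count is a guess, not something I have tuned):

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=4,               # guess; would need tuning per machine
        persistent_workers=True,     # keep workers alive between epochs
        shuffle=False,               # shuffling is handled by the DistributedSampler
        drop_last=True,
        pin_memory=True,
        sampler=sampler,
    )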
parallel-processing
pytorch
distributed-system