From c020875b170c7f02978b33dac83937fd59510eee Mon Sep 17 00:00:00 2001 From: Liu Changyu Date: Sat, 1 Aug 2020 01:52:45 +0800 Subject: [PATCH] PyTorch 1.6.0 update with native AMP (#573) * PyTorch have Automatic Mixed Precision (AMP) Training. * Fixed the problem of inconsistent code length indentation * Fixed the problem of inconsistent code length indentation * Mixed precision training is turned on by default --- train.py | 80 ++++++++++++++++++++------------------------ utils/torch_utils.py | 4 +-- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/train.py b/train.py index 513abb0..b92e3e7 100644 --- a/train.py +++ b/train.py @@ -5,6 +5,7 @@ import torch.nn.functional as F import torch.optim as optim import torch.optim.lr_scheduler as lr_scheduler import torch.utils.data +from torch.cuda import amp from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -14,13 +15,6 @@ from utils import google_utils from utils.datasets import * from utils.utils import * -mixed_precision = True -try: # Mixed precision training https://github.com/NVIDIA/apex - from apex import amp -except: - print('Apex recommended for faster mixed precision training: https://github.com/NVIDIA/apex') - mixed_precision = False # not installed - # Hyperparameters hyp = {'optimizer': 'SGD', # ['adam', 'SGD', None] if none, default is SGD 'lr0': 0.01, # initial learning rate (SGD=1E-2, Adam=1E-3) @@ -63,6 +57,7 @@ def train(hyp, tb_writer, opt, device): yaml.dump(vars(opt), f, sort_keys=False) # Configure + cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict @@ -113,7 +108,7 @@ def train(hyp, tb_writer, opt, device): optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 - + # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) @@ -160,16 +155,12 @@ def train(hyp, tb_writer, opt, device): del ckpt - # Mixed precision training https://github.com/NVIDIA/apex - if mixed_precision: - model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) - # DP mode - if device.type != 'cpu' and rank == -1 and torch.cuda.device_count() > 1: + if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm - if opt.sync_bn and device.type != 'cpu' and rank != -1: + if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) print('Using SyncBatchNorm()') @@ -177,7 +168,7 @@ def train(hyp, tb_writer, opt, device): ema = torch_utils.ModelEMA(model) if rank in [-1, 0] else None # DDP mode - if device.type != 'cpu' and rank != -1: + if cuda and rank != -1: model = DDP(model, device_ids=[rank], output_device=rank) # Trainloader @@ -223,6 +214,7 @@ def train(hyp, tb_writer, opt, device): maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move + scaler = amp.GradScaler(enabled=cuda) if rank in [0, -1]: print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % dataloader.num_workers) @@ -232,15 +224,14 @@ def train(hyp, tb_writer, opt, device): model.train() # Update image weights (optional) - # When in DDP mode, the generated indices will be broadcasted to synchronize dataset. if dataset.image_weights: - # Generate indices. + # Generate indices if rank in [-1, 0]: w = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx - # Broadcast. + # Broadcast if DDP if rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if rank == 0: @@ -263,7 +254,7 @@ def train(hyp, tb_writer, opt, device): optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) - imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 + imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: @@ -284,27 +275,26 @@ def train(hyp, tb_writer, opt, device): ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) - # Forward - pred = model(imgs) + # Autocast + with amp.autocast(): + # Forward + pred = model(imgs) - # Loss - loss, loss_items = compute_loss(pred, targets.to(device), model) # scaled by batch_size - if rank != -1: - loss *= opt.world_size # gradient averaged between devices in DDP mode - if not torch.isfinite(loss): - print('WARNING: non-finite loss, ending training ', loss_items) - return results + # Loss + loss, loss_items = compute_loss(pred, targets.to(device), model) # scaled by batch_size + if rank != -1: + loss *= opt.world_size # gradient averaged between devices in DDP mode + # if not torch.isfinite(loss): + # print('WARNING: non-finite loss, ending training ', loss_items) + # return results # Backward - if mixed_precision: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() + scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: - optimizer.step() + scaler.step(optimizer) # optimizer.step + scaler.update() optimizer.zero_grad() if ema is not None: ema.update(model) @@ -312,7 +302,7 @@ def train(hyp, tb_writer, opt, device): # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses - mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) + mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ( '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) @@ -330,7 +320,7 @@ def train(hyp, tb_writer, opt, device): # Scheduler scheduler.step() - # Only the first process in DDP mode is allowed to log or save checkpoints. + # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema is not None: @@ -377,7 +367,7 @@ def train(hyp, tb_writer, opt, device): # Save last, best and delete torch.save(ckpt, last) - if best_fitness == fi: + if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- @@ -429,10 +419,12 @@ if __name__ == '__main__': parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') opt = parser.parse_args() + # Resume last = get_latest_run() if opt.resume == 'get_last' else opt.resume # resume from most recent run if last and not opt.weights: print(f'Resuming training from {last}') opt.weights = last if opt.resume and not opt.weights else opt.weights + if opt.local_rank in [-1, 0]: check_git_status() opt.cfg = check_file(opt.cfg) # check file @@ -442,21 +434,20 @@ if __name__ == '__main__': with open(opt.hyp) as f: hyp.update(yaml.load(f, Loader=yaml.FullLoader)) # update hyps opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) - device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size) + device = torch_utils.select_device(opt.device, batch_size=opt.batch_size) opt.total_batch_size = opt.batch_size opt.world_size = 1 - if device.type == 'cpu': - mixed_precision = False - elif opt.local_rank != -1: - # DDP mode + + # DDP mode + if opt.local_rank != -1: assert torch.cuda.device_count() > opt.local_rank torch.cuda.set_device(opt.local_rank) device = torch.device("cuda", opt.local_rank) dist.init_process_group(backend='nccl', init_method='env://') # distributed backend - opt.world_size = dist.get_world_size() assert opt.batch_size % opt.world_size == 0, "Batch size is not a multiple of the number of devices given!" opt.batch_size = opt.total_batch_size // opt.world_size + print(opt) # Train @@ -466,11 +457,12 @@ if __name__ == '__main__': tb_writer = SummaryWriter(log_dir=increment_dir('runs/exp', opt.name)) else: tb_writer = None + train(hyp, tb_writer, opt, device) # Evolve hyperparameters (optional) else: - assert opt.local_rank == -1, "DDP mode currently not implemented for Evolve!" + assert opt.local_rank == -1, 'DDP mode not implemented for --evolve' tb_writer = None opt.notest, opt.nosave = True, True # only test/save final epoch diff --git a/utils/torch_utils.py b/utils/torch_utils.py index 51ef984..f61d29e 100644 --- a/utils/torch_utils.py +++ b/utils/torch_utils.py @@ -22,7 +22,7 @@ def init_seeds(seed=0): cudnn.benchmark = True -def select_device(device='', apex=False, batch_size=None): +def select_device(device='', batch_size=None): # device = 'cpu' or '0' or '0,1,2,3' cpu_request = device.lower() == 'cpu' if device and not cpu_request: # if device requested other than 'cpu' @@ -36,7 +36,7 @@ def select_device(device='', apex=False, batch_size=None): if ng > 1 and batch_size: # check that batch_size is compatible with device_count assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (batch_size, ng) x = [torch.cuda.get_device_properties(i) for i in range(ng)] - s = 'Using CUDA ' + ('Apex ' if apex else '') # apex for mixed precision https://github.com/NVIDIA/apex + s = 'Using CUDA ' for i in range(0, ng): if i == 1: s = ' ' * len(s)