From 16f6834486a2a15a1b25c48042eeb1d8ce3841f8 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 8 Jul 2020 14:23:34 -0700
Subject: [PATCH] update train.py and experimental.py

---
 models/experimental.py |  5 ++++-
 train.py               | 22 ++++++++++------------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/models/experimental.py b/models/experimental.py
index 32a88f2..a22f6bb 100644
--- a/models/experimental.py
+++ b/models/experimental.py
@@ -119,7 +119,10 @@ class Ensemble(nn.ModuleList):
         y = []
         for module in self:
             y.append(module(x, augment)[0])
-        return torch.cat(y, 1), None  # ensembled inference output, train output
+        # y = torch.stack(y).max(0)[0]  # max ensemble
+        # y = torch.cat(y, 1)  # nms ensemble
+        y = torch.stack(y).mean(0)  # mean ensemble
+        return y, None  # inference, train output
 
 
 def attempt_load(weights, map_location=None):
diff --git a/train.py b/train.py
index 61ed84e..c97e96b 100644
--- a/train.py
+++ b/train.py
@@ -101,11 +101,13 @@ def train(hyp):
                           optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
     optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
     optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
+    print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
+    del pg0, pg1, pg2
+
     # Scheduler https://arxiv.org/pdf/1812.01187.pdf
     lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1  # cosine
     scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
-    print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
-    del pg0, pg1, pg2
+    # plot_lr_scheduler(optimizer, scheduler, epochs)
 
     # Load Model
     google_utils.attempt_download(weights)
@@ -147,12 +149,7 @@ def train(hyp):
 
     if mixed_precision:
         model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
-
-    scheduler.last_epoch = start_epoch - 1  # do not move
-    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
-    # plot_lr_scheduler(optimizer, scheduler, epochs)
-
-    # Initialize distributed training
+    # Distributed training
     if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
         dist.init_process_group(backend='nccl',  # distributed backend
                                 init_method='tcp://127.0.0.1:9999',  # init method
@@ -198,9 +195,10 @@ def train(hyp):
     # Start training
     t0 = time.time()
     nb = len(dataloader)  # number of batches
-    n_burn = max(3 * nb, 1e3)  # burn-in iterations, max(3 epochs, 1k iterations)
+    nw = max(3 * nb, 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
     maps = np.zeros(nc)  # mAP per class
     results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
+    scheduler.last_epoch = start_epoch - 1  # do not move
     print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
     print('Using %g dataloader workers' % dataloader.num_workers)
     print('Starting training for %g epochs...' % epochs)
@@ -225,9 +223,9 @@ def train(hyp):
             ni = i + nb * epoch  # number integrated batches (since train start)
             imgs = imgs.to(device).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
 
-            # Burn-in
-            if ni <= n_burn:
-                xi = [0, n_burn]  # x interp
+            # Warmup
+            if ni <= nw:
+                xi = [0, nw]  # x interp
                 # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                 accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                 for j, x in enumerate(optimizer.param_groups):
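
Notes:

The models/experimental.py hunk makes the mean ensemble the default in Ensemble.forward(), keeping the max and nms variants as commented-out alternatives. A minimal standalone sketch of how the three strategies combine per-model outputs, assuming each model's inference output is a tensor of shape (batch, boxes, outputs) and using random tensors as stand-ins for real model outputs:

    import torch

    torch.manual_seed(0)
    batch, boxes, outputs = 2, 100, 85                         # hypothetical YOLO-style shapes
    y = [torch.rand(batch, boxes, outputs) for _ in range(3)]  # outputs of 3 stand-in models

    y_mean = torch.stack(y).mean(0)   # mean ensemble: average predictions, shape unchanged
    y_max = torch.stack(y).max(0)[0]  # max ensemble: elementwise maximum, shape unchanged
    y_nms = torch.cat(y, 1)           # nms ensemble: concatenate boxes, 3x candidates for NMS

    print(y_mean.shape, y_max.shape, y_nms.shape)
    # torch.Size([2, 100, 85]) torch.Size([2, 100, 85]) torch.Size([2, 300, 85])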
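
The first train.py hunk moves the optimizer-group printout ahead of the scheduler setup. The scheduler itself is the cosine schedule referenced in the comment (https://arxiv.org/pdf/1812.01187.pdf): the lambda scales the base learning rate by a factor that decays from 1.0 at epoch 0 to 0.1 at the final epoch. A quick check of those endpoints, with a hypothetical epoch count:

    import math

    epochs = 300  # hypothetical total epochs
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1  # cosine

    for e in (0, epochs // 2, epochs):
        print(e, round(lf(e), 3))  # -> 0 1.0 / 150 0.55 / 300 0.1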
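
The last two hunks rename burn-in to warmup (n_burn -> nw) and defer scheduler.last_epoch = start_epoch - 1 until just before the training loop. During warmup, np.interp ramps values linearly over the first nw iterations; for example, gradient accumulation grows from 1 toward the nominal batch ratio nbs / batch_size. A sketch of that ramp with hypothetical sizes (nb, nbs and batch_size are illustrative, not the repo's values):

    import numpy as np

    nb = 500                   # batches per epoch (hypothetical)
    nw = max(3 * nb, 1e3)      # warmup iterations: max(3 epochs, 1k iterations)
    nbs, batch_size = 64, 16   # nominal vs. actual batch size (hypothetical)

    for ni in (0, 750, int(nw)):  # integrated batch count since training start
        accumulate = max(1, np.interp(ni, [0, nw], [1, nbs / batch_size]).round())
        print(ni, accumulate)     # -> 0 1 / 750 2.0 / 1500 4.0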