@@ -5,6 +5,7 @@ import torch.nn.functional as F
 import torch.optim as optim
 import torch.optim.lr_scheduler as lr_scheduler
 import torch.utils.data
+from torch.cuda import amp
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
@@ -14,13 +15,6 @@ from utils import google_utils
 from utils.datasets import *
 from utils.utils import *

-mixed_precision = True
-try:  # Mixed precision training https://github.com/NVIDIA/apex
-    from apex import amp
-except:
-    print('Apex recommended for faster mixed precision training: https://github.com/NVIDIA/apex')
-    mixed_precision = False  # not installed
-
 # Hyperparameters
 hyp = {'optimizer': 'SGD',  # ['adam', 'SGD', None] if none, default is SGD
        'lr0': 0.01,  # initial learning rate (SGD=1E-2, Adam=1E-3)
@@ -63,6 +57,7 @@ def train(hyp, tb_writer, opt, device):
         yaml.dump(vars(opt), f, sort_keys=False)

     # Configure
+    cuda = device.type != 'cpu'
     init_seeds(2 + rank)
     with open(opt.data) as f:
         data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
@@ -113,7 +108,7 @@ def train(hyp, tb_writer, opt, device):
     optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
     print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
     del pg0, pg1, pg2

     # Scheduler https://arxiv.org/pdf/1812.01187.pdf
     lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.8 + 0.2  # cosine
     scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
@@ -160,16 +155,12 @@ def train(hyp, tb_writer, opt, device):
         del ckpt

-    # Mixed precision training https://github.com/NVIDIA/apex
-    if mixed_precision:
-        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)
-
     # DP mode
-    if device.type != 'cpu' and rank == -1 and torch.cuda.device_count() > 1:
+    if cuda and rank == -1 and torch.cuda.device_count() > 1:
         model = torch.nn.DataParallel(model)

     # SyncBatchNorm
-    if opt.sync_bn and device.type != 'cpu' and rank != -1:
+    if opt.sync_bn and cuda and rank != -1:
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
         print('Using SyncBatchNorm()')
@@ -177,7 +168,7 @@ def train(hyp, tb_writer, opt, device):
     ema = torch_utils.ModelEMA(model) if rank in [-1, 0] else None

     # DDP mode
-    if device.type != 'cpu' and rank != -1:
+    if cuda and rank != -1:
         model = DDP(model, device_ids=[rank], output_device=rank)

     # Trainloader
@@ -223,6 +214,7 @@ def train(hyp, tb_writer, opt, device):
     maps = np.zeros(nc)  # mAP per class
     results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
     scheduler.last_epoch = start_epoch - 1  # do not move
+    scaler = amp.GradScaler(enabled=cuda)
     if rank in [0, -1]:
         print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
         print('Using %g dataloader workers' % dataloader.num_workers)
@@ -232,15 +224,14 @@ def train(hyp, tb_writer, opt, device):
         model.train()

         # Update image weights (optional)
         # When in DDP mode, the generated indices will be broadcasted to synchronize dataset.
         if dataset.image_weights:
-            # Generate indices.
+            # Generate indices
             if rank in [-1, 0]:
                 w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
                 image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
                 dataset.indices = random.choices(range(dataset.n), weights=image_weights,
                                                  k=dataset.n)  # rand weighted idx
-            # Broadcast.
+            # Broadcast if DDP
             if rank != -1:
                 indices = torch.zeros([dataset.n], dtype=torch.int)
                 if rank == 0:
@@ -263,7 +254,7 @@ def train(hyp, tb_writer, opt, device):
         optimizer.zero_grad()
         for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
             ni = i + nb * epoch  # number integrated batches (since train start)
-            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
+            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

             # Warmup
             if ni <= nw:
@@ -284,27 +275,26 @@ def train(hyp, tb_writer, opt, device):
                     ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                     imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

-            # Forward
-            pred = model(imgs)
+            # Autocast
+            with amp.autocast():
+                # Forward
+                pred = model(imgs)

-            # Loss
-            loss, loss_items = compute_loss(pred, targets.to(device), model)  # scaled by batch_size
-            if rank != -1:
-                loss *= opt.world_size  # gradient averaged between devices in DDP mode
-            if not torch.isfinite(loss):
-                print('WARNING: non-finite loss, ending training ', loss_items)
-                return results
+                # Loss
+                loss, loss_items = compute_loss(pred, targets.to(device), model)  # scaled by batch_size
+                if rank != -1:
+                    loss *= opt.world_size  # gradient averaged between devices in DDP mode
+                # if not torch.isfinite(loss):
+                #     print('WARNING: non-finite loss, ending training ', loss_items)
+                #     return results

             # Backward
-            if mixed_precision:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
+            scaler.scale(loss).backward()

             # Optimize
             if ni % accumulate == 0:
-                optimizer.step()
+                scaler.step(optimizer)  # optimizer.step
+                scaler.update()
                 optimizer.zero_grad()
                 if ema is not None:
                     ema.update(model)
@@ -312,7 +302,7 @@ def train(hyp, tb_writer, opt, device):
             # Print
             if rank in [-1, 0]:
                 mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
-                mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
+                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                 s = ('%10s' * 2 + '%10.4g' * 6) % (
                     '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                 pbar.set_description(s)
@@ -330,7 +320,7 @@ def train(hyp, tb_writer, opt, device):
         # Scheduler
         scheduler.step()

-        # Only the first process in DDP mode is allowed to log or save checkpoints.
+        # DDP process 0 or single-GPU
         if rank in [-1, 0]:
             # mAP
             if ema is not None:
@@ -377,7 +367,7 @@ def train(hyp, tb_writer, opt, device):
             # Save last, best and delete
             torch.save(ckpt, last)
             if best_fitness == fi:
                 torch.save(ckpt, best)
             del ckpt

         # end epoch ----------------------------------------------------------------------------------------------------
@@ -429,10 +419,12 @@ if __name__ == '__main__':
     parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
     opt = parser.parse_args()

+    # Resume
     last = get_latest_run() if opt.resume == 'get_last' else opt.resume  # resume from most recent run
     if last and not opt.weights:
         print(f'Resuming training from {last}')
+    opt.weights = last if opt.resume and not opt.weights else opt.weights

     if opt.local_rank in [-1, 0]:
         check_git_status()
     opt.cfg = check_file(opt.cfg)  # check file
@@ -442,21 +434,20 @@ if __name__ == '__main__':
     with open(opt.hyp) as f:
         hyp.update(yaml.load(f, Loader=yaml.FullLoader))  # update hyps
     opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
-    device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size)
+    device = torch_utils.select_device(opt.device, batch_size=opt.batch_size)
     opt.total_batch_size = opt.batch_size
     opt.world_size = 1
-    if device.type == 'cpu':
-        mixed_precision = False
-    elif opt.local_rank != -1:
-        # DDP mode
+
+    # DDP mode
+    if opt.local_rank != -1:
         assert torch.cuda.device_count() > opt.local_rank
         torch.cuda.set_device(opt.local_rank)
         device = torch.device("cuda", opt.local_rank)
         dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
         opt.world_size = dist.get_world_size()
         assert opt.batch_size % opt.world_size == 0, "Batch size is not a multiple of the number of devices given!"
         opt.batch_size = opt.total_batch_size // opt.world_size

     print(opt)

     # Train
@@ -466,11 +457,12 @@ if __name__ == '__main__':
             tb_writer = SummaryWriter(log_dir=increment_dir('runs/exp', opt.name))
         else:
             tb_writer = None
+
         train(hyp, tb_writer, opt, device)

     # Evolve hyperparameters (optional)
     else:
-        assert opt.local_rank == -1, "DDP mode currently not implemented for Evolve!"
+        assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
        tb_writer = None
        opt.notest, opt.nosave = True, True  # only test/save final epoch
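
For reference, the native mixed-precision recipe this diff migrates to (autocast around forward/loss, GradScaler around backward/step) reduces to the minimal, self-contained sketch below; the toy Linear model, random tensors and learning rate are illustrative placeholders, not code from train.py.

# Minimal sketch of the torch.cuda.amp pattern adopted above (toy model/data for illustration).
import torch
import torch.nn.functional as F
from torch.cuda import amp

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
cuda = device.type != 'cpu'

model = torch.nn.Linear(10, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = amp.GradScaler(enabled=cuda)  # becomes a no-op when running on CPU

for _ in range(3):
    x, y = torch.randn(4, 10, device=device), torch.randn(4, 1, device=device)
    with amp.autocast(enabled=cuda):   # forward pass + loss in mixed precision
        loss = F.mse_loss(model(x), y)
    scaler.scale(loss).backward()      # scaled backward replaces plain loss.backward()
    scaler.step(optimizer)             # unscales gradients, then calls optimizer.step()
    scaler.update()                    # adapts the loss scale for the next iteration
    optimizer.zero_grad()

Because GradScaler and autocast both take an enabled flag, the same loop runs unchanged on CPU, which is why the diff can drop the separate mixed_precision bookkeeping that the Apex version needed.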