From 260b1729f095188ca46f5a478699abaa1684726c Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sun, 14 Jun 2020 18:32:29 -0700 Subject: [PATCH] FP16 inference update --- detect.py | 14 ++++---------- requirements.txt | 6 +++--- test.py | 26 +++++++++++++++++--------- utils/utils.py | 5 ++++- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/detect.py b/detect.py index 66f1522..fa67269 100644 --- a/detect.py +++ b/detect.py @@ -14,6 +14,7 @@ def detect(save_img=False): if os.path.exists(out): shutil.rmtree(out) # delete output folder os.makedirs(out) # make new output folder + half &= device.type != 'cpu' # half precision only supported on CUDA # Load model google_utils.attempt_download(weights) @@ -21,6 +22,8 @@ def detect(save_img=False): # torch.save(torch.load(weights, map_location=device), weights) # update model if SourceChangeWarning # model.fuse() model.to(device).eval() + if half: + model.half() # to FP16 # Second-stage classifier classify = False @@ -29,11 +32,6 @@ def detect(save_img=False): modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model']) # load weights modelc.to(device).eval() - # Half precision - half = half and device.type != 'cpu' # half precision only supported on CUDA - if half: - model.half() - # Set Dataloader vid_path, vid_writer = None, None if webcam: @@ -51,7 +49,7 @@ def detect(save_img=False): # Run inference t0 = time.time() img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img - _ = model(img.half() if half else img.float()) if device.type != 'cpu' else None # run once + _ = model(img.half() if half else img) if device.type != 'cpu' else None # run once for path, img, im0s, vid_cap in dataset: img = torch.from_numpy(img).to(device) img = img.half() if half else img.float() # uint8 to fp16/32 @@ -63,10 +61,6 @@ def detect(save_img=False): t1 = torch_utils.time_synchronized() pred = model(img, augment=opt.augment)[0] - # to float - if half: - pred = pred.float() - # Apply NMS pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, fast=True, classes=opt.classes, agnostic=opt.agnostic_nms) diff --git a/requirements.txt b/requirements.txt index 5d38f93..30660e9 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ # pip install -U -r requirements.txt Cython -numpy +numpy==1.17 opencv-python -torch >= 1.5 +torch>=1.5 matplotlib pillow tensorboard -pyyaml >= 5.3 +PyYAML>=5.3 torchvision scipy tqdm diff --git a/test.py b/test.py index 3b52be2..646e93b 100644 --- a/test.py +++ b/test.py @@ -20,10 +20,12 @@ def test(data, model=None, dataloader=None, fast=False, - verbose=False): # 0 fast, 1 accurate + verbose=False, + half=False): # FP16 # Initialize/load model and set device if model is None: device = torch_utils.select_device(opt.device, batch_size=batch_size) + half &= device.type != 'cpu' # half precision only supported on CUDA # Remove previous for f in glob.glob('test_batch*.jpg'): @@ -35,6 +37,8 @@ def test(data, torch_utils.model_info(model) # model.fuse() model.to(device) + if half: + model.half() # to FP16 if device.type != 'cpu' and torch.cuda.device_count() > 1: model = nn.DataParallel(model) @@ -72,24 +76,27 @@ def test(data, seen = 0 model.eval() - _ = model(torch.zeros((1, 3, imgsz, imgsz), device=device)) if device.type != 'cpu' else None # run once + img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img + _ = model(img.half() if half else img) if device.type != 'cpu' else None # run once names = model.names if hasattr(model, 'names') else model.module.names coco91class = coco80_to_coco91_class() s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95') p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0. loss = torch.zeros(3, device=device) jdict, stats, ap, ap_class = [], [], [], [] - for batch_i, (imgs, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)): - imgs = imgs.to(device).float() / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 + for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)): + img = img.to(device) + img = img.half() if half else img.float() # uint8 to fp16/32 + img /= 255.0 # 0 - 255 to 0.0 - 1.0 targets = targets.to(device) - nb, _, height, width = imgs.shape # batch size, channels, height, width + nb, _, height, width = img.shape # batch size, channels, height, width whwh = torch.Tensor([width, height, width, height]).to(device) # Disable gradients with torch.no_grad(): # Run model t = torch_utils.time_synchronized() - inf_out, train_out = model(imgs, augment=augment) # inference and training outputs + inf_out, train_out = model(img, augment=augment) # inference and training outputs t0 += torch_utils.time_synchronized() - t # Compute loss @@ -125,7 +132,7 @@ def test(data, # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ... image_id = int(Path(paths[si]).stem.split('_')[-1]) box = pred[:, :4].clone() # xyxy - scale_coords(imgs[si].shape[1:], box, shapes[si][0], shapes[si][1]) # to original shape + scale_coords(img[si].shape[1:], box, shapes[si][0], shapes[si][1]) # to original shape box = xyxy2xywh(box) # xywh box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner for p, b in zip(pred.tolist(), box.tolist()): @@ -168,9 +175,9 @@ def test(data, # Plot images if batch_i < 1: f = 'test_batch%g_gt.jpg' % batch_i # filename - plot_images(imgs, targets, paths, f, names) # ground truth + plot_images(img, targets, paths, f, names) # ground truth f = 'test_batch%g_pred.jpg' % batch_i - plot_images(imgs, output_to_target(output, width, height), paths, f, names) # predictions + plot_images(img, output_to_target(output, width, height), paths, f, names) # predictions # Compute statistics stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy @@ -241,6 +248,7 @@ if __name__ == '__main__': parser.add_argument('--save-json', action='store_true', help='save a cocoapi-compatible JSON results file') parser.add_argument('--task', default='val', help="'val', 'test', 'study'") parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') + parser.add_argument('--half', action='store_true', help='half precision FP16 inference') parser.add_argument('--single-cls', action='store_true', help='treat as single-class dataset') parser.add_argument('--augment', action='store_true', help='augmented inference') parser.add_argument('--verbose', action='store_true', help='report mAP by class') diff --git a/utils/utils.py b/utils/utils.py index 122d107..860dfe5 100755 --- a/utils/utils.py +++ b/utils/utils.py @@ -504,6 +504,9 @@ def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, fast=False, c Returns detections with shape: nx6 (x1, y1, x2, y2, conf, cls) """ + if prediction.dtype is torch.float16: + prediction = prediction.float() # to FP32 + nc = prediction[0].shape[1] - 5 # number of classes xc = prediction[..., 4] > conf_thres # candidates @@ -902,7 +905,7 @@ def plot_images(images, targets, paths=None, fname='images.jpg', names=None, max return None if isinstance(images, torch.Tensor): - images = images.cpu().numpy() + images = images.cpu().float().numpy() if isinstance(targets, torch.Tensor): targets = targets.cpu().numpy()