wwcs-1314/src/DeepSORT_YOLOv5_Pytorch-master/main.py


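"""
Online multi-object tracking: a YOLOv5 detector finds objects (the person
class by default) in each frame of a video file or webcam stream, and a
DeepSORT tracker associates the detections into tracks. Annotated frames can
be displayed, written to results.mp4, and dumped as per-frame txt files of
x1, y1, x2, y2, track_id rows.
"""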
import argparse
import os
import sys
import time
import warnings

import cv2
import numpy as np
import torch
import torch.backends.cudnn as cudnn

# the bundled yolov5 repo must be on sys.path before its modules are imported,
# because yolov5's internal code uses absolute imports such as `from utils ...`
currentUrl = os.path.dirname(__file__)
sys.path.append(os.path.abspath(os.path.join(currentUrl, 'yolov5')))

from yolov5.utils.general import (
    check_img_size, non_max_suppression, scale_coords, xyxy2xywh)
from yolov5.utils.torch_utils import select_device, time_synchronized
from yolov5.utils.datasets import letterbox
from utils_ds.parser import get_config
from utils_ds.draw import draw_boxes
from deep_sort import build_tracker

cudnn.benchmark = True  # speeds up inference when the input size is constant
class VideoTracker(object):
    def __init__(self, args):
        print('Initialize DeepSORT & YOLO-V5')
        # ***************** Initialize ******************************************************
        self.args = args
        self.img_size = args.img_size                  # image size in detector, default is 640
        self.frame_interval = args.frame_interval      # frequency
        self.device = select_device(args.device)
        self.half = self.device.type != 'cpu'          # half precision only supported on CUDA

        # create video capture ****************
        if args.display:
            cv2.namedWindow("test", cv2.WINDOW_NORMAL)
            cv2.resizeWindow("test", args.display_width, args.display_height)

        if args.cam != -1:
            print("Using webcam " + str(args.cam))
            self.vdo = cv2.VideoCapture(args.cam)
            if not self.vdo.isOpened():
                raise ValueError(f"Error opening camera {args.cam}")
        else:
            self.vdo = cv2.VideoCapture()

        # ***************************** initialize DeepSORT **********************************
        cfg = get_config()
        cfg.merge_from_file(args.config_deepsort)

        use_cuda = self.device.type != 'cpu' and torch.cuda.is_available()
        self.deepsort = build_tracker(cfg, use_cuda=use_cuda)

        # ***************************** initialize YOLO-V5 **********************************
        self.detector = torch.load(args.weights, map_location=self.device)['model'].float()  # load to FP32
        self.detector.to(self.device).eval()
        if self.half:
            self.detector.half()  # to FP16

        self.names = self.detector.module.names if hasattr(self.detector, 'module') else self.detector.names

        print('Done..')
        if self.device.type == 'cpu':
            warnings.warn("Running in cpu mode, which may be very slow!", UserWarning)
    def __enter__(self):
        # ************************* Load video from camera *************************
        if self.args.cam != -1:
            print('Camera ...')
            ret, frame = self.vdo.read()
            assert ret, "Error: Camera error"
            self.im_width = int(self.vdo.get(cv2.CAP_PROP_FRAME_WIDTH))
            self.im_height = int(self.vdo.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # ************************* Load video from file *************************
        else:
            assert os.path.isfile(self.args.input_path), "Path error"
            self.vdo.open(self.args.input_path)
            self.im_width = int(self.vdo.get(cv2.CAP_PROP_FRAME_WIDTH))
            self.im_height = int(self.vdo.get(cv2.CAP_PROP_FRAME_HEIGHT))
            assert self.vdo.isOpened()
            print('Done. Load video file ', self.args.input_path)

        # ************************* create output *************************
        if self.args.save_path:
            os.makedirs(self.args.save_path, exist_ok=True)
            # path of saved video and results
            self.save_video_path = os.path.join(self.args.save_path, "results.mp4")
            # create video writer
            fourcc = cv2.VideoWriter_fourcc(*self.args.fourcc)
            self.writer = cv2.VideoWriter(self.save_video_path, fourcc,
                                          self.vdo.get(cv2.CAP_PROP_FPS), (self.im_width, self.im_height))
            print('Done. Create output file ', self.save_video_path)

        if self.args.save_txt:
            os.makedirs(self.args.save_txt, exist_ok=True)

        return self
    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.vdo.release()
        if hasattr(self, 'writer'):  # writer only exists when save_path is set
            self.writer.release()
        if exc_type:
            print(exc_type, exc_value, exc_traceback)
    def run(self):
        yolo_time, sort_time, avg_fps = [], [], []
        t_start = time.time()

        idx_frame = 0
        last_out = None
        while self.vdo.grab():
            # Inference *********************************************************************
            t0 = time.time()
            _, img0 = self.vdo.retrieve()

            if idx_frame % self.args.frame_interval == 0:
                outputs, yt, st = self.image_track(img0)  # (#ID, 5) x1,y1,x2,y2,id
                last_out = outputs
                yolo_time.append(yt)
                sort_time.append(st)
                print('Frame %d Done. YOLO-time:(%.3fs) SORT-time:(%.3fs)' % (idx_frame, yt, st))
            else:
                outputs = last_out  # reuse the prediction from the last processed frame

            t1 = time.time()
            avg_fps.append(t1 - t0)

            # post-processing ***************************************************************
            # visualize bbox ********************************
            if len(outputs) > 0:
                bbox_xyxy = outputs[:, :4]
                identities = outputs[:, -1]
                img0 = draw_boxes(img0, bbox_xyxy, identities)  # BGR

                # add FPS information on output video
                text_scale = max(1, img0.shape[1] // 1600)
                cv2.putText(img0, 'frame: %d fps: %.2f ' % (idx_frame, len(avg_fps) / sum(avg_fps)),
                            (20, 20 + text_scale), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2)

            # display on window ******************************
            if self.args.display:
                cv2.imshow("test", img0)
                if cv2.waitKey(1) == ord('q'):  # q to quit
                    cv2.destroyAllWindows()
                    break

            # save to video file *****************************
            if self.args.save_path:
                self.writer.write(img0)

            if self.args.save_txt:
                with open(os.path.join(self.args.save_txt, str(idx_frame).zfill(4) + '.txt'), 'a') as f:
                    for i in range(len(outputs)):
                        x1, y1, x2, y2, idx = outputs[i]
                        f.write('{}\t{}\t{}\t{}\t{}\n'.format(x1, y1, x2, y2, idx))

            idx_frame += 1

        print('Avg YOLO time (%.3fs), Sort time (%.3fs) per frame' % (sum(yolo_time) / len(yolo_time),
                                                                      sum(sort_time) / len(sort_time)))
        t_end = time.time()
        print('Total time (%.3fs), Total Frame: %d' % (t_end - t_start, idx_frame))
    def image_track(self, im0):
        """
        :param im0: original image, BGR format
        :return: tracker outputs (#ID, 5), detector time, tracker time
        """
        # preprocess ************************************************************
        # Padded resize
        img = letterbox(im0, new_shape=self.img_size)[0]

        # Convert
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to CHW
        img = np.ascontiguousarray(img)

        # numpy to tensor
        img = torch.from_numpy(img).to(self.device)
        img = img.half() if self.half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)
        s = '%gx%g ' % img.shape[2:]  # print string

        # Detection time *********************************************************
        # Inference
        t1 = time_synchronized()
        with torch.no_grad():
            pred = self.detector(img, augment=self.args.augment)[0]  # list: bz * [ (#obj, 6)]

        # Apply NMS and filter out objects other than person (cls:0)
        pred = non_max_suppression(pred, self.args.conf_thres, self.args.iou_thres,
                                   classes=self.args.classes, agnostic=self.args.agnostic_nms)
        t2 = time_synchronized()

        # get all obj ************************************************************
        det = pred[0]  # for video, bz is 1
        if det is not None and len(det):  # det: (#obj, 6)  x1 y1 x2 y2 conf cls
            # Rescale boxes from img_size to original im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

            # Print results. statistics of number of each obj
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class
                s += '%g %ss, ' % (n, self.names[int(c)])  # add to string

            bbox_xywh = xyxy2xywh(det[:, :4]).cpu()
            confs = det[:, 4:5].cpu()

            # ****************************** deepsort ****************************
            outputs = self.deepsort.update(bbox_xywh, confs, im0)
            # (#ID, 5) x1,y1,x2,y2,track_ID
        else:
            outputs = torch.zeros((0, 5))

        t3 = time.time()
        return outputs, t2 - t1, t3 - t2
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # input and output
    parser.add_argument('--input_path', type=str, default='input_480.mp4', help='source video file')
    parser.add_argument('--save_path', type=str, default='output/', help='output folder')
    parser.add_argument('--frame_interval', type=int, default=2, help='run the detector every N frames')
    parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec (verify ffmpeg support)')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--save_txt', default='output/predict/', help='folder to save per-frame txt results')

    # camera only
    parser.add_argument("--display", action="store_true")
    parser.add_argument("--display_width", type=int, default=800)
    parser.add_argument("--display_height", type=int, default=600)
    parser.add_argument("--camera", action="store", dest="cam", type=int, default=-1)

    # YOLO-V5 parameters
    parser.add_argument('--weights', type=str, default='yolov5/weights/yolov5s.pt', help='model.pt path')
    parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)')
    parser.add_argument('--conf-thres', type=float, default=0.5, help='object confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.5, help='IOU threshold for NMS')
    parser.add_argument('--classes', nargs='+', type=int, default=[0], help='filter by class')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')

    # deepsort parameters
    parser.add_argument("--config_deepsort", type=str, default="./configs/deep_sort.yaml")

    args = parser.parse_args()
    args.img_size = check_img_size(args.img_size)
    print(args)

    with VideoTracker(args) as vdo_trk:
        vdo_trk.run()
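
# Example invocations (a sketch; assumes the default yolov5/weights/yolov5s.pt
# and ./configs/deep_sort.yaml paths above exist, and that the input file or
# camera index is replaced with your own source):
#   python main.py --input_path input_480.mp4 --save_path output/ --display
#   python main.py --camera 0 --display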