diff --git a/SECURITY.md b/SECURITY.md index 12fbd83..b40ff15 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,20 +1,16 @@ - + ## Security -Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [many more](https://opensource.microsoft.com/). -If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets Microsoft's [definition](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)) of a security vulnerability, please report it to us as described below. ## Reporting Security Issues -**Please do not report security vulnerabilities through public GitHub issues.** +**Please do not report security vulnerabilities through public GitHub issues.** Instead, please report them to the Microsoft Security Response Center at [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://technet.microsoft.com/en-us/security/dn606155). -Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). - -If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). - -You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: @@ -28,8 +24,6 @@ Please include the requested information listed below (as much as you can provid This information will help us triage your report more quickly. -If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. - ## Preferred Languages We prefer all communications to be in English. diff --git a/build/Dockerfile b/build/Dockerfile new file mode 100644 index 0000000..ae3af52 --- /dev/null +++ b/build/Dockerfile @@ -0,0 +1,98 @@ +ARG IMAGE_NAME +FROM ${IMAGE_NAME}:10.2-runtime-ubuntu18.04 +LABEL maintainer "NVIDIA CORPORATION " + +RUN apt-get update && apt-get install -y --no-install-recommends \ + cuda-nvml-dev-$CUDA_PKG_VERSION \ + cuda-command-line-tools-$CUDA_PKG_VERSION \ +cuda-libraries-dev-$CUDA_PKG_VERSION \ + cuda-minimal-build-$CUDA_PKG_VERSION \ + libnccl-dev=$NCCL_VERSION-1+cuda10.2 \ +libcublas-dev=10.2.2.89-1 \ +&& \ + rm -rf /var/lib/apt/lists/* + +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs + +# Install some basic utilities +RUN apt-get update && apt-get install -y \ + curl \ + wget \ + build-essential \ + ca-certificates \ + sudo \ + git \ + bzip2 \ + libx11-6 \ + && rm -rf /var/lib/apt/lists/* + +# Create a working directory +RUN mkdir /app +WORKDIR /app + +# Create a non-root user and switch to it +RUN adduser --disabled-password --gecos '' --shell /bin/bash user \ + && chown -R user:user /app +RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user + +USER user + +# All users can use /home/user as their home directory +ENV HOME=/home/user +RUN chmod 777 /home/user + +# CT: 4/17 + + +# Install Miniconda +# RUN curl -so ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh \ + +RUN wget https://repo.continuum.io/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh \ + && chmod +x ~/miniconda.sh \ + && ~/miniconda.sh -b -p ~/miniconda \ + && rm ~/miniconda.sh +ENV PATH=/home/user/miniconda/bin:$PATH +ENV CONDA_AUTO_UPDATE_CONDA=false + +# Create a Python 3.6 environment +RUN /home/user/miniconda/bin/conda create -y --name py36 python=3.6.9 \ + && /home/user/miniconda/bin/conda clean -ya +ENV CONDA_DEFAULT_ENV=py36 +ENV CONDA_PREFIX=/home/user/miniconda/envs/$CONDA_DEFAULT_ENV +ENV PATH=$CONDA_PREFIX/bin:$PATH +RUN /home/user/miniconda/bin/conda install conda-build=3.18.9=py36_3 \ + && /home/user/miniconda/bin/conda clean -ya + +# CUDA 10.0-specific steps +RUN conda install -y -c pytorch \ + cudatoolkit=10.0 \ + "pytorch=1.2.0=py3.6_cuda10.0.130_cudnn7.6.2_0" \ + "torchvision=0.4.0=py36_cu100" \ + && conda clean -ya + +# Install HDF5 Python bindings +RUN conda install -y h5py=2.8.0 \ + && conda clean -ya +RUN pip install h5py-cache==1.0 + +# Install Torchnet, a high-level framework for PyTorch +RUN pip install torchnet==0.0.4 + +# Install Requests, a Python library for making HTTP requests +RUN conda install -y requests=2.19.1 \ + && conda clean -ya + +# Install Graphviz +RUN conda install -y graphviz=2.40.1 python-graphviz=0.8.4 \ + && conda clean -ya + +# Install OpenCV3 Python bindings +RUN sudo apt-get update && sudo apt-get install -y --no-install-recommends \ + libgtk2.0-0 \ + libcanberra-gtk-module \ + && sudo rm -rf /var/lib/apt/lists/* +RUN conda install -y -c menpo opencv3=3.1.0 \ + && conda clean -ya + +# Set the default command to python3 +CMD ["/bin/bash"] diff --git a/src/_init_paths.py b/src/_init_paths.py index 7d72f18..64ad58e 100644 --- a/src/_init_paths.py +++ b/src/_init_paths.py @@ -1,6 +1,6 @@ # ------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation -# Licensed under MIT License +# Copyright (c) Microsoft Corporation. 
All rights reserved. +# Licensed under the MIT License. # ------------------------------------------------------------------------------ import os.path as osp diff --git a/src/demo.py b/src/demo.py index 8730bf8..46755e9 100644 --- a/src/demo.py +++ b/src/demo.py @@ -1,6 +1,6 @@ # ------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation -# Licensed under MIT License +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. # ------------------------------------------------------------------------------ from __future__ import absolute_import diff --git a/src/gen_data_path.py b/src/gen_data_path.py index 3c64971..3692e5f 100644 --- a/src/gen_data_path.py +++ b/src/gen_data_path.py @@ -1,6 +1,6 @@ # ------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation -# Licensed under MIT License +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. # ------------------------------------------------------------------------------ import os diff --git a/src/gen_labels_15.py b/src/gen_labels_15.py index 164c7e3..0585320 100644 --- a/src/gen_labels_15.py +++ b/src/gen_labels_15.py @@ -1,6 +1,6 @@ # ------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation -# Licensed under MIT License +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. # ------------------------------------------------------------------------------ import os.path as osp diff --git a/src/gen_labels_20.py b/src/gen_labels_20.py index 8239525..a714729 100644 --- a/src/gen_labels_20.py +++ b/src/gen_labels_20.py @@ -1,6 +1,6 @@ # ------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation -# Licensed under MIT License +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
# ------------------------------------------------------------------------------ import os.path as osp diff --git a/src/lib/cfg/data.json b/src/lib/cfg/data.json new file mode 100644 index 0000000..f3f1c19 --- /dev/null +++ b/src/lib/cfg/data.json @@ -0,0 +1,20 @@ +{ + "root":"/data/yfzhang/MOT/JDE", + "train": + { + "mot17":"./data/mot17.train", + "caltech":"./data/caltech.train", + "citypersons":"./data/citypersons.train", + "cuhksysu":"./data/cuhksysu.train", + "prw":"./data/prw.train", + "eth":"./data/eth.train" + }, + "test_emb": + { + "mot15":"./data/mot15.val" + }, + "test": + { + "mot15":"./data/mot15.val" + } +} diff --git a/src/lib/cfg/mot15.json b/src/lib/cfg/mot15.json new file mode 100644 index 0000000..7f2bd6e --- /dev/null +++ b/src/lib/cfg/mot15.json @@ -0,0 +1,15 @@ +{ + "root":"/data/yfzhang/MOT/JDE", + "train": + { + "mot15":"./data/mot15.train" + }, + "test_emb": + { + "mot15":"./data/mot15.train" + }, + "test": + { + "mot15":"./data/mot15.train" + } +} diff --git a/src/lib/cfg/mot17.json b/src/lib/cfg/mot17.json new file mode 100644 index 0000000..6a708d9 --- /dev/null +++ b/src/lib/cfg/mot17.json @@ -0,0 +1,15 @@ +{ + "root":"/data/yfzhang/MOT/JDE", + "train": + { + "mot17":"./data/mot17.train" + }, + "test_emb": + { + "mot17":"./data/mot17.train" + }, + "test": + { + "mot17":"./data/mot17.train" + } +} diff --git a/src/lib/cfg/mot20.json b/src/lib/cfg/mot20.json new file mode 100644 index 0000000..3700278 --- /dev/null +++ b/src/lib/cfg/mot20.json @@ -0,0 +1,15 @@ +{ + "root":"/data/yfzhang/MOT/JDE", + "train": + { + "mot20":"./data/mot20.train" + }, + "test_emb": + { + "mot20":"./data/mot20.train" + }, + "test": + { + "mot20":"./data/mot20.train" + } +} diff --git a/src/lib/datasets/dataset/jde.py b/src/lib/datasets/dataset/jde.py new file mode 100644 index 0000000..590d95d --- /dev/null +++ b/src/lib/datasets/dataset/jde.py @@ -0,0 +1,539 @@ +import glob +import math +import os +import os.path as osp +import random +import time +from collections import OrderedDict + +import cv2 +import json +import numpy as np +import torch + +from torch.utils.data import Dataset +from torchvision.transforms import transforms as T +from cython_bbox import bbox_overlaps as bbox_ious +from opts import opts +from utils.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian +from utils.utils import xyxy2xywh, generate_anchors, xywh2xyxy, encode_delta + + +class LoadImages: # for inference + def __init__(self, path, img_size=(1088, 608)): + if os.path.isdir(path): + image_format = ['.jpg', '.jpeg', '.png', '.tif'] + self.files = sorted(glob.glob('%s/*.*' % path)) + self.files = list(filter(lambda x: os.path.splitext(x)[1].lower() in image_format, self.files)) + elif os.path.isfile(path): + self.files = [path] + + self.nF = len(self.files) # number of image files + self.width = img_size[0] + self.height = img_size[1] + self.count = 0 + + assert self.nF > 0, 'No images found in ' + path + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + if self.count == self.nF: + raise StopIteration + img_path = self.files[self.count] + + # Read image + img0 = cv2.imread(img_path) # BGR + assert img0 is not None, 'Failed to load ' + img_path + + # Padded resize + img, _, _, _ = letterbox(img0, height=self.height, width=self.width) + + # Normalize RGB + img = img[:, :, ::-1].transpose(2, 0, 1) + img = np.ascontiguousarray(img, dtype=np.float32) + img /= 255.0 + + # cv2.imwrite(img_path + '.letterbox.jpg', 255 * img.transpose((1, 2, 0))[:, :, 
::-1]) # save letterbox image + return img_path, img, img0 + + def __getitem__(self, idx): + idx = idx % self.nF + img_path = self.files[idx] + + # Read image + img0 = cv2.imread(img_path) # BGR + assert img0 is not None, 'Failed to load ' + img_path + + # Padded resize + img, _, _, _ = letterbox(img0, height=self.height, width=self.width) + + # Normalize RGB + img = img[:, :, ::-1].transpose(2, 0, 1) + img = np.ascontiguousarray(img, dtype=np.float32) + img /= 255.0 + + return img_path, img, img0 + + def __len__(self): + return self.nF # number of files + + +class LoadVideo: # for inference + def __init__(self, path, img_size=(1088, 608)): + self.cap = cv2.VideoCapture(path) + self.frame_rate = int(round(self.cap.get(cv2.CAP_PROP_FPS))) + self.vw = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + self.vh = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + self.vn = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + self.width = img_size[0] + self.height = img_size[1] + self.count = 0 + + self.w, self.h = 1920, 1080 + print('Lenth of the video: {:d} frames'.format(self.vn)) + + def get_size(self, vw, vh, dw, dh): + wa, ha = float(dw) / vw, float(dh) / vh + a = min(wa, ha) + return int(vw * a), int(vh * a) + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + if self.count == len(self): + raise StopIteration + # Read image + res, img0 = self.cap.read() # BGR + assert img0 is not None, 'Failed to load frame {:d}'.format(self.count) + img0 = cv2.resize(img0, (self.w, self.h)) + + # Padded resize + img, _, _, _ = letterbox(img0, height=self.height, width=self.width) + + # Normalize RGB + img = img[:, :, ::-1].transpose(2, 0, 1) + img = np.ascontiguousarray(img, dtype=np.float32) + img /= 255.0 + + # cv2.imwrite(img_path + '.letterbox.jpg', 255 * img.transpose((1, 2, 0))[:, :, ::-1]) # save letterbox image + return self.count, img, img0 + + def __len__(self): + return self.vn # number of files + + +class LoadImagesAndLabels: # for training + def __init__(self, path, img_size=(1088, 608), augment=False, transforms=None): + with open(path, 'r') as file: + self.img_files = file.readlines() + self.img_files = [x.replace('\n', '') for x in self.img_files] + self.img_files = list(filter(lambda x: len(x) > 0, self.img_files)) + + self.label_files = [x.replace('images', 'labels_with_ids').replace('.png', '.txt').replace('.jpg', '.txt') + for x in self.img_files] + + self.nF = len(self.img_files) # number of image files + self.width = img_size[0] + self.height = img_size[1] + self.augment = augment + self.transforms = transforms + + def __getitem__(self, files_index): + img_path = self.img_files[files_index] + label_path = self.label_files[files_index] + return self.get_data(img_path, label_path) + + def get_data(self, img_path, label_path): + height = self.height + width = self.width + img = cv2.imread(img_path) # BGR + if img is None: + raise ValueError('File corrupt {}'.format(img_path)) + augment_hsv = True + if self.augment and augment_hsv: + # SV augmentation by 50% + fraction = 0.50 + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + S = img_hsv[:, :, 1].astype(np.float32) + V = img_hsv[:, :, 2].astype(np.float32) + + a = (random.random() * 2 - 1) * fraction + 1 + S *= a + if a > 1: + np.clip(S, a_min=0, a_max=255, out=S) + + a = (random.random() * 2 - 1) * fraction + 1 + V *= a + if a > 1: + np.clip(V, a_min=0, a_max=255, out=V) + + img_hsv[:, :, 1] = S.astype(np.uint8) + img_hsv[:, :, 2] = V.astype(np.uint8) + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) 
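+        # The block above jitters saturation and value by a random factor in
+        # [1 - fraction, 1 + fraction]; the image is then letterboxed to the
+        # network input size and the label boxes are rescaled to match.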
+ + h, w, _ = img.shape + img, ratio, padw, padh = letterbox(img, height=height, width=width) + + # Load labels + if os.path.isfile(label_path): + labels0 = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 6) + + # Normalized xywh to pixel xyxy format + labels = labels0.copy() + labels[:, 2] = ratio * w * (labels0[:, 2] - labels0[:, 4] / 2) + padw + labels[:, 3] = ratio * h * (labels0[:, 3] - labels0[:, 5] / 2) + padh + labels[:, 4] = ratio * w * (labels0[:, 2] + labels0[:, 4] / 2) + padw + labels[:, 5] = ratio * h * (labels0[:, 3] + labels0[:, 5] / 2) + padh + else: + labels = np.array([]) + + # Augment image and labels + if self.augment: + img, labels, M = random_affine(img, labels, degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.50, 1.20)) + + plotFlag = False + if plotFlag: + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + plt.figure(figsize=(50, 50)) + plt.imshow(img[:, :, ::-1]) + plt.plot(labels[:, [2, 4, 4, 2, 2]].T, labels[:, [3, 3, 5, 5, 3]].T, '.-') + plt.axis('off') + plt.savefig('test.jpg') + time.sleep(10) + + nL = len(labels) + if nL > 0: + # convert xyxy to xywh + labels[:, 2:6] = xyxy2xywh(labels[:, 2:6].copy()) # / height + labels[:, 2] /= width + labels[:, 3] /= height + labels[:, 4] /= width + labels[:, 5] /= height + if self.augment: + # random left-right flip + lr_flip = True + if lr_flip & (random.random() > 0.5): + img = np.fliplr(img) + if nL > 0: + labels[:, 2] = 1 - labels[:, 2] + + img = np.ascontiguousarray(img[:, :, ::-1]) # BGR to RGB + + if self.transforms is not None: + img = self.transforms(img) + + return img, labels, img_path, (h, w) + + def __len__(self): + return self.nF # number of batches + + +def letterbox(img, height=608, width=1088, + color=(127.5, 127.5, 127.5)): # resize a rectangular image to a padded rectangular + shape = img.shape[:2] # shape = [height, width] + ratio = min(float(height) / shape[0], float(width) / shape[1]) + new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) # new_shape = [width, height] + dw = (width - new_shape[0]) / 2 # width padding + dh = (height - new_shape[1]) / 2 # height padding + top, bottom = round(dh - 0.1), round(dh + 0.1) + left, right = round(dw - 0.1), round(dw + 0.1) + img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular + return img, ratio, dw, dh + + +def random_affine(img, targets=None, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), + borderValue=(127.5, 127.5, 127.5)): + # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) + # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4 + + border = 0 # width of added border (optional) + height = img.shape[0] + width = img.shape[1] + + # Rotation and Scale + R = np.eye(3) + a = random.random() * (degrees[1] - degrees[0]) + degrees[0] + # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations + s = random.random() * (scale[1] - scale[0]) + scale[0] + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) + + # Translation + T = np.eye(3) + T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) + T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) + + # Shear + S = np.eye(3) + S[0, 1] = 
math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) + + M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! + imw = cv2.warpPerspective(img, M, dsize=(width, height), flags=cv2.INTER_LINEAR, + borderValue=borderValue) # BGR order borderValue + + # Return warped points also + if targets is not None: + if len(targets) > 0: + n = targets.shape[0] + points = targets[:, 2:6].copy() + area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) + + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = (xy @ M.T)[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # apply angle-based reduction + radians = a * math.pi / 180 + reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 + x = (xy[:, 2] + xy[:, 0]) / 2 + y = (xy[:, 3] + xy[:, 1]) / 2 + w = (xy[:, 2] - xy[:, 0]) * reduction + h = (xy[:, 3] - xy[:, 1]) * reduction + xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T + + # reject warped points outside of image + np.clip(xy[:, 0], 0, width, out=xy[:, 0]) + np.clip(xy[:, 2], 0, width, out=xy[:, 2]) + np.clip(xy[:, 1], 0, height, out=xy[:, 1]) + np.clip(xy[:, 3], 0, height, out=xy[:, 3]) + w = xy[:, 2] - xy[:, 0] + h = xy[:, 3] - xy[:, 1] + area = w * h + ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) + i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) + + targets = targets[i] + targets[:, 2:6] = xy[i] + + return imw, targets, M + else: + return imw + + +def collate_fn(batch): + imgs, labels, paths, sizes = zip(*batch) + batch_size = len(labels) + imgs = torch.stack(imgs, 0) + max_box_len = max([l.shape[0] for l in labels]) + labels = [torch.from_numpy(l) for l in labels] + filled_labels = torch.zeros(batch_size, max_box_len, 6) + labels_len = torch.zeros(batch_size) + + for i in range(batch_size): + isize = labels[i].shape[0] + if len(labels[i]) > 0: + filled_labels[i, :isize, :] = labels[i] + labels_len[i] = isize + + return imgs, filled_labels, paths, sizes, labels_len.unsqueeze(1) + + +class JointDataset(LoadImagesAndLabels): # for training + default_resolution = [1088, 608] + mean = None + std = None + num_classes = 1 + + def __init__(self, opt, root, paths, img_size=(1088, 608), augment=False, transforms=None): + self.opt = opt + dataset_names = paths.keys() + self.img_files = OrderedDict() + self.label_files = OrderedDict() + self.tid_num = OrderedDict() + self.tid_start_index = OrderedDict() + self.num_classes = 1 + + for ds, path in paths.items(): + with open(path, 'r') as file: + self.img_files[ds] = file.readlines() + self.img_files[ds] = [osp.join(root, x.strip()) for x in self.img_files[ds]] + self.img_files[ds] = list(filter(lambda x: len(x) > 0, self.img_files[ds])) + + self.label_files[ds] = [ + x.replace('images', 'labels_with_ids').replace('.png', '.txt').replace('.jpg', '.txt') + for x in self.img_files[ds]] + + for ds, label_paths in self.label_files.items(): + max_index = -1 + for lp in label_paths: + lb = np.loadtxt(lp) + if len(lb) < 1: + continue + if len(lb.shape) < 2: + img_max = lb[1] + else: + img_max = np.max(lb[:, 1]) + if img_max > max_index: + max_index = img_max + self.tid_num[ds] = max_index + 1 + + last_index = 
0 + for i, (k, v) in enumerate(self.tid_num.items()): + self.tid_start_index[k] = last_index + last_index += v + + self.nID = int(last_index + 1) + self.nds = [len(x) for x in self.img_files.values()] + self.cds = [sum(self.nds[:i]) for i in range(len(self.nds))] + self.nF = sum(self.nds) + self.width = img_size[0] + self.height = img_size[1] + self.max_objs = opt.K + self.augment = augment + self.transforms = transforms + + print('=' * 80) + print('dataset summary') + print(self.tid_num) + print('total # identities:', self.nID) + print('start index') + print(self.tid_start_index) + print('=' * 80) + + def __getitem__(self, files_index): + + for i, c in enumerate(self.cds): + if files_index >= c: + ds = list(self.label_files.keys())[i] + start_index = c + + img_path = self.img_files[ds][files_index - start_index] + label_path = self.label_files[ds][files_index - start_index] + + imgs, labels, img_path, (input_h, input_w) = self.get_data(img_path, label_path) + for i, _ in enumerate(labels): + if labels[i, 1] > -1: + labels[i, 1] += self.tid_start_index[ds] + + output_h = imgs.shape[1] // self.opt.down_ratio + output_w = imgs.shape[2] // self.opt.down_ratio + num_classes = self.num_classes + num_objs = labels.shape[0] + hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32) + wh = np.zeros((self.max_objs, 2), dtype=np.float32) + reg = np.zeros((self.max_objs, 2), dtype=np.float32) + ind = np.zeros((self.max_objs, ), dtype=np.int64) + reg_mask = np.zeros((self.max_objs, ), dtype=np.uint8) + ids = np.zeros((self.max_objs, ), dtype=np.int64) + + draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else draw_umich_gaussian + for k in range(num_objs): + label = labels[k] + bbox = label[2:] + cls_id = int(label[0]) + bbox[[0, 2]] = bbox[[0, 2]] * output_w + bbox[[1, 3]] = bbox[[1, 3]] * output_h + bbox[0] = np.clip(bbox[0], 0, output_w - 1) + bbox[1] = np.clip(bbox[1], 0, output_h - 1) + h = bbox[3] + w = bbox[2] + + if h > 0 and w > 0: + radius = gaussian_radius((math.ceil(h), math.ceil(w))) + radius = max(0, int(radius)) + radius = self.opt.hm_gauss if self.opt.mse_loss else radius + ct = np.array( + [bbox[0], bbox[1]], dtype=np.float32) + ct_int = ct.astype(np.int32) + draw_gaussian(hm[cls_id], ct_int, radius) + wh[k] = 1. * w, 1. 
* h + ind[k] = ct_int[1] * output_w + ct_int[0] + reg[k] = ct - ct_int + reg_mask[k] = 1 + ids[k] = label[1] + + ret = {'input': imgs, 'hm': hm, 'reg_mask': reg_mask, 'ind': ind, 'wh': wh, 'reg': reg, 'ids': ids} + return ret + + +class DetDataset(LoadImagesAndLabels): # for training + def __init__(self, root, paths, img_size=(1088, 608), augment=False, transforms=None): + + dataset_names = paths.keys() + self.img_files = OrderedDict() + self.label_files = OrderedDict() + self.tid_num = OrderedDict() + self.tid_start_index = OrderedDict() + for ds, path in paths.items(): + with open(path, 'r') as file: + self.img_files[ds] = file.readlines() + self.img_files[ds] = [osp.join(root, x.strip()) for x in self.img_files[ds]] + self.img_files[ds] = list(filter(lambda x: len(x) > 0, self.img_files[ds])) + + self.label_files[ds] = [ + x.replace('images', 'labels_with_ids').replace('.png', '.txt').replace('.jpg', '.txt') + for x in self.img_files[ds]] + + for ds, label_paths in self.label_files.items(): + max_index = -1 + for lp in label_paths: + lb = np.loadtxt(lp) + if len(lb) < 1: + continue + if len(lb.shape) < 2: + img_max = lb[1] + else: + img_max = np.max(lb[:, 1]) + if img_max > max_index: + max_index = img_max + self.tid_num[ds] = max_index + 1 + + last_index = 0 + for i, (k, v) in enumerate(self.tid_num.items()): + self.tid_start_index[k] = last_index + last_index += v + + self.nID = int(last_index + 1) + self.nds = [len(x) for x in self.img_files.values()] + self.cds = [sum(self.nds[:i]) for i in range(len(self.nds))] + self.nF = sum(self.nds) + self.width = img_size[0] + self.height = img_size[1] + self.augment = augment + self.transforms = transforms + + print('=' * 80) + print('dataset summary') + print(self.tid_num) + print('total # identities:', self.nID) + print('start index') + print(self.tid_start_index) + print('=' * 80) + + def __getitem__(self, files_index): + + for i, c in enumerate(self.cds): + if files_index >= c: + ds = list(self.label_files.keys())[i] + start_index = c + + img_path = self.img_files[ds][files_index - start_index] + label_path = self.label_files[ds][files_index - start_index] + if os.path.isfile(label_path): + labels0 = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 6) + + imgs, labels, img_path, (h, w) = self.get_data(img_path, label_path) + for i, _ in enumerate(labels): + if labels[i, 1] > -1: + labels[i, 1] += self.tid_start_index[ds] + + return imgs, labels0, img_path, (h, w) + + diff --git a/src/lib/datasets/dataset_factory.py b/src/lib/datasets/dataset_factory.py new file mode 100644 index 0000000..f310d52 --- /dev/null +++ b/src/lib/datasets/dataset_factory.py @@ -0,0 +1,17 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from .dataset.jde import JointDataset + + +def get_dataset(dataset, task): + if task == 'mot': + return JointDataset + else: + return None diff --git a/src/lib/logger.py b/src/lib/logger.py new file mode 100644 index 0000000..93ac561 --- /dev/null +++ b/src/lib/logger.py @@ -0,0 +1,77 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Code referenced from https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514 +import os +import time +import sys +import torch +USE_TENSORBOARD = True +try: + import tensorboardX + print('Using tensorboardX') +except: + USE_TENSORBOARD = False + +class Logger(object): + def __init__(self, opt): + """Create a summary writer logging to log_dir.""" + if not os.path.exists(opt.save_dir): + os.makedirs(opt.save_dir) + if not os.path.exists(opt.debug_dir): + os.makedirs(opt.debug_dir) + + time_str = time.strftime('%Y-%m-%d-%H-%M') + + args = dict((name, getattr(opt, name)) for name in dir(opt) + if not name.startswith('_')) + file_name = os.path.join(opt.save_dir, 'opt.txt') + with open(file_name, 'wt') as opt_file: + opt_file.write('==> torch version: {}\n'.format(torch.__version__)) + opt_file.write('==> cudnn version: {}\n'.format( + torch.backends.cudnn.version())) + opt_file.write('==> Cmd:\n') + opt_file.write(str(sys.argv)) + opt_file.write('\n==> Opt:\n') + for k, v in sorted(args.items()): + opt_file.write(' %s: %s\n' % (str(k), str(v))) + + log_dir = opt.save_dir + '/logs_{}'.format(time_str) + if USE_TENSORBOARD: + self.writer = tensorboardX.SummaryWriter(log_dir=log_dir) + else: + if not os.path.exists(os.path.dirname(log_dir)): + os.mkdir(os.path.dirname(log_dir)) + if not os.path.exists(log_dir): + os.mkdir(log_dir) + self.log = open(log_dir + '/log.txt', 'w') + try: + os.system('cp {}/opt.txt {}/'.format(opt.save_dir, log_dir)) + except: + pass + self.start_line = True + + def write(self, txt): + if self.start_line: + time_str = time.strftime('%Y-%m-%d-%H-%M') + self.log.write('{}: {}'.format(time_str, txt)) + else: + self.log.write(txt) + self.start_line = False + if '\n' in txt: + self.start_line = True + self.log.flush() + + def close(self): + self.log.close() + + def scalar_summary(self, tag, value, step): + """Log a scalar variable.""" + if USE_TENSORBOARD: + self.writer.add_scalar(tag, value, step) diff --git a/src/lib/models/data_parallel.py b/src/lib/models/data_parallel.py new file mode 100644 index 0000000..c17c951 --- /dev/null +++ b/src/lib/models/data_parallel.py @@ -0,0 +1,133 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +import torch +from torch.nn.modules import Module +from torch.nn.parallel.scatter_gather import gather +from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.parallel_apply import parallel_apply + +from .scatter_gather import scatter_kwargs + + +class _DataParallel(Module): + r"""Implements data parallelism at the module level. + + This container parallelizes the application of the given module by + splitting the input across the specified devices by chunking in the batch + dimension. In the forward pass, the module is replicated on each device, + and each replica handles a portion of the input. During the backwards + pass, gradients from each replica are summed into the original module. + + The batch size should be larger than the number of GPUs used. It should + also be an integer multiple of the number of GPUs so that each chunk is the + same size (so that each GPU processes the same number of samples). 
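+    Unlike the stock ``torch.nn.DataParallel``, this variant also accepts a
+    ``chunk_sizes`` argument so the batch can be split unevenly across the
+    devices (see the ``DataParallel`` helper at the bottom of this file).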
+ + See also: :ref:`cuda-nn-dataparallel-instead` + + Arbitrary positional and keyword inputs are allowed to be passed into + DataParallel EXCEPT Tensors. All variables will be scattered on dim + specified (default 0). Primitive types will be broadcasted, but all + other types will be a shallow copy and can be corrupted if written to in + the model's forward pass. + + Args: + module: module to be parallelized + device_ids: CUDA devices (default: all devices) + output_device: device location of output (default: device_ids[0]) + + Example:: + + >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2]) + >>> output = net(input_var) + """ + + # TODO: update notes/cuda.rst when this class handles 8+ GPUs well + + def __init__(self, module, device_ids=None, output_device=None, dim=0, chunk_sizes=None): + super(_DataParallel, self).__init__() + + if not torch.cuda.is_available(): + self.module = module + self.device_ids = [] + return + + if device_ids is None: + device_ids = list(range(torch.cuda.device_count())) + if output_device is None: + output_device = device_ids[0] + self.dim = dim + self.module = module + self.device_ids = device_ids + self.chunk_sizes = chunk_sizes + self.output_device = output_device + if len(self.device_ids) == 1: + self.module.cuda(device_ids[0]) + + def forward(self, *inputs, **kwargs): + if not self.device_ids: + return self.module(*inputs, **kwargs) + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids, self.chunk_sizes) + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) + outputs = self.parallel_apply(replicas, inputs, kwargs) + return self.gather(outputs, self.output_device) + + def replicate(self, module, device_ids): + return replicate(module, device_ids) + + def scatter(self, inputs, kwargs, device_ids, chunk_sizes): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim, chunk_sizes=self.chunk_sizes) + + def parallel_apply(self, replicas, inputs, kwargs): + return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) + + def gather(self, outputs, output_device): + return gather(outputs, output_device, dim=self.dim) + + +def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None): + r"""Evaluates module(input) in parallel across the GPUs given in device_ids. + + This is the functional version of the DataParallel module. + + Args: + module: the module to evaluate in parallel + inputs: inputs to the module + device_ids: GPU ids on which to replicate module + output_device: GPU location of the output Use -1 to indicate the CPU. 
+ (default: device_ids[0]) + Returns: + a Variable containing the result of module(input) located on + output_device + """ + if not isinstance(inputs, tuple): + inputs = (inputs,) + + if device_ids is None: + device_ids = list(range(torch.cuda.device_count())) + + if output_device is None: + output_device = device_ids[0] + + inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim) + if len(device_ids) == 1: + return module(*inputs[0], **module_kwargs[0]) + used_device_ids = device_ids[:len(inputs)] + replicas = replicate(module, used_device_ids) + outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids) + return gather(outputs, output_device, dim) + +def DataParallel(module, device_ids=None, output_device=None, dim=0, chunk_sizes=None): + if chunk_sizes is None: + return torch.nn.DataParallel(module, device_ids, output_device, dim) + standard_size = True + for i in range(1, len(chunk_sizes)): + if chunk_sizes[i] != chunk_sizes[0]: + standard_size = False + if standard_size: + return torch.nn.DataParallel(module, device_ids, output_device, dim) + return _DataParallel(module, device_ids, output_device, dim, chunk_sizes) \ No newline at end of file diff --git a/src/lib/models/decode.py b/src/lib/models/decode.py new file mode 100644 index 0000000..cff47f3 --- /dev/null +++ b/src/lib/models/decode.py @@ -0,0 +1,84 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +from .utils import _gather_feat, _tranpose_and_gather_feat + +def _nms(heat, kernel=3): + pad = (kernel - 1) // 2 + + hmax = nn.functional.max_pool2d( + heat, (kernel, kernel), stride=1, padding=pad) + keep = (hmax == heat).float() + return heat * keep + + +def _topk_channel(scores, K=40): + batch, cat, height, width = scores.size() + + topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) + + topk_inds = topk_inds % (height * width) + topk_ys = (topk_inds / width).int().float() + topk_xs = (topk_inds % width).int().float() + + return topk_scores, topk_inds, topk_ys, topk_xs + +def _topk(scores, K=40): + batch, cat, height, width = scores.size() + + topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) + + topk_inds = topk_inds % (height * width) + topk_ys = (topk_inds / width).int().float() + topk_xs = (topk_inds % width).int().float() + + topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) + topk_clses = (topk_ind / K).int() + topk_inds = _gather_feat( + topk_inds.view(batch, -1, 1), topk_ind).view(batch, K) + topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) + topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) + + return topk_score, topk_inds, topk_clses, topk_ys, topk_xs + + +def mot_decode(heat, wh, reg=None, cat_spec_wh=False, K=100): + batch, cat, height, width = heat.size() + + # heat = torch.sigmoid(heat) + # perform nms on heatmaps + heat = _nms(heat) + + scores, inds, clses, ys, xs = _topk(heat, K=K) + if reg is not None: + reg = _tranpose_and_gather_feat(reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = 
ys.view(batch, K, 1) + 0.5 + wh = _tranpose_and_gather_feat(wh, inds) + if cat_spec_wh: + wh = wh.view(batch, K, cat, 2) + clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 2).long() + wh = wh.gather(2, clses_ind).view(batch, K, 2) + else: + wh = wh.view(batch, K, 2) + clses = clses.view(batch, K, 1).float() + scores = scores.view(batch, K, 1) + bboxes = torch.cat([xs - wh[..., 0:1] / 2, + ys - wh[..., 1:2] / 2, + xs + wh[..., 0:1] / 2, + ys + wh[..., 1:2] / 2], dim=2) + detections = torch.cat([bboxes, scores, clses], dim=2) + + return detections, inds diff --git a/src/lib/models/losses.py b/src/lib/models/losses.py new file mode 100644 index 0000000..d0de9e4 --- /dev/null +++ b/src/lib/models/losses.py @@ -0,0 +1,281 @@ +# ------------------------------------------------------------------------------ +# Portions of this code are from +# CornerNet (https://github.com/princeton-vl/CornerNet) +# Copyright (c) 2018, University of Michigan +# Licensed under the BSD 3-Clause License +# ------------------------------------------------------------------------------ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +from .utils import _tranpose_and_gather_feat +import torch.nn.functional as F + + +def _slow_neg_loss(pred, gt): + '''focal loss from CornerNet''' + pos_inds = gt.eq(1) + neg_inds = gt.lt(1) + + neg_weights = torch.pow(1 - gt[neg_inds], 4) + + loss = 0 + pos_pred = pred[pos_inds] + neg_pred = pred[neg_inds] + + pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, 2) + neg_loss = torch.log(1 - neg_pred) * torch.pow(neg_pred, 2) * neg_weights + + num_pos = pos_inds.float().sum() + pos_loss = pos_loss.sum() + neg_loss = neg_loss.sum() + + if pos_pred.nelement() == 0: + loss = loss - neg_loss + else: + loss = loss - (pos_loss + neg_loss) / num_pos + return loss + + +def _neg_loss(pred, gt): + ''' Modified focal loss. Exactly the same as CornerNet. 
+ Runs faster and costs a little bit more memory + Arguments: + pred (batch x c x h x w) + gt_regr (batch x c x h x w) + ''' + pos_inds = gt.eq(1).float() + neg_inds = gt.lt(1).float() + + neg_weights = torch.pow(1 - gt, 4) + + loss = 0 + + pos_loss = torch.log(pred) * torch.pow(1 - pred, 2) * pos_inds + neg_loss = torch.log(1 - pred) * torch.pow(pred, 2) * neg_weights * neg_inds + + num_pos = pos_inds.float().sum() + pos_loss = pos_loss.sum() + neg_loss = neg_loss.sum() + + if num_pos == 0: + loss = loss - neg_loss + else: + loss = loss - (pos_loss + neg_loss) / num_pos + return loss + +def _not_faster_neg_loss(pred, gt): + pos_inds = gt.eq(1).float() + neg_inds = gt.lt(1).float() + num_pos = pos_inds.float().sum() + neg_weights = torch.pow(1 - gt, 4) + + loss = 0 + trans_pred = pred * neg_inds + (1 - pred) * pos_inds + weight = neg_weights * neg_inds + pos_inds + all_loss = torch.log(1 - trans_pred) * torch.pow(trans_pred, 2) * weight + all_loss = all_loss.sum() + + if num_pos > 0: + all_loss /= num_pos + loss -= all_loss + return loss + +def _slow_reg_loss(regr, gt_regr, mask): + num = mask.float().sum() + mask = mask.unsqueeze(2).expand_as(gt_regr) + + regr = regr[mask] + gt_regr = gt_regr[mask] + + regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, size_average=False) + regr_loss = regr_loss / (num + 1e-4) + return regr_loss + +def _reg_loss(regr, gt_regr, mask): + ''' L1 regression loss + Arguments: + regr (batch x max_objects x dim) + gt_regr (batch x max_objects x dim) + mask (batch x max_objects) + ''' + num = mask.float().sum() + mask = mask.unsqueeze(2).expand_as(gt_regr).float() + + regr = regr * mask + gt_regr = gt_regr * mask + + regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, size_average=False) + regr_loss = regr_loss / (num + 1e-4) + return regr_loss + +class FocalLoss(nn.Module): + '''nn.Module warpper for focal loss''' + def __init__(self): + super(FocalLoss, self).__init__() + self.neg_loss = _neg_loss + + def forward(self, out, target): + return self.neg_loss(out, target) + +class RegLoss(nn.Module): + '''Regression loss for an output tensor + Arguments: + output (batch x dim x h x w) + mask (batch x max_objects) + ind (batch x max_objects) + target (batch x max_objects x dim) + ''' + def __init__(self): + super(RegLoss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + loss = _reg_loss(pred, target, mask) + return loss + +class RegL1Loss(nn.Module): + def __init__(self): + super(RegL1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.unsqueeze(2).expand_as(pred).float() + # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (mask.sum() + 1e-4) + return loss + +class NormRegL1Loss(nn.Module): + def __init__(self): + super(NormRegL1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.unsqueeze(2).expand_as(pred).float() + # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + pred = pred / (target + 1e-4) + target = target * 0 + 1 + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (mask.sum() + 1e-4) + return loss + +class RegWeightedL1Loss(nn.Module): + def __init__(self): + super(RegWeightedL1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + 
pred = _tranpose_and_gather_feat(output, ind) + mask = mask.float() + # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + loss = F.l1_loss(pred * mask, target * mask, size_average=False) + loss = loss / (mask.sum() + 1e-4) + return loss + +class L1Loss(nn.Module): + def __init__(self): + super(L1Loss, self).__init__() + + def forward(self, output, mask, ind, target): + pred = _tranpose_and_gather_feat(output, ind) + mask = mask.unsqueeze(2).expand_as(pred).float() + loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean') + return loss + +class BinRotLoss(nn.Module): + def __init__(self): + super(BinRotLoss, self).__init__() + + def forward(self, output, mask, ind, rotbin, rotres): + pred = _tranpose_and_gather_feat(output, ind) + loss = compute_rot_loss(pred, rotbin, rotres, mask) + return loss + +def compute_res_loss(output, target): + return F.smooth_l1_loss(output, target, reduction='elementwise_mean') + +# TODO: weight +def compute_bin_loss(output, target, mask): + mask = mask.expand_as(output) + output = output * mask.float() + return F.cross_entropy(output, target, reduction='elementwise_mean') + +def compute_rot_loss(output, target_bin, target_res, mask): + # output: (B, 128, 8) [bin1_cls[0], bin1_cls[1], bin1_sin, bin1_cos, + # bin2_cls[0], bin2_cls[1], bin2_sin, bin2_cos] + # target_bin: (B, 128, 2) [bin1_cls, bin2_cls] + # target_res: (B, 128, 2) [bin1_res, bin2_res] + # mask: (B, 128, 1) + # import pdb; pdb.set_trace() + output = output.view(-1, 8) + target_bin = target_bin.view(-1, 2) + target_res = target_res.view(-1, 2) + mask = mask.view(-1, 1) + loss_bin1 = compute_bin_loss(output[:, 0:2], target_bin[:, 0], mask) + loss_bin2 = compute_bin_loss(output[:, 4:6], target_bin[:, 1], mask) + loss_res = torch.zeros_like(loss_bin1) + if target_bin[:, 0].nonzero().shape[0] > 0: + idx1 = target_bin[:, 0].nonzero()[:, 0] + valid_output1 = torch.index_select(output, 0, idx1.long()) + valid_target_res1 = torch.index_select(target_res, 0, idx1.long()) + loss_sin1 = compute_res_loss( + valid_output1[:, 2], torch.sin(valid_target_res1[:, 0])) + loss_cos1 = compute_res_loss( + valid_output1[:, 3], torch.cos(valid_target_res1[:, 0])) + loss_res += loss_sin1 + loss_cos1 + if target_bin[:, 1].nonzero().shape[0] > 0: + idx2 = target_bin[:, 1].nonzero()[:, 0] + valid_output2 = torch.index_select(output, 0, idx2.long()) + valid_target_res2 = torch.index_select(target_res, 0, idx2.long()) + loss_sin2 = compute_res_loss( + valid_output2[:, 6], torch.sin(valid_target_res2[:, 1])) + loss_cos2 = compute_res_loss( + valid_output2[:, 7], torch.cos(valid_target_res2[:, 1])) + loss_res += loss_sin2 + loss_cos2 + return loss_bin1 + loss_bin2 + loss_res + + +class TripletLoss(nn.Module): + """Triplet loss with hard positive/negative mining. + Reference: + Hermans et al. In Defense of the Triplet Loss for Person Re-Identification. arXiv:1703.07737. + Code imported from https://github.com/Cysu/open-reid/blob/master/reid/loss/triplet.py. + Args: + margin (float): margin for triplet. + """ + + def __init__(self, margin=0.3, mutual_flag=False): + super(TripletLoss, self).__init__() + self.margin = margin + self.ranking_loss = nn.MarginRankingLoss(margin=margin) + self.mutual = mutual_flag + + def forward(self, inputs, targets): + """ + Args: + inputs: feature matrix with shape (batch_size, feat_dim) + targets: ground truth labels with shape (num_classes) + """ + n = inputs.size(0) + # inputs = 1. 
* inputs / (torch.norm(inputs, 2, dim=-1, keepdim=True).expand_as(inputs) + 1e-12) + # Compute pairwise distance, replace by the official when merged + dist = torch.pow(inputs, 2).sum(dim=1, keepdim=True).expand(n, n) + dist = dist + dist.t() + dist.addmm_(1, -2, inputs, inputs.t()) + dist = dist.clamp(min=1e-12).sqrt() # for numerical stability + # For each anchor, find the hardest positive and negative + mask = targets.expand(n, n).eq(targets.expand(n, n).t()) + dist_ap, dist_an = [], [] + for i in range(n): + dist_ap.append(dist[i][mask[i]].max().unsqueeze(0)) + dist_an.append(dist[i][mask[i] == 0].min().unsqueeze(0)) + dist_ap = torch.cat(dist_ap) + dist_an = torch.cat(dist_an) + # Compute ranking hinge loss + y = torch.ones_like(dist_an) + loss = self.ranking_loss(dist_an, dist_ap, y) + if self.mutual: + return loss, dist + return loss diff --git a/src/lib/models/model.py b/src/lib/models/model.py new file mode 100644 index 0000000..2a81920 --- /dev/null +++ b/src/lib/models/model.py @@ -0,0 +1,98 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch + +from .networks.dlav0 import get_pose_net as get_dlav0 +from .networks.pose_dla_dcn import get_pose_net as get_dla_dcn +from .networks.pose_hrnet import get_pose_net as get_pose_net_hrnet +from .networks.resnet_dcn import get_pose_net as get_pose_net_dcn +from .networks.resnet_fpn_dcn import get_pose_net as get_pose_net_fpn_dcn + +_model_factory = { + 'dlav0': get_dlav0, # default DLAup + 'dla': get_dla_dcn, + 'resdcn': get_pose_net_dcn, + 'resfpndcn': get_pose_net_fpn_dcn, + 'hrnet': get_pose_net_hrnet +} + +def create_model(arch, heads, head_conv): + num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0 + arch = arch[:arch.find('_')] if '_' in arch else arch + get_model = _model_factory[arch] + model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv) + return model + +def load_model(model, model_path, optimizer=None, resume=False, + lr=None, lr_step=None): + start_epoch = 0 + checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage) + print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch'])) + state_dict_ = checkpoint['state_dict'] + state_dict = {} + + # convert data_parallal to model + for k in state_dict_: + if k.startswith('module') and not k.startswith('module_list'): + state_dict[k[7:]] = state_dict_[k] + else: + state_dict[k] = state_dict_[k] + model_state_dict = model.state_dict() + + # check loaded parameters and created model parameters + msg = 'If you see this, your model does not fully load the ' + \ + 'pre-trained weight. Please make sure ' + \ + 'you have correctly specified --arch xxx ' + \ + 'or set the correct --num_classes for your own dataset.' + for k in state_dict: + if k in model_state_dict: + if state_dict[k].shape != model_state_dict[k].shape: + print('Skip loading parameter {}, required shape{}, '\ + 'loaded shape{}. 
{}'.format( + k, model_state_dict[k].shape, state_dict[k].shape, msg)) + state_dict[k] = model_state_dict[k] + else: + print('Drop parameter {}.'.format(k) + msg) + for k in model_state_dict: + if not (k in state_dict): + print('No param {}.'.format(k) + msg) + state_dict[k] = model_state_dict[k] + model.load_state_dict(state_dict, strict=False) + + # resume optimizer parameters + if optimizer is not None and resume: + if 'optimizer' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + start_epoch = checkpoint['epoch'] + start_lr = lr + for step in lr_step: + if start_epoch >= step: + start_lr *= 0.1 + for param_group in optimizer.param_groups: + param_group['lr'] = start_lr + print('Resumed optimizer with start lr', start_lr) + else: + print('No optimizer parameters in checkpoint.') + if optimizer is not None: + return model, optimizer, start_epoch + else: + return model + +def save_model(path, epoch, model, optimizer=None): + if isinstance(model, torch.nn.DataParallel): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + data = {'epoch': epoch, + 'state_dict': state_dict} + if not (optimizer is None): + data['optimizer'] = optimizer.state_dict() + torch.save(data, path) + diff --git a/src/lib/models/networks/DCNv2/README.md b/src/lib/models/networks/DCNv2/README.md new file mode 100644 index 0000000..9787cfa --- /dev/null +++ b/src/lib/models/networks/DCNv2/README.md @@ -0,0 +1,65 @@ +## Deformable Convolutional Networks V2 with Pytorch 1.0 + +### Build +```bash + ./make.sh # build + python test.py # run examples and gradient check +``` + +### An Example +- deformable conv +```python + from dcn_v2 import DCN + input = torch.randn(2, 64, 128, 128).cuda() + # wrap all things (offset and mask) in DCN + dcn = DCN(64, 64, kernel_size=(3,3), stride=1, padding=1, deformable_groups=2).cuda() + output = dcn(input) + print(output.shape) +``` +- deformable roi pooling +```python + from dcn_v2 import DCNPooling + input = torch.randn(2, 32, 64, 64).cuda() + batch_inds = torch.randint(2, (20, 1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + + # mdformable pooling (V2) + # wrap all things (offset and mask) in DCNPooling + dpooling = DCNPooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1).cuda() + + dout = dpooling(input, rois) +``` +### Note +Now the master branch is for pytorch 1.0 (new ATen API), you can switch back to pytorch 0.4 with, +```bash +git checkout pytorch_0.4 +``` + +### Known Issues: + +- [x] Gradient check w.r.t offset (solved) +- [ ] Backward is not reentrant (minor) + +This is an adaption of the official [Deformable-ConvNets](https://github.com/msracver/Deformable-ConvNets/tree/master/DCNv2_op). + +I have ran the gradient check for many times with DOUBLE type. Every tensor **except offset** passes. +However, when I set the offset to 0.5, it passes. I'm still wondering what cause this problem. Is it because some +non-differential points? + +Update: all gradient check passes with double precision. + +Another issue is that it raises `RuntimeError: Backward is not reentrant`. However, the error is very small (`<1e-7` for +float `<1e-15` for double), +so it may not be a serious problem (?) + +Please post an issue or PR if you have any comments. 
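+For reference, a double-precision gradient check along the lines described above might look roughly like this (a sketch, assuming the extension has been built with `./make.sh` and a CUDA device is available):
+```python
+import torch
+from dcn_v2 import DCN
+
+# gradcheck is only reliable in double precision; keep the tensors small
+dcn = DCN(4, 4, kernel_size=(3, 3), stride=1, padding=1,
+          deformable_groups=1).cuda().double()
+x = torch.randn(1, 4, 8, 8, dtype=torch.double, device='cuda',
+                requires_grad=True)
+print(torch.autograd.gradcheck(dcn, (x,), eps=1e-6, atol=1e-4))
+```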
+ \ No newline at end of file diff --git a/src/lib/models/networks/DCNv2/_ext.cpython-37m-x86_64-linux-gnu.so b/src/lib/models/networks/DCNv2/_ext.cpython-37m-x86_64-linux-gnu.so new file mode 100644 index 0000000..ead60ec Binary files /dev/null and b/src/lib/models/networks/DCNv2/_ext.cpython-37m-x86_64-linux-gnu.so differ diff --git a/src/lib/models/networks/DCNv2/dcn_v2.py b/src/lib/models/networks/DCNv2/dcn_v2.py new file mode 100644 index 0000000..982bef5 --- /dev/null +++ b/src/lib/models/networks/DCNv2/dcn_v2.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import math +import torch +from torch import nn +from torch.autograd import Function +from torch.nn.modules.utils import _pair +from torch.autograd.function import once_differentiable + +import _ext as _backend + + +class _DCNv2(Function): + @staticmethod + def forward(ctx, input, offset, mask, weight, bias, + stride, padding, dilation, deformable_groups): + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.kernel_size = _pair(weight.shape[2:4]) + ctx.deformable_groups = deformable_groups + output = _backend.dcn_v2_forward(input, weight, bias, + offset, mask, + ctx.kernel_size[0], ctx.kernel_size[1], + ctx.stride[0], ctx.stride[1], + ctx.padding[0], ctx.padding[1], + ctx.dilation[0], ctx.dilation[1], + ctx.deformable_groups) + ctx.save_for_backward(input, offset, mask, weight, bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input, grad_offset, grad_mask, grad_weight, grad_bias = \ + _backend.dcn_v2_backward(input, weight, + bias, + offset, mask, + grad_output, + ctx.kernel_size[0], ctx.kernel_size[1], + ctx.stride[0], ctx.stride[1], + ctx.padding[0], ctx.padding[1], + ctx.dilation[0], ctx.dilation[1], + ctx.deformable_groups) + + return grad_input, grad_offset, grad_mask, grad_weight, grad_bias,\ + None, None, None, None, + + +dcn_v2_conv = _DCNv2.apply + + +class DCNv2(nn.Module): + + def __init__(self, in_channels, out_channels, + kernel_size, stride, padding, dilation=1, deformable_groups=1): + super(DCNv2, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter(torch.Tensor( + out_channels, in_channels, *self.kernel_size)) + self.bias = nn.Parameter(torch.Tensor(out_channels)) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + self.bias.data.zero_() + + def forward(self, input, offset, mask): + assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ + offset.shape[1] + assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ + mask.shape[1] + return dcn_v2_conv(input, offset, mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups) + + +class DCN(DCNv2): + + def __init__(self, in_channels, out_channels, + kernel_size, stride, padding, + dilation=1, deformable_groups=1): + super(DCN, self).__init__(in_channels, out_channels, + kernel_size, stride, padding, dilation, deformable_groups) + + channels_ = self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1] + self.conv_offset_mask = nn.Conv2d(self.in_channels, + channels_, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + def forward(self, input): + out = self.conv_offset_mask(input) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return dcn_v2_conv(input, offset, mask, + self.weight, self.bias, + self.stride, + self.padding, + self.dilation, + self.deformable_groups) + + + +class _DCNv2Pooling(Function): + @staticmethod + def forward(ctx, input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + ctx.spatial_scale = spatial_scale + ctx.no_trans = int(no_trans) + ctx.output_dim = output_dim + ctx.group_size = group_size + ctx.pooled_size = pooled_size + ctx.part_size = pooled_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + output, output_count = \ + _backend.dcn_v2_psroi_pooling_forward(input, rois, offset, + ctx.no_trans, ctx.spatial_scale, + ctx.output_dim, ctx.group_size, + ctx.pooled_size, ctx.part_size, + ctx.sample_per_part, ctx.trans_std) + ctx.save_for_backward(input, rois, offset, output_count) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, offset, output_count = ctx.saved_tensors + grad_input, grad_offset = \ + _backend.dcn_v2_psroi_pooling_backward(grad_output, + input, + rois, + offset, + output_count, + ctx.no_trans, + ctx.spatial_scale, + ctx.output_dim, + ctx.group_size, + ctx.pooled_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std) + + return grad_input, None, grad_offset, \ + None, None, None, None, None, None, None, None + + +dcn_v2_pooling = _DCNv2Pooling.apply + + +class DCNv2Pooling(nn.Module): + + def __init__(self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + super(DCNv2Pooling, self).__init__() + self.spatial_scale = spatial_scale + self.pooled_size = pooled_size + self.output_dim = output_dim + self.no_trans = no_trans + self.group_size = group_size + self.part_size = pooled_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, input, rois, offset): + assert input.shape[1] == self.output_dim + if self.no_trans: + offset = input.new() + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + 
self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) + + +class DCNPooling(DCNv2Pooling): + + def __init__(self, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + deform_fc_dim=1024): + super(DCNPooling, self).__init__(spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std) + + self.deform_fc_dim = deform_fc_dim + + if not no_trans: + self.offset_mask_fc = nn.Sequential( + nn.Linear(self.pooled_size * self.pooled_size * + self.output_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.deform_fc_dim), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_dim, self.pooled_size * + self.pooled_size * 3) + ) + self.offset_mask_fc[4].weight.data.zero_() + self.offset_mask_fc[4].bias.data.zero_() + + def forward(self, input, rois): + offset = input.new() + + if not self.no_trans: + + # do roi_align first + n = rois.shape[0] + roi = dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + True, # no trans + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) + + # build mask and offset + offset_mask = self.offset_mask_fc(roi.view(n, -1)) + offset_mask = offset_mask.view( + n, 3, self.pooled_size, self.pooled_size) + o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + + # do pooling with offset and mask + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) * mask + # only roi_align + return dcn_v2_pooling(input, rois, offset, + self.spatial_scale, + self.pooled_size, + self.output_dim, + self.no_trans, + self.group_size, + self.part_size, + self.sample_per_part, + self.trans_std) diff --git a/src/lib/models/networks/DCNv2/make.sh b/src/lib/models/networks/DCNv2/make.sh new file mode 100644 index 0000000..f1f15c0 --- /dev/null +++ b/src/lib/models/networks/DCNv2/make.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +python setup.py build develop diff --git a/src/lib/models/networks/DCNv2/setup.py b/src/lib/models/networks/DCNv2/setup.py new file mode 100644 index 0000000..571b536 --- /dev/null +++ b/src/lib/models/networks/DCNv2/setup.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python + +import os +import glob + +import torch + +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +from setuptools import find_packages +from setuptools import setup + +requirements = ["torch", "torchvision"] + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "src") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + sources = main_file + source_cpu + extension = CppExtension + extra_compile_args = {"cxx": []} + define_macros = [] + + if torch.cuda.is_available() and CUDA_HOME is not None: + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + 
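            # The -D__CUDA_NO_HALF* macros keep CUDA's __half/__half2 operator
            # overloads out of the build so they do not clash with ATen's Half type
            # when nvcc compiles the extension sources.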
"-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + raise NotImplementedError('Cuda is not available') + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + ext_modules = [ + extension( + "_ext", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + return ext_modules + +setup( + name="DCNv2", + version="0.1", + author="charlesshang", + url="https://github.com/charlesshang/DCNv2", + description="deformable convolutional networks", + packages=find_packages(exclude=("configs", "tests",)), + # install_requires=requirements, + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp b/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp new file mode 100644 index 0000000..a68ccef --- /dev/null +++ b/src/lib/models/networks/DCNv2/src/cpu/dcn_v2_cpu.cpp @@ -0,0 +1,74 @@ +#include + +#include +#include + + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + AT_ERROR("Not implement on cpu"); +} + +std::vector +dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + AT_ERROR("Not implement on cpu"); +} + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ERROR("Not implement on cpu"); +} + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ERROR("Not implement on cpu"); +} \ No newline at end of file diff --git a/src/lib/models/networks/DCNv2/src/cpu/vision.h b/src/lib/models/networks/DCNv2/src/cpu/vision.h new file mode 100644 index 0000000..d5fbf1f --- /dev/null +++ b/src/lib/models/networks/DCNv2/src/cpu/vision.h @@ -0,0 +1,60 @@ +#pragma once +#include + +at::Tensor +dcn_v2_cpu_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cpu_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + 
int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); \ No newline at end of file diff --git a/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu b/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu new file mode 100644 index 0000000..d33cc0f --- /dev/null +++ b/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_cuda.cu @@ -0,0 +1,238 @@ +#include +#include "cuda/dcn_v2_im2col_cuda.h" + +#include +#include + +#include +#include +#include + +extern THCState *state; + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group) +{ + // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); + // printf("Channels: %d %d\n", channels, channels_kernel); + // printf("Channels: %d %d\n", channels_out, channels_kernel); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + using scalar_t = float; + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto 
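        // For each sample b: (1) the first GEMM against a vector of ones writes the bias
        // into output_n, (2) modulated_deformable_im2col_cuda gathers the mask-weighted,
        // bilinearly sampled input values into `columns`, and (3) the second GEMM multiplies
        // the weights with `columns` and accumulates the result into output_n.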
offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto output_n = output.select(0, b); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + // (N x 1) (1 x M) + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, + ones.contiguous().data(), k_, + bias.contiguous().data(), k_, 0.0f, + output_n.data(), n_); + + modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + deformable_group, + columns.data()); + + //(k * m) x (m * n) + // Y = WC + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h * kernel_w; + THCudaBlas_Sgemm(state, 'n', 'n', n, m, k, 1.0f, + columns.data(), n, + weight.data(), k, 1.0f, + output_n.data(), n); + } + return output; +} + +std::vector dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + + THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); + THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); + + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); + AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); + AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); + AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + + AT_ASSERTM(channels == channels_kernel, + "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + auto ones = at::ones({height_out, width_out}, input.options()); + auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); + auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); + + auto grad_input = at::zeros_like(input); + auto grad_weight = at::zeros_like(weight); + auto grad_bias = at::zeros_like(bias); + auto grad_offset = at::zeros_like(offset); + auto grad_mask = at::zeros_like(mask); + + using scalar_t = float; + + for (int b = 0; b < batch; b++) + { + auto input_n = input.select(0, b); + auto offset_n = offset.select(0, b); + auto mask_n = mask.select(0, b); + auto grad_output_n = grad_output.select(0, b); + auto grad_input_n = grad_input.select(0, b); + auto grad_offset_n = grad_offset.select(0, b); + auto 
grad_mask_n = grad_mask.select(0, b); + + long m = channels * kernel_h * kernel_w; + long n = height_out * width_out; + long k = channels_out; + + THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, + grad_output_n.data(), n, + weight.data(), m, 0.0f, + columns.data(), n); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state), + columns.data(), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_offset_n.data(), + grad_mask_n.data()); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda(THCState_getCurrentStream(state), + columns.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + grad_input_n.data()); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and group + modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), + input_n.data(), + offset_n.data(), + mask_n.data(), + 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, + columns.data()); + + long m_ = channels_out; + long n_ = channels * kernel_h * kernel_w; + long k_ = height_out * width_out; + + THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, + columns.data(), k_, + grad_output_n.data(), k_, 1.0f, + grad_weight.data(), n_); + + // gradient w.r.t. bias + // long m_ = channels_out; + // long k__ = height_out * width_out; + THCudaBlas_Sgemv(state, + 't', + k_, m_, 1.0f, + grad_output_n.data(), k_, + ones.data(), 1, 1.0f, + grad_bias.data(), 1); + } + + return { + grad_input, grad_offset, grad_mask, grad_weight, grad_bias + }; +} \ No newline at end of file diff --git a/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu b/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu new file mode 100644 index 0000000..06f6028 --- /dev/null +++ b/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu @@ -0,0 +1,394 @@ +#include "dcn_v2_im2col_cuda.h" +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + + +__device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +__device__ float 
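// dmcn_get_gradient_weight returns the bilinear-interpolation weight that the integer
// grid point (h, w) contributes at the fractional sampling location (argmax_h, argmax_w);
// the col2im kernel below uses it to scatter column gradients back onto grad_im via atomicAdd.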
dmcn_get_gradient_weight(float argmax_h, float argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +__device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w, + const int height, const int width, const float *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + float weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = 
c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float val = static_cast(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + //data_col_ptr += height_col * width_col; + } + } + } +} + +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const float *data_col, const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + float *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * 
height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + const float cur_inv_h_data = h_in + i * dilation_h + offset_h; + const float cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const float cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + float weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const float *data_col, const float *data_im, + const float *data_offset, const float *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + float *grad_offset, float *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + float val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * 
width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const float offset_h = data_offset_ptr[data_offset_h_ptr]; + const float offset_w = data_offset_ptr[data_offset_w_ptr]; + const float mask = data_mask_ptr[data_mask_hw_ptr]; + float inv_h = h_in + i * dilation_h + offset_h; + float inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const float weight = dmcn_get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + modulated_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kenerl_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float* data_col, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* grad_im){ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + modulated_deformable_col2im_gpu_kernel + <<>>( + num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error 
in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float* grad_offset, float* grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + modulated_deformable_col2im_coord_gpu_kernel + <<>>( + num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset, grad_mask); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.h b/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.h new file mode 100644 index 0000000..c856831 --- /dev/null +++ b/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_im2col_cuda.h @@ -0,0 +1,101 @@ + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1811.11168 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + */ + +/***************** Adapted by Charles Shang *********************/ + +#ifndef DCN_V2_IM2COL_CUDA +#define DCN_V2_IM2COL_CUDA + +#ifdef __cplusplus +extern "C" +{ +#endif + + void modulated_deformable_im2col_cuda(cudaStream_t stream, + const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *data_col); + + void modulated_deformable_col2im_cuda(cudaStream_t stream, + const float *data_col, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float *grad_im); + + void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, + const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + float *grad_offset, float *grad_mask); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu b/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu new file mode 100644 index 0000000..07b438e --- /dev/null +++ b/src/lib/models/networks/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu @@ -0,0 +1,419 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ T bilinear_interp( + const T *data, + const T x, + const T y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, + const T *bottom_data, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + T *top_data, + T *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const T *top_diff, + const T *top_count, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, + const T *bottom_rois, + const T *bottom_trans, + const int no_trans, + const T trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; + T trans_y = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; + + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 
2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + + auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); + + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (out.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + + AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { + DeformablePSROIPoolForwardKernel<<>>( + out_size, + input.contiguous().data(), + spatial_scale, + channels, + height, width, + pooled_height, + pooled_width, + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + output_dim, + group_size, + part_size, + num_classes, + channels_each_class, + out.data(), + top_count.data()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(out, top_count); +} + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); + AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); + AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + const int num_bbox = bbox.size(0); + + AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); + auto pooled_height = pooled_size; + auto pooled_width = pooled_size; + long out_size = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); + auto trans_grad = at::zeros_like(trans); + + if (input_grad.numel() == 0) + { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); + } + + dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); + dim3 block(512); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { + DeformablePSROIPoolBackwardAccKernel<<>>( + out_size, + out_grad.contiguous().data(), + top_count.contiguous().data(), + num_bbox, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + output_dim, + input_grad.contiguous().data(), + trans_grad.contiguous().data(), + input.contiguous().data(), + bbox.contiguous().data(), + trans.contiguous().data(), + no_trans, + trans_std, + sample_per_part, + group_size, + part_size, + num_classes, + channels_each_class); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(input_grad, trans_grad); +} \ No newline at end of file diff --git a/src/lib/models/networks/DCNv2/src/cuda/vision.h b/src/lib/models/networks/DCNv2/src/cuda/vision.h new file mode 100644 index 0000000..e42a2a7 --- /dev/null +++ b/src/lib/models/networks/DCNv2/src/cuda/vision.h @@ -0,0 +1,60 @@ +#pragma once +#include + +at::Tensor +dcn_v2_cuda_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int deformable_group); + +std::vector +dcn_v2_cuda_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group); + + +std::tuple +dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); + +std::tuple +dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std); \ No newline at end of file diff --git a/src/lib/models/networks/DCNv2/src/dcn_v2.h b/src/lib/models/networks/DCNv2/src/dcn_v2.h new file mode 100644 index 0000000..23f5caf --- /dev/null +++ b/src/lib/models/networks/DCNv2/src/dcn_v2.h @@ -0,0 +1,145 @@ +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +at::Tensor +dcn_v2_forward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int 
deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_forward(input, weight, bias, offset, mask, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::vector +dcn_v2_backward(const at::Tensor &input, + const at::Tensor &weight, + const at::Tensor &bias, + const at::Tensor &offset, + const at::Tensor &mask, + const at::Tensor &grad_output, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int pad_h, int pad_w, + int dilation_h, int dilation_w, + int deformable_group) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_cuda_backward(input, + weight, + bias, + offset, + mask, + grad_output, + kernel_h, kernel_w, + stride_h, stride_w, + pad_h, pad_w, + dilation_h, dilation_w, + deformable_group); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::tuple +dcn_v2_psroi_pooling_forward(const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_forward(input, + bbox, + trans, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::tuple +dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, + const at::Tensor &input, + const at::Tensor &bbox, + const at::Tensor &trans, + const at::Tensor &top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) + { +#ifdef WITH_CUDA + return dcn_v2_psroi_pooling_cuda_backward(out_grad, + input, + bbox, + trans, + top_count, + no_trans, + spatial_scale, + output_dim, + group_size, + pooled_size, + part_size, + sample_per_part, + trans_std); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} \ No newline at end of file diff --git a/src/lib/models/networks/DCNv2/src/vision.cpp b/src/lib/models/networks/DCNv2/src/vision.cpp new file mode 100644 index 0000000..ff54233 --- /dev/null +++ b/src/lib/models/networks/DCNv2/src/vision.cpp @@ -0,0 +1,9 @@ + +#include "dcn_v2.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); + m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); + m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); + m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); +} diff --git a/src/lib/models/networks/DCNv2/test.py b/src/lib/models/networks/DCNv2/test.py new file mode 100644 index 0000000..9a77508 --- /dev/null +++ b/src/lib/models/networks/DCNv2/test.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +from dcn_v2 
import dcn_v2_conv, DCNv2, DCN +from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling +from torch.autograd import gradcheck + +deformable_groups = 1 +N, inC, inH, inW = 2, 2, 4, 4 +outC = 2 +kH, kW = 3, 3 + + +def conv_identify(weight, bias): + weight.data.zero_() + bias.data.zero_() + o, i, h, w = weight.shape + y = h//2 + x = w//2 + for p in range(i): + for q in range(o): + if p == q: + weight.data[q, p, y, x] = 1.0 + + +def check_zero_offset(): + conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True).cuda() + + conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, + kernel_size=(kH, kW), + stride=(1, 1), + padding=(1, 1), + bias=True).cuda() + + dcn_v2 = DCNv2(inC, outC, (kH, kW), + stride=1, padding=1, dilation=1, + deformable_groups=deformable_groups).cuda() + + conv_offset.weight.data.zero_() + conv_offset.bias.data.zero_() + conv_mask.weight.data.zero_() + conv_mask.bias.data.zero_() + conv_identify(dcn_v2.weight, dcn_v2.bias) + + input = torch.randn(N, inC, inH, inW).cuda() + offset = conv_offset(input) + mask = conv_mask(input) + mask = torch.sigmoid(mask) + output = dcn_v2(input, offset, mask) + output *= 2 + d = (input - output).abs().max() + if d < 1e-10: + print('Zero offset passed') + else: + print('Zero offset failed') + +def check_gradient_dconv(): + + input = torch.rand(N, inC, inH, inW).cuda() * 0.01 + input.requires_grad = True + + offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda() * 2 + # offset.data.zero_() + # offset.data -= 0.5 + offset.requires_grad = True + + mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW).cuda() + # mask.data.zero_() + mask.requires_grad = True + mask = torch.sigmoid(mask) + + weight = torch.randn(outC, inC, kH, kW).cuda() + weight.requires_grad = True + + bias = torch.rand(outC).cuda() + bias.requires_grad = True + + stride = 1 + padding = 1 + dilation = 1 + + print('check_gradient_dconv: ', + gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, + stride, padding, dilation, deformable_groups), + eps=1e-3, atol=1e-4, rtol=1e-2)) + + +def check_pooling_zero_offset(): + + input = torch.randn(2, 16, 64, 64).cuda().zero_() + input[0, :, 16:26, 16:26] = 1. + input[1, :, 10:20, 20:30] = 2. 
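+    # Descriptive note (added comment): each ROI row is (batch_index, x1, y1, x2, y2) in input-image
+    # coordinates; with spatial_scale = 1/4 the two boxes below map onto the 10x10 regions set to 1. and 2. above.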
+ rois = torch.tensor([ + [0, 65, 65, 103, 103], + [1, 81, 41, 119, 79], + ]).cuda().float() + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=True, + group_size=1, + trans_std=0.0).cuda() + + out = pooling(input, rois, input.new()) + s = ', '.join(['%f' % out[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=16, + no_trans=False, + group_size=1, + trans_std=0.0).cuda() + offset = torch.randn(20, 2, 7, 7).cuda().zero_() + dout = dpooling(input, rois, offset) + s = ', '.join(['%f' % dout[i, :, :, :].mean().item() + for i in range(rois.shape[0])]) + print(s) + + +def check_gradient_dpooling(): + input = torch.randn(2, 3, 5, 5).cuda() * 0.01 + N = 4 + batch_inds = torch.randint(2, (N, 1)).cuda().float() + x = torch.rand((N, 1)).cuda().float() * 15 + y = torch.rand((N, 1)).cuda().float() * 15 + w = torch.rand((N, 1)).cuda().float() * 10 + h = torch.rand((N, 1)).cuda().float() * 10 + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(N, 2, 3, 3).cuda() + input.requires_grad = True + offset.requires_grad = True + + spatial_scale = 1.0 / 4 + pooled_size = 3 + output_dim = 3 + no_trans = 0 + group_size = 1 + trans_std = 0.0 + sample_per_part = 4 + part_size = pooled_size + + print('check_gradient_dpooling:', + gradcheck(dcn_v2_pooling, (input, rois, offset, + spatial_scale, + pooled_size, + output_dim, + no_trans, + group_size, + part_size, + sample_per_part, + trans_std), + eps=1e-4)) + + +def example_dconv(): + input = torch.randn(2, 64, 128, 128).cuda() + # wrap all things (offset and mask) in DCN + dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, + padding=1, deformable_groups=2).cuda() + # print(dcn.weight.shape, input.shape) + output = dcn(input) + targert = output.new(*output.size()) + targert.data.uniform_(-0.01, 0.01) + error = (targert - output).mean() + error.backward() + print(output.shape) + + +def example_dpooling(): + input = torch.randn(2, 32, 64, 64).cuda() + batch_inds = torch.randint(2, (20, 1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + offset = torch.randn(20, 2, 7, 7).cuda() + input.requires_grad = True + offset.requires_grad = True + + # normal roi_align + pooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=True, + group_size=1, + trans_std=0.1).cuda() + + # deformable pooling + dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1).cuda() + + out = pooling(input, rois, offset) + dout = dpooling(input, rois, offset) + print(out.shape) + print(dout.shape) + + target_out = out.new(*out.size()) + target_out.data.uniform_(-0.01, 0.01) + target_dout = dout.new(*dout.size()) + target_dout.data.uniform_(-0.01, 0.01) + e = (target_out - out).mean() + e.backward() + e = (target_dout - dout).mean() + e.backward() + + +def example_mdpooling(): + input = torch.randn(2, 32, 64, 64).cuda() + input.requires_grad = True + batch_inds = torch.randint(2, (20, 1)).cuda().float() + x = torch.randint(256, (20, 1)).cuda().float() + y = torch.randint(256, (20, 1)).cuda().float() + w = torch.randint(64, (20, 1)).cuda().float() + h = torch.randint(64, (20, 1)).cuda().float() + rois = 
torch.cat((batch_inds, x, y, x + w, y + h), dim=1) + + # mdformable pooling (V2) + dpooling = DCNPooling(spatial_scale=1.0 / 4, + pooled_size=7, + output_dim=32, + no_trans=False, + group_size=1, + trans_std=0.1, + deform_fc_dim=1024).cuda() + + dout = dpooling(input, rois) + target = dout.new(*dout.size()) + target.data.uniform_(-0.1, 0.1) + error = (target - dout).mean() + error.backward() + print(dout.shape) + + +if __name__ == '__main__': + + example_dconv() + example_dpooling() + example_mdpooling() + + check_pooling_zero_offset() + # zero offset check + if inC == outC: + check_zero_offset() + + # check_gradient_dpooling() + # check_gradient_dconv() + # """ + # ****** Note: backward is not reentrant error may not be a serious problem, + # ****** since the max error is less than 1e-7, + # ****** Still looking for what trigger this problem + # """ diff --git a/src/lib/models/networks/config/__init__.py b/src/lib/models/networks/config/__init__.py new file mode 100644 index 0000000..c845930 --- /dev/null +++ b/src/lib/models/networks/config/__init__.py @@ -0,0 +1,2 @@ +from .default import _C as cfg +from .default import update_config diff --git a/src/lib/models/networks/config/default.py b/src/lib/models/networks/config/default.py new file mode 100644 index 0000000..7495ae7 --- /dev/null +++ b/src/lib/models/networks/config/default.py @@ -0,0 +1,125 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from yacs.config import CfgNode as CN + +_C = CN() + +_C.OUTPUT_DIR = '' +_C.LOG_DIR = '' +_C.DATA_DIR = '' +_C.GPUS = (0,) +_C.WORKERS = 4 +_C.PRINT_FREQ = 20 +_C.AUTO_RESUME = False +_C.PIN_MEMORY = True +_C.RANK = 0 + +# Cudnn related params +_C.CUDNN = CN() +_C.CUDNN.BENCHMARK = True +_C.CUDNN.DETERMINISTIC = False +_C.CUDNN.ENABLED = True + +# common params for NETWORK +_C.MODEL = CN() +_C.MODEL.NAME = 'pose_hrnet' +_C.MODEL.INIT_WEIGHTS = True +_C.MODEL.PRETRAINED = '' +_C.MODEL.NUM_JOINTS = 17 +_C.MODEL.TAG_PER_JOINT = True +_C.MODEL.TARGET_TYPE = 'gaussian' +_C.MODEL.IMAGE_SIZE = [256, 256] # width * height, ex: 192 * 256 +_C.MODEL.HEATMAP_SIZE = [64, 64] # width * height, ex: 24 * 32 +_C.MODEL.SIGMA = 2 +_C.MODEL.EXTRA = CN(new_allowed=True) + +_C.LOSS = CN() +_C.LOSS.USE_OHKM = False +_C.LOSS.TOPK = 8 +_C.LOSS.USE_TARGET_WEIGHT = True +_C.LOSS.USE_DIFFERENT_JOINTS_WEIGHT = False + +# DATASET related params +_C.DATASET = CN() +_C.DATASET.ROOT = '' +_C.DATASET.DATASET = 'mpii' +_C.DATASET.TRAIN_SET = 'train' +_C.DATASET.TEST_SET = 'valid' +_C.DATASET.DATA_FORMAT = 'jpg' +_C.DATASET.HYBRID_JOINTS_TYPE = '' +_C.DATASET.SELECT_DATA = False + +# training data augmentation +_C.DATASET.FLIP = True +_C.DATASET.SCALE_FACTOR = 0.25 +_C.DATASET.ROT_FACTOR = 30 +_C.DATASET.PROB_HALF_BODY = 0.0 +_C.DATASET.NUM_JOINTS_HALF_BODY = 8 +_C.DATASET.COLOR_RGB = False + +# train +_C.TRAIN = CN() + +_C.TRAIN.LR_FACTOR = 0.1 +_C.TRAIN.LR_STEP = [90, 110] +_C.TRAIN.LR = 0.001 + +_C.TRAIN.OPTIMIZER = 'adam' +_C.TRAIN.MOMENTUM = 0.9 +_C.TRAIN.WD = 0.0001 +_C.TRAIN.NESTEROV = False +_C.TRAIN.GAMMA1 = 0.99 +_C.TRAIN.GAMMA2 = 0.0 + +_C.TRAIN.BEGIN_EPOCH = 0 +_C.TRAIN.END_EPOCH = 140 + +_C.TRAIN.RESUME = False +_C.TRAIN.CHECKPOINT = '' + +_C.TRAIN.BATCH_SIZE_PER_GPU = 32 +_C.TRAIN.SHUFFLE = True + +# testing +_C.TEST = CN() + +# size of images for each device +_C.TEST.BATCH_SIZE_PER_GPU = 32 +# Test Model Epoch +_C.TEST.FLIP_TEST = False +_C.TEST.POST_PROCESS = False +_C.TEST.SHIFT_HEATMAP = False + +_C.TEST.USE_GT_BBOX = False + +# nms 
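+# (added comment) detection-score and (soft-)NMS thresholds applied at test time when scoring detected person boxes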
+_C.TEST.IMAGE_THRE = 0.1 +_C.TEST.NMS_THRE = 0.6 +_C.TEST.SOFT_NMS = False +_C.TEST.OKS_THRE = 0.5 +_C.TEST.IN_VIS_THRE = 0.0 +_C.TEST.COCO_BBOX_FILE = '' +_C.TEST.BBOX_THRE = 1.0 +_C.TEST.MODEL_FILE = '' + +# debug +_C.DEBUG = CN() +_C.DEBUG.DEBUG = False +_C.DEBUG.SAVE_BATCH_IMAGES_GT = False +_C.DEBUG.SAVE_BATCH_IMAGES_PRED = False +_C.DEBUG.SAVE_HEATMAPS_GT = False +_C.DEBUG.SAVE_HEATMAPS_PRED = False + + +def update_config(cfg, cfg_dir): + cfg.defrost() + cfg.merge_from_file(cfg_dir) + cfg.freeze() + + +if __name__ == '__main__': + import sys + with open(sys.argv[1], 'w') as f: + print(_C, file=f) \ No newline at end of file diff --git a/src/lib/models/networks/config/hrnet_w18.yaml b/src/lib/models/networks/config/hrnet_w18.yaml new file mode 100644 index 0000000..2d233c0 --- /dev/null +++ b/src/lib/models/networks/config/hrnet_w18.yaml @@ -0,0 +1,129 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + #PRETRAINED: '/home/yfzhang/PycharmProjects/FairMOT/models/pose_hrnet_w32_384x288.pth' + #PRETRAINED: '/home/yfzhang/PycharmProjects/FairMOT/models/hrnet_w32-36af842e.pth' + PRETRAINED: '../models/hrnetv2_w18_imagenet_pretrained.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 18 + - 36 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 18 + - 36 + - 72 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 18 + - 36 + - 72 + - 144 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/src/lib/models/networks/config/hrnet_w32.yaml b/src/lib/models/networks/config/hrnet_w32.yaml new file mode 100644 index 0000000..da0347e --- /dev/null +++ b/src/lib/models/networks/config/hrnet_w32.yaml @@ -0,0 +1,129 @@ +AUTO_RESUME: true +CUDNN: + BENCHMARK: true + DETERMINISTIC: false + ENABLED: true +DATA_DIR: '' +GPUS: (0,1,2,3) +OUTPUT_DIR: 'output' +LOG_DIR: 'log' +WORKERS: 24 +PRINT_FREQ: 100 + +DATASET: + COLOR_RGB: true + DATASET: 'coco' + DATA_FORMAT: jpg + FLIP: true + 
NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: 0.3 + ROOT: 'data/coco/' + ROT_FACTOR: 45 + SCALE_FACTOR: 0.35 + TEST_SET: 'val2017' + TRAIN_SET: 'train2017' +MODEL: + INIT_WEIGHTS: true + NAME: pose_hrnet + NUM_JOINTS: 17 + #PRETRAINED: '/home/yfzhang/PycharmProjects/FairMOT/models/pose_hrnet_w32_384x288.pth' + #PRETRAINED: '/home/yfzhang/PycharmProjects/FairMOT/models/hrnet_w32-36af842e.pth' + PRETRAINED: '../models/hrnetv2_w32_imagenet_pretrained.pth' + TARGET_TYPE: gaussian + IMAGE_SIZE: + - 192 + - 256 + HEATMAP_SIZE: + - 48 + - 64 + SIGMA: 2 + EXTRA: + PRETRAINED_LAYERS: + - 'conv1' + - 'bn1' + - 'conv2' + - 'bn2' + - 'layer1' + - 'transition1' + - 'stage2' + - 'transition2' + - 'stage3' + - 'transition3' + - 'stage4' + FINAL_CONV_KERNEL: 1 + STAGE2: + NUM_MODULES: 1 + NUM_BRANCHES: 2 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + FUSE_METHOD: SUM + STAGE3: + NUM_MODULES: 4 + NUM_BRANCHES: 3 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + FUSE_METHOD: SUM + STAGE4: + NUM_MODULES: 3 + NUM_BRANCHES: 4 + BLOCK: BASIC + NUM_BLOCKS: + - 4 + - 4 + - 4 + - 4 + NUM_CHANNELS: + - 32 + - 64 + - 128 + - 256 + FUSE_METHOD: SUM +LOSS: + USE_TARGET_WEIGHT: true +TRAIN: + BATCH_SIZE_PER_GPU: 32 + SHUFFLE: true + BEGIN_EPOCH: 0 + END_EPOCH: 210 + OPTIMIZER: adam + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 170 + - 200 + WD: 0.0001 + GAMMA1: 0.99 + GAMMA2: 0.0 + MOMENTUM: 0.9 + NESTEROV: false +TEST: + BATCH_SIZE_PER_GPU: 32 + COCO_BBOX_FILE: 'data/coco/person_detection_results/COCO_val2017_detections_AP_H_56_person.json' + BBOX_THRE: 1.0 + IMAGE_THRE: 0.0 + IN_VIS_THRE: 0.2 + MODEL_FILE: '' + NMS_THRE: 1.0 + OKS_THRE: 0.9 + USE_GT_BBOX: true + FLIP_TEST: true + POST_PROCESS: true + SHIFT_HEATMAP: true +DEBUG: + DEBUG: true + SAVE_BATCH_IMAGES_GT: true + SAVE_BATCH_IMAGES_PRED: true + SAVE_HEATMAPS_GT: true + SAVE_HEATMAPS_PRED: true diff --git a/src/lib/models/networks/dlav0.py b/src/lib/models/networks/dlav0.py new file mode 100644 index 0000000..9c383a0 --- /dev/null +++ b/src/lib/models/networks/dlav0.py @@ -0,0 +1,641 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +from os.path import join + +import torch +from torch import nn +import torch.utils.model_zoo as model_zoo + +import numpy as np + +BatchNorm = nn.BatchNorm2d + +def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'): + return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)) + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn1 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=1, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = BatchNorm(planes) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class 
Bottleneck(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = Bottleneck.expansion + bottle_planes = planes // expansion + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = BatchNorm(bottle_planes) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = BatchNorm(bottle_planes) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class BottleneckX(nn.Module): + expansion = 2 + cardinality = 32 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BottleneckX, self).__init__() + cardinality = BottleneckX.cardinality + # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0))) + # bottle_planes = dim * cardinality + bottle_planes = planes * cardinality // 32 + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = BatchNorm(bottle_planes) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, bias=False, + dilation=dilation, groups=cardinality) + self.bn2 = BatchNorm(bottle_planes) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class Root(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(Root, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, + stride=1, bias=False, padding=(kernel_size - 1) // 2) + self.bn = BatchNorm(out_channels) + self.relu = nn.ReLU(inplace=True) + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(torch.cat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class Tree(nn.Module): + def __init__(self, levels, block, in_channels, out_channels, stride=1, + level_root=False, root_dim=0, root_kernel_size=1, + dilation=1, root_residual=False): + super(Tree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + if levels == 1: + self.tree1 = block(in_channels, out_channels, stride, + dilation=dilation) + self.tree2 = block(out_channels, out_channels, 1, + dilation=dilation) + else: + self.tree1 = Tree(levels - 1, block, in_channels, out_channels, + stride, root_dim=0, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + self.tree2 = Tree(levels - 1, block, out_channels, out_channels, + root_dim=root_dim + out_channels, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + if levels == 1: + 
self.root = Root(root_dim, out_channels, root_kernel_size, + root_residual) + self.level_root = level_root + self.root_dim = root_dim + self.downsample = None + self.project = None + self.levels = levels + if stride > 1: + self.downsample = nn.MaxPool2d(stride, stride=stride) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2d(in_channels, out_channels, + kernel_size=1, stride=1, bias=False), + BatchNorm(out_channels) + ) + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) if self.downsample else x + residual = self.project(bottom) if self.project else bottom + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Module): + def __init__(self, levels, channels, num_classes=1000, + block=BasicBlock, residual_root=False, return_levels=False, + pool_size=7, linear_root=False): + super(DLA, self).__init__() + self.channels = channels + self.return_levels = return_levels + self.num_classes = num_classes + self.base_layer = nn.Sequential( + nn.Conv2d(3, channels[0], kernel_size=7, stride=1, + padding=3, bias=False), + BatchNorm(channels[0]), + nn.ReLU(inplace=True)) + self.level0 = self._make_conv_level( + channels[0], channels[0], levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2) + self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, + level_root=False, + root_residual=residual_root) + self.level3 = Tree(levels[3], block, channels[2], channels[3], 2, + level_root=True, root_residual=residual_root) + self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, + level_root=True, root_residual=residual_root) + self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, + level_root=True, root_residual=residual_root) + + self.avgpool = nn.AvgPool2d(pool_size) + self.fc = nn.Conv2d(channels[-1], num_classes, kernel_size=1, + stride=1, padding=0, bias=True) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_level(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + nn.MaxPool2d(stride, stride=stride), + nn.Conv2d(inplanes, planes, + kernel_size=1, stride=1, bias=False), + BatchNorm(planes), + ) + + layers = [] + layers.append(block(inplanes, planes, stride, downsample=downsample)) + for i in range(1, blocks): + layers.append(block(inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, bias=False, dilation=dilation), + BatchNorm(planes), + nn.ReLU(inplace=True)]) + inplanes = planes + return nn.Sequential(*modules) + + def forward(self, x): + y = [] + x = self.base_layer(x) + for i in range(6): + x = getattr(self, 'level{}'.format(i))(x) + y.append(x) + if self.return_levels: + return y + else: + x = self.avgpool(x) + x = self.fc(x) + x = x.view(x.size(0), -1) + + return x + + def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'): + fc = self.fc + if name.endswith('.pth'): + model_weights = torch.load(data + name) + else: + model_url = get_model_url(data, name, hash) + model_weights = model_zoo.load_url(model_url) + num_classes = len(model_weights[list(model_weights.keys())[-1]]) + self.fc = nn.Conv2d( + self.channels[-1], num_classes, + kernel_size=1, stride=1, padding=0, bias=True) + self.load_state_dict(model_weights) + self.fc = fc + + +def dla34(pretrained, **kwargs): # DLA-34 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 128, 256, 512], + block=BasicBlock, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86') + return model + + +def dla46_c(pretrained=None, **kwargs): # DLA-46-C + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 64, 128, 256], + block=Bottleneck, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla46_c') + return model + + +def dla46x_c(pretrained=None, **kwargs): # DLA-X-46-C + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 64, 128, 256], + block=BottleneckX, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla46x_c') + return model + + +def dla60x_c(pretrained, **kwargs): # DLA-X-60-C + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 64, 64, 128, 256], + block=BottleneckX, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla60x_c', hash='b870c45c') + return model + + +def dla60(pretrained=None, **kwargs): # DLA-60 + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 128, 256, 512, 1024], + block=Bottleneck, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla60') + return model + + +def dla60x(pretrained=None, **kwargs): # DLA-X-60 + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 128, 256, 512, 1024], + block=BottleneckX, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla60x') + return model + + +def dla102(pretrained=None, **kwargs): # DLA-102 + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=Bottleneck, residual_root=True, **kwargs) + if 
pretrained is not None: + model.load_pretrained_model(pretrained, 'dla102') + return model + + +def dla102x(pretrained=None, **kwargs): # DLA-X-102 + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=BottleneckX, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla102x') + return model + + +def dla102x2(pretrained=None, **kwargs): # DLA-X-102 64 + BottleneckX.cardinality = 64 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=BottleneckX, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla102x2') + return model + + +def dla169(pretrained=None, **kwargs): # DLA-169 + Bottleneck.expansion = 2 + model = DLA([1, 1, 2, 3, 5, 1], [16, 32, 128, 256, 512, 1024], + block=Bottleneck, residual_root=True, **kwargs) + if pretrained is not None: + model.load_pretrained_model(pretrained, 'dla169') + return model + + +class Identity(nn.Module): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. * f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + + +class IDAUp(nn.Module): + def __init__(self, node_kernel, out_dim, channels, up_factors): + super(IDAUp, self).__init__() + self.channels = channels + self.out_dim = out_dim + for i, c in enumerate(channels): + if c == out_dim: + proj = Identity() + else: + proj = nn.Sequential( + nn.Conv2d(c, out_dim, + kernel_size=1, stride=1, bias=False), + BatchNorm(out_dim), + nn.ReLU(inplace=True)) + f = int(up_factors[i]) + if f == 1: + up = Identity() + else: + up = nn.ConvTranspose2d( + out_dim, out_dim, f * 2, stride=f, padding=f // 2, + output_padding=0, groups=out_dim, bias=False) + fill_up_weights(up) + setattr(self, 'proj_' + str(i), proj) + setattr(self, 'up_' + str(i), up) + + for i in range(1, len(channels)): + node = nn.Sequential( + nn.Conv2d(out_dim * 2, out_dim, + kernel_size=node_kernel, stride=1, + padding=node_kernel // 2, bias=False), + BatchNorm(out_dim), + nn.ReLU(inplace=True)) + setattr(self, 'node_' + str(i), node) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def forward(self, layers): + assert len(self.channels) == len(layers), \ + '{} vs {} layers'.format(len(self.channels), len(layers)) + layers = list(layers) + for i, l in enumerate(layers): + upsample = getattr(self, 'up_' + str(i)) + project = getattr(self, 'proj_' + str(i)) + layers[i] = upsample(project(l)) + x = layers[0] + y = [] + for i in range(1, len(layers)): + node = getattr(self, 'node_' + str(i)) + x = node(torch.cat([x, layers[i]], 1)) + y.append(x) + return x, y + + +class DLAUp(nn.Module): + def __init__(self, channels, scales=(1, 2, 4, 8, 16), in_channels=None): + super(DLAUp, self).__init__() + if in_channels is None: + in_channels = channels + self.channels = channels + channels = list(channels) + scales = np.array(scales, dtype=int) + for i in range(len(channels) - 1): + j = -i - 2 + setattr(self, 'ida_{}'.format(i), + IDAUp(3, channels[j], in_channels[j:], + scales[j:] // scales[j])) + scales[j + 1:] = scales[j] + in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] + + def forward(self, layers): + layers = list(layers) + assert len(layers) > 1 + for i in range(len(layers) - 1): + ida = getattr(self, 'ida_{}'.format(i)) + x, y = ida(layers[-i - 2:]) + layers[-i - 1:] = y + return x + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + # torch.nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + # torch.nn.init.xavier_normal_(m.weight.data) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + +class DLASeg(nn.Module): + def __init__(self, base_name, heads, + pretrained=True, down_ratio=4, head_conv=256): + super(DLASeg, self).__init__() + assert down_ratio in [2, 4, 8, 16] + self.heads = heads + self.first_level = int(np.log2(down_ratio)) + self.base = globals()[base_name]( + pretrained=pretrained, return_levels=True) + channels = self.base.channels + scales = [2 ** i for i in range(len(channels[self.first_level:]))] + self.dla_up = DLAUp(channels[self.first_level:], scales=scales) + ''' + self.fc = nn.Sequential( + nn.Conv2d(channels[self.first_level], classes, kernel_size=1, + stride=1, padding=0, bias=True) + ) + ''' + + for head in self.heads: + classes = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(channels[self.first_level], head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=1, stride=1, + padding=0, bias=True)) + if 'hm' in head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + else: + fc = nn.Conv2d(channels[self.first_level], classes, + kernel_size=1, stride=1, + padding=0, bias=True) + if 'hm' in head: + fc.bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + ''' + up_factor = 2 ** self.first_level + if up_factor > 1: + up = nn.ConvTranspose2d(classes, classes, up_factor * 2, + stride=up_factor, padding=up_factor // 2, + output_padding=0, groups=classes, + bias=False) + fill_up_weights(up) + up.weight.requires_grad = False + else: + up = Identity() + self.up = up + self.softmax = nn.LogSoftmax(dim=1) + + + for m in self.fc.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + ''' + + def forward(self, x): + x = self.base(x) + x = self.dla_up(x[self.first_level:]) + # x = self.fc(x) + # y = self.softmax(self.up(x)) + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x) + return [ret] + + ''' + def optim_parameters(self, memo=None): + for param in self.base.parameters(): + yield param + for param in self.dla_up.parameters(): + yield param + for param in self.fc.parameters(): + yield param + ''' +''' +def dla34up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla34', classes, pretrained_base=pretrained_base, **kwargs) + return model + + +def dla60up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla60', classes, pretrained_base=pretrained_base, **kwargs) + return model + + +def dla102up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla102', classes, + pretrained_base=pretrained_base, **kwargs) + return model + + +def dla169up(classes, pretrained_base=None, **kwargs): + model = DLASeg('dla169', classes, + pretrained_base=pretrained_base, **kwargs) + return model +''' + +def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4): + model = DLASeg('dla{}'.format(num_layers), heads, + pretrained=True, + down_ratio=down_ratio, + head_conv=head_conv) + return model diff --git a/src/lib/models/networks/pose_dla_dcn.py b/src/lib/models/networks/pose_dla_dcn.py new file mode 100644 index 0000000..b509d0d --- /dev/null +++ b/src/lib/models/networks/pose_dla_dcn.py @@ -0,0 +1,492 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import logging +import math +from os.path import join + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo +from torch import nn + +from .DCNv2.dcn_v2 import DCN + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + +def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'): + return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)) + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=1, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = Bottleneck.expansion + bottle_planes = planes // expansion + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + 
stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class BottleneckX(nn.Module): + expansion = 2 + cardinality = 32 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BottleneckX, self).__init__() + cardinality = BottleneckX.cardinality + # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0))) + # bottle_planes = dim * cardinality + bottle_planes = planes * cardinality // 32 + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, bias=False, + dilation=dilation, groups=cardinality) + self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class Root(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(Root, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, + stride=1, bias=False, padding=(kernel_size - 1) // 2) + self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(torch.cat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class Tree(nn.Module): + def __init__(self, levels, block, in_channels, out_channels, stride=1, + level_root=False, root_dim=0, root_kernel_size=1, + dilation=1, root_residual=False): + super(Tree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + if levels == 1: + self.tree1 = block(in_channels, out_channels, stride, + dilation=dilation) + self.tree2 = block(out_channels, out_channels, 1, + dilation=dilation) + else: + self.tree1 = Tree(levels - 1, block, in_channels, out_channels, + stride, root_dim=0, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + self.tree2 = Tree(levels - 1, block, out_channels, out_channels, + root_dim=root_dim + out_channels, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + if levels == 1: + self.root = Root(root_dim, out_channels, root_kernel_size, + root_residual) + self.level_root = level_root + self.root_dim = root_dim + self.downsample = None + self.project = None + self.levels = levels + if stride > 1: + self.downsample = 
nn.MaxPool2d(stride, stride=stride) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2d(in_channels, out_channels, + kernel_size=1, stride=1, bias=False), + nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM) + ) + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) if self.downsample else x + residual = self.project(bottom) if self.project else bottom + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Module): + def __init__(self, levels, channels, num_classes=1000, + block=BasicBlock, residual_root=False, linear_root=False): + super(DLA, self).__init__() + self.channels = channels + self.num_classes = num_classes + self.base_layer = nn.Sequential( + nn.Conv2d(3, channels[0], kernel_size=7, stride=1, + padding=3, bias=False), + nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM), + nn.ReLU(inplace=True)) + self.level0 = self._make_conv_level( + channels[0], channels[0], levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2) + self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, + level_root=False, + root_residual=residual_root) + self.level3 = Tree(levels[3], block, channels[2], channels[3], 2, + level_root=True, root_residual=residual_root) + self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, + level_root=True, root_residual=residual_root) + self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, + level_root=True, root_residual=residual_root) + + # for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + # m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + # elif isinstance(m, nn.BatchNorm2d): + # m.weight.data.fill_(1) + # m.bias.data.zero_() + + def _make_level(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + nn.MaxPool2d(stride, stride=stride), + nn.Conv2d(inplanes, planes, + kernel_size=1, stride=1, bias=False), + nn.BatchNorm2d(planes, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(inplanes, planes, stride, downsample=downsample)) + for i in range(1, blocks): + layers.append(block(inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, bias=False, dilation=dilation), + nn.BatchNorm2d(planes, momentum=BN_MOMENTUM), + nn.ReLU(inplace=True)]) + inplanes = planes + return nn.Sequential(*modules) + + def forward(self, x): + y = [] + x = self.base_layer(x) + for i in range(6): + x = getattr(self, 'level{}'.format(i))(x) + y.append(x) + return y + + def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'): + # fc = self.fc + if name.endswith('.pth'): + model_weights = torch.load(data + name) + else: + model_url = get_model_url(data, name, hash) + model_weights = model_zoo.load_url(model_url) + num_classes = len(model_weights[list(model_weights.keys())[-1]]) + self.fc = nn.Conv2d( + self.channels[-1], num_classes, + kernel_size=1, stride=1, padding=0, bias=True) + self.load_state_dict(model_weights) + # self.fc = fc + + +def dla34(pretrained=True, **kwargs): # DLA-34 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 128, 256, 512], + block=BasicBlock, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86') + return model + +class Identity(nn.Module): + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. 
* f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + + +class DeformConv(nn.Module): + def __init__(self, chi, cho): + super(DeformConv, self).__init__() + self.actf = nn.Sequential( + nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), + nn.ReLU(inplace=True) + ) + self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1) + + def forward(self, x): + x = self.conv(x) + x = self.actf(x) + return x + + +class IDAUp(nn.Module): + + def __init__(self, o, channels, up_f): + super(IDAUp, self).__init__() + for i in range(1, len(channels)): + c = channels[i] + f = int(up_f[i]) + proj = DeformConv(c, o) + node = DeformConv(o, o) + + up = nn.ConvTranspose2d(o, o, f * 2, stride=f, + padding=f // 2, output_padding=0, + groups=o, bias=False) + fill_up_weights(up) + + setattr(self, 'proj_' + str(i), proj) + setattr(self, 'up_' + str(i), up) + setattr(self, 'node_' + str(i), node) + + + def forward(self, layers, startp, endp): + for i in range(startp + 1, endp): + upsample = getattr(self, 'up_' + str(i - startp)) + project = getattr(self, 'proj_' + str(i - startp)) + layers[i] = upsample(project(layers[i])) + node = getattr(self, 'node_' + str(i - startp)) + layers[i] = node(layers[i] + layers[i - 1]) + + + +class DLAUp(nn.Module): + def __init__(self, startp, channels, scales, in_channels=None): + super(DLAUp, self).__init__() + self.startp = startp + if in_channels is None: + in_channels = channels + self.channels = channels + channels = list(channels) + scales = np.array(scales, dtype=int) + for i in range(len(channels) - 1): + j = -i - 2 + setattr(self, 'ida_{}'.format(i), + IDAUp(channels[j], in_channels[j:], + scales[j:] // scales[j])) + scales[j + 1:] = scales[j] + in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] + + def forward(self, layers): + out = [layers[-1]] # start with 32 + for i in range(len(layers) - self.startp - 1): + ida = getattr(self, 'ida_{}'.format(i)) + ida(layers, len(layers) -i - 2, len(layers)) + out.insert(0, layers[-1]) + return out + + +class Interpolate(nn.Module): + def __init__(self, scale, mode): + super(Interpolate, self).__init__() + self.scale = scale + self.mode = mode + + def forward(self, x): + x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False) + return x + + +class DLASeg(nn.Module): + def __init__(self, base_name, heads, pretrained, down_ratio, final_kernel, + last_level, head_conv, out_channel=0): + super(DLASeg, self).__init__() + assert down_ratio in [2, 4, 8, 16] + self.first_level = int(np.log2(down_ratio)) + self.last_level = last_level + self.base = globals()[base_name](pretrained=pretrained) + channels = self.base.channels + scales = [2 ** i for i in range(len(channels[self.first_level:]))] + self.dla_up = DLAUp(self.first_level, channels[self.first_level:], scales) + + if out_channel == 0: + out_channel = channels[self.first_level] + + self.ida_up = IDAUp(out_channel, channels[self.first_level:self.last_level], + [2 ** i for i in range(self.last_level - self.first_level)]) + + self.heads = heads + for head in self.heads: + classes = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(channels[self.first_level], head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=final_kernel, stride=1, + padding=final_kernel // 2, bias=True)) + if 'hm' in 
head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + else: + fc = nn.Conv2d(channels[self.first_level], classes, + kernel_size=final_kernel, stride=1, + padding=final_kernel // 2, bias=True) + if 'hm' in head: + fc.bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + def forward(self, x): + x = self.base(x) + x = self.dla_up(x) + + y = [] + for i in range(self.last_level - self.first_level): + y.append(x[i].clone()) + self.ida_up(y, 0, len(y)) + + z = {} + for head in self.heads: + z[head] = self.__getattr__(head)(y[-1]) + return [z] + + +def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4): + model = DLASeg('dla{}'.format(num_layers), heads, + pretrained=True, + down_ratio=down_ratio, + final_kernel=1, + last_level=5, + head_conv=head_conv) + return model + diff --git a/src/lib/models/networks/pose_hrnet.py b/src/lib/models/networks/pose_hrnet.py new file mode 100644 index 0000000..3f0cc34 --- /dev/null +++ b/src/lib/models/networks/pose_hrnet.py @@ -0,0 +1,550 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import logging + +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F + +from .config import cfg, update_config + + +BN_MOMENTUM = 0.01 +logger = logging.getLogger(__name__) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class HighResolutionModule(nn.Module): + def __init__(self, num_branches, blocks, num_blocks, num_inchannels, + num_channels, fuse_method, multi_scale_output=True): + super(HighResolutionModule, 
self).__init__() + self._check_branches( + num_branches, blocks, num_blocks, num_inchannels, num_channels) + + self.num_inchannels = num_inchannels + self.fuse_method = fuse_method + self.num_branches = num_branches + + self.multi_scale_output = multi_scale_output + + self.branches = self._make_branches( + num_branches, blocks, num_blocks, num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(True) + + def _check_branches(self, num_branches, blocks, num_blocks, + num_inchannels, num_channels): + if num_branches != len(num_blocks): + error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format( + num_branches, len(num_blocks)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format( + num_branches, len(num_channels)) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_inchannels): + error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format( + num_branches, len(num_inchannels)) + logger.error(error_msg) + raise ValueError(error_msg) + + def _make_one_branch(self, branch_index, block, num_blocks, num_channels, + stride=1): + downsample = None + if stride != 1 or \ + self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.num_inchannels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d( + num_channels[branch_index] * block.expansion, + momentum=BN_MOMENTUM + ), + ) + + layers = [] + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index], + stride, + downsample + ) + ) + self.num_inchannels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.num_inchannels[branch_index], + num_channels[branch_index] + ) + ) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels) + ) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + num_inchannels = self.num_inchannels + fuse_layers = [] + for i in range(num_branches if self.multi_scale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_inchannels[i], + 1, 1, 0, bias=False + ), + nn.BatchNorm2d(num_inchannels[i]), + nn.Upsample(scale_factor=2**(j-i), mode='nearest') + ) + ) + elif j == i: + fuse_layer.append(None) + else: + conv3x3s = [] + for k in range(i-j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_inchannels[i] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3) + ) + ) + else: + num_outchannels_conv3x3 = num_inchannels[j] + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + num_inchannels[j], + num_outchannels_conv3x3, + 3, 2, 1, bias=False + ), + nn.BatchNorm2d(num_outchannels_conv3x3), + nn.ReLU(True) + ) + ) + fuse_layer.append(nn.Sequential(*conv3x3s)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_inchannels(self): + return self.num_inchannels + + def 
forward(self, x): + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + + for i in range(len(self.fuse_layers)): + y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) + for j in range(1, self.num_branches): + if i == j: + y = y + x[j] + else: + y = y + self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + + return x_fuse + + +blocks_dict = { + 'BASIC': BasicBlock, + 'BOTTLENECK': Bottleneck +} + + +class PoseHighResolutionNet(nn.Module): + + def __init__(self, cfg, heads): + self.inplanes = 64 + extra = cfg.MODEL.EXTRA + super(PoseHighResolutionNet, self).__init__() + + # stem net + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, + bias=False) + self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.layer1 = self._make_layer(Bottleneck, 64, 4) + + self.stage2_cfg = cfg['MODEL']['EXTRA']['STAGE2'] + num_channels = self.stage2_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage2_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition1 = self._make_transition_layer([256], num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + self.stage3_cfg = cfg['MODEL']['EXTRA']['STAGE3'] + num_channels = self.stage3_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage3_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition2 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + self.stage4_cfg = cfg['MODEL']['EXTRA']['STAGE4'] + num_channels = self.stage4_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage4_cfg['BLOCK']] + num_channels = [ + num_channels[i] * block.expansion for i in range(len(num_channels)) + ] + self.transition3 = self._make_transition_layer( + pre_stage_channels, num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multi_scale_output=True) + + logger.info('=> init weights from normal distribution') + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + + self.heads = heads + + last_inp_channels = np.int(np.sum(pre_stage_channels)) + + self.last_layer = nn.Sequential( + nn.Conv2d( + in_channels=last_inp_channels, + out_channels=64, + kernel_size=1, + stride=1, + padding=0), + nn.BatchNorm2d(64, momentum=BN_MOMENTUM), + nn.ReLU(inplace=True), + ) + head_conv = 256 + for head in self.heads: + classes = self.heads[head] + fc = nn.Sequential( + nn.Conv2d(64, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=extra.FINAL_CONV_KERNEL, stride=1, + padding=extra.FINAL_CONV_KERNEL // 2, bias=True)) + if 'hm' in head: 
+ fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + self.pretrained_layers = cfg['MODEL']['EXTRA']['PRETRAINED_LAYERS'] + + def _make_transition_layer( + self, num_channels_pre_layer, num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + nn.Conv2d( + num_channels_pre_layer[i], + num_channels_cur_layer[i], + 3, 1, 1, bias=False + ), + nn.BatchNorm2d(num_channels_cur_layer[i]), + nn.ReLU(inplace=True) + ) + ) + else: + transition_layers.append(None) + else: + conv3x3s = [] + for j in range(i+1-num_branches_pre): + inchannels = num_channels_pre_layer[-1] + outchannels = num_channels_cur_layer[i] \ + if j == i-num_branches_pre else inchannels + conv3x3s.append( + nn.Sequential( + nn.Conv2d( + inchannels, outchannels, 3, 2, 1, bias=False + ), + nn.BatchNorm2d(outchannels), + nn.ReLU(inplace=True) + ) + ) + transition_layers.append(nn.Sequential(*conv3x3s)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, num_inchannels, + multi_scale_output=True): + num_modules = layer_config['NUM_MODULES'] + num_branches = layer_config['NUM_BRANCHES'] + num_blocks = layer_config['NUM_BLOCKS'] + num_channels = layer_config['NUM_CHANNELS'] + block = blocks_dict[layer_config['BLOCK']] + fuse_method = layer_config['FUSE_METHOD'] + + modules = [] + for i in range(num_modules): + # multi_scale_output is only used last module + if not multi_scale_output and i == num_modules - 1: + reset_multi_scale_output = False + else: + reset_multi_scale_output = True + + modules.append( + HighResolutionModule( + num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + fuse_method, + reset_multi_scale_output + ) + ) + num_inchannels = modules[-1].get_num_inchannels() + + return nn.Sequential(*modules), num_inchannels + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['NUM_BRANCHES']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['NUM_BRANCHES']): + if self.transition2[i] is not None: + if i < self.stage2_cfg['NUM_BRANCHES']: + x_list.append(self.transition2[i](y_list[i])) + else: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['NUM_BRANCHES']): + if self.transition3[i] is not None: + if i < self.stage3_cfg['NUM_BRANCHES']: + x_list.append(self.transition3[i](y_list[i])) + else: + 
x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + x = self.stage4(x_list) + + # Upsampling + x0_h, x0_w = x[0].size(2), x[0].size(3) + x1 = F.upsample(x[1], size=(x0_h, x0_w), mode='bilinear') + x2 = F.upsample(x[2], size=(x0_h, x0_w), mode='bilinear') + x3 = F.upsample(x[3], size=(x0_h, x0_w), mode='bilinear') + + x = torch.cat([x[0], x1, x2, x3], 1) + + x = self.last_layer(x) + + z = {} + for head in self.heads: + z[head] = self.__getattr__(head)(x) + return [z] + + def init_weights(self, pretrained=''): + if os.path.isfile(pretrained): + pretrained_state_dict = torch.load(pretrained) + logger.info('=> loading pretrained model {}'.format(pretrained)) + + need_init_state_dict = {} + for name, m in pretrained_state_dict.items(): + if name.split('.')[0] in self.pretrained_layers \ + or self.pretrained_layers[0] == '*': + need_init_state_dict[name] = m + self.load_state_dict(need_init_state_dict, strict=False) + elif pretrained: + logger.error('=> please download pre-trained models first!') + raise ValueError('{} is not exist!'.format(pretrained)) + + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + +def get_pose_net(num_layers, heads, head_conv): + if num_layers == 32: + cfg_dir = '../src/lib/models/networks/config/hrnet_w32.yaml' + elif num_layers == 18: + cfg_dir = '../src/lib/models/networks/config/hrnet_w18.yaml' + else: + cfg_dir = '../src/lib/models/networks/config/hrnet_w18.yaml' + update_config(cfg, cfg_dir) + model = PoseHighResolutionNet(cfg, heads) + model.init_weights(cfg.MODEL.PRETRAINED) + + return model diff --git a/src/lib/models/networks/resnet_dcn.py b/src/lib/models/networks/resnet_dcn.py new file mode 100644 index 0000000..e3f78f8 --- /dev/null +++ b/src/lib/models/networks/resnet_dcn.py @@ -0,0 +1,289 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
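A note on the constant -2.19 that the head constructors above (both the DLA and the HRNet variants) fill into the bias of every heatmap ('hm') head: it is the usual CenterNet focal-loss prior, since sigmoid(-2.19) is roughly 0.1, so every location starts with about a 10% foreground probability and the focal loss is not swamped by easy negatives in the first iterations. A minimal check of that relationship, illustrative only and not part of the patch:

    import math

    prior = 0.1
    bias = -math.log((1 - prior) / prior)          # about -2.197, the value filled into fc[-1].bias
    assert abs(1 / (1 + math.exp(-bias)) - prior) < 1e-6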
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Dequan Wang and Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import logging +import math + +import torch.nn as nn +import torch.utils.model_zoo as model_zoo + +from .DCNv2.dcn_v2 import DCN + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. 
* f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + # torch.nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + # torch.nn.init.xavier_normal_(m.weight.data) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv): + self.inplanes = 64 + self.heads = heads + self.deconv_with_bias = False + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + # used for deconv layers + self.deconv_layers = self._make_deconv_layer( + 3, + [256, 128, 64], + [4, 4, 4], + ) + + for head in self.heads: + classes = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(64, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=1, stride=1, + padding=0, bias=True)) + if 'hm' in head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + else: + fc = nn.Conv2d(64, classes, + kernel_size=1, stride=1, + padding=0, bias=True) + if 'hm' in head: + fc.bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel, index): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + assert num_layers == len(num_filters), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + assert num_layers == len(num_kernels), \ + 'ERROR: num_deconv_layers is different len(num_deconv_filters)' + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i], i) + + planes = num_filters[i] + fc = DCN(self.inplanes, planes, + kernel_size=(3,3), stride=1, + padding=1, dilation=1, deformable_groups=1) + # fc = nn.Conv2d(self.inplanes, planes, + # kernel_size=3, stride=1, + # padding=1, dilation=1, bias=False) + # fill_fc_weights(fc) + up = nn.ConvTranspose2d( + in_channels=planes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + 
output_padding=output_padding, + bias=self.deconv_with_bias) + fill_up_weights(up) + + layers.append(fc) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + layers.append(up) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.deconv_layers(x) + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x) + return [ret] + + def init_weights(self, num_layers): + if 1: + url = model_urls['resnet{}'.format(num_layers)] + pretrained_state_dict = model_zoo.load_url(url) + print('=> loading pretrained model {}'.format(url)) + self.load_state_dict(pretrained_state_dict, strict=False) + print('=> init deconv weights from normal distribution') + for name, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net(num_layers, heads, head_conv=256): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, heads, head_conv=head_conv) + model.init_weights(num_layers) + return model diff --git a/src/lib/models/networks/resnet_fpn_dcn.py b/src/lib/models/networks/resnet_fpn_dcn.py new file mode 100644 index 0000000..46f1788 --- /dev/null +++ b/src/lib/models/networks/resnet_fpn_dcn.py @@ -0,0 +1,310 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
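The ResNet-DCN factory defined just above (resnet_dcn.py) can be exercised on its own once the DCNv2 extension is compiled and the torchvision ResNet weights are reachable. A rough usage sketch, with an illustrative heads dictionary and head_conv; the output shapes follow from the stride-32 backbone plus three stride-2 deconv stages, i.e. an overall output stride of 4:

    import torch
    # from models.networks.resnet_dcn import get_pose_net   # assumes src/lib is on sys.path

    heads = {'hm': 1, 'wh': 2, 'reg': 2, 'id': 512}          # illustrative, matches the MOT task heads
    model = get_pose_net(num_layers=34, heads=heads, head_conv=256)
    out = model(torch.zeros(1, 3, 608, 1088))[0]             # forward returns a one-element list of dicts
    print(out['hm'].shape)                                   # torch.Size([1, 1, 152, 272])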
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Dequan Wang and Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import logging +import math + +import torch.nn as nn +import torch.utils.model_zoo as model_zoo + +from .DCNv2.dcn_v2 import DCN + +BN_MOMENTUM = 0.1 +logger = logging.getLogger(__name__) + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', +} + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, + bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion, + momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + +def fill_up_weights(up): + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. 
* f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + # torch.nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + # torch.nn.init.xavier_normal_(m.weight.data) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + +class PoseResNet(nn.Module): + + def __init__(self, block, layers, heads, head_conv): + self.inplanes = 64 + self.heads = heads + self.deconv_with_bias = False + + super(PoseResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + # used for deconv layers + self.deconv_layer1 = self._make_deconv_layer(256, 4) + self.deconv_layer2 = self._make_deconv_layer(128, 4) + self.deconv_layer3 = self._make_deconv_layer(64, 4) + + self.smooth_layer1 = DeformConv(256, 256) + self.smooth_layer2 = DeformConv(128, 128) + self.smooth_layer3 = DeformConv(64, 64) + + self.project_layer1 = DeformConv(256 * block.expansion, 256) + self.project_layer2 = DeformConv(128 * block.expansion, 128) + self.project_layer3 = DeformConv(64 * block.expansion, 64) + + for head in self.heads: + classes = self.heads[head] + if head_conv > 0: + fc = nn.Sequential( + nn.Conv2d(64, head_conv, + kernel_size=3, padding=1, bias=True), + nn.ReLU(inplace=True), + nn.Conv2d(head_conv, classes, + kernel_size=1, stride=1, + padding=0, bias=True)) + if 'hm' in head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + else: + fc = nn.Conv2d(64, classes, + kernel_size=1, stride=1, + padding=0, bias=True) + if 'hm' in head: + fc.bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + self.__setattr__(head, fc) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel): + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + + return deconv_kernel, padding, output_padding + + def _make_deconv_layer(self, num_filters, num_kernels): + + layers = [] + + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels) + + planes = num_filters + fc = DCN(self.inplanes, planes, + kernel_size=(3,3), stride=1, + padding=1, dilation=1, deformable_groups=1) + # fc = nn.Conv2d(self.inplanes, planes, + # kernel_size=3, stride=1, + # padding=1, dilation=1, bias=False) + # fill_fc_weights(fc) + up = 
nn.ConvTranspose2d( + in_channels=planes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=self.deconv_with_bias) + fill_up_weights(up) + + layers.append(fc) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + layers.append(up) + layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)) + layers.append(nn.ReLU(inplace=True)) + self.inplanes = planes + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + c1 = self.layer1(x) + c2 = self.layer2(c1) + c3 = self.layer3(c2) + c4 = self.layer4(c3) + + p4 = c4 + p3 = self.smooth_layer1(self.deconv_layer1(p4) + self.project_layer1(c3)) + p2 = self.smooth_layer2(self.deconv_layer2(p3) + self.project_layer2(c2)) + p1 = self.smooth_layer3(self.deconv_layer3(p2) + self.project_layer3(c1)) + + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(p1) + return [ret] + + def init_weights(self, num_layers): + if 1: + url = model_urls['resnet{}'.format(num_layers)] + pretrained_state_dict = model_zoo.load_url(url) + print('=> loading pretrained model {}'.format(url)) + self.load_state_dict(pretrained_state_dict, strict=False) + print('=> init deconv weights from normal distribution') + + +class DeformConv(nn.Module): + def __init__(self, chi, cho): + super(DeformConv, self).__init__() + self.actf = nn.Sequential( + nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), + nn.ReLU(inplace=True) + ) + self.conv = DCN(chi, cho, kernel_size=(3, 3), stride=1, padding=1, dilation=1, deformable_groups=1) + for name, m in self.actf.named_modules(): + if isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.conv(x) + x = self.actf(x) + return x + + +resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]), + 34: (BasicBlock, [3, 4, 6, 3]), + 50: (Bottleneck, [3, 4, 6, 3]), + 101: (Bottleneck, [3, 4, 23, 3]), + 152: (Bottleneck, [3, 8, 36, 3])} + + +def get_pose_net(num_layers, heads, head_conv=256): + block_class, layers = resnet_spec[num_layers] + + model = PoseResNet(block_class, layers, heads, head_conv=head_conv) + model.init_weights(num_layers) + return model diff --git a/src/lib/models/scatter_gather.py b/src/lib/models/scatter_gather.py new file mode 100644 index 0000000..8de7e3b --- /dev/null +++ b/src/lib/models/scatter_gather.py @@ -0,0 +1,43 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +import torch +from torch.autograd import Variable +from torch.nn.parallel._functions import Scatter + + +def scatter(inputs, target_gpus, dim=0, chunk_sizes=None): + r""" + Slices variables into approximately equal chunks and + distributes them across given GPUs. Duplicates + references to objects that are not variables. Does not + support Tensors. + """ + def scatter_map(obj): + if isinstance(obj, Variable): + return Scatter.apply(target_gpus, chunk_sizes, dim, obj) + assert not torch.is_tensor(obj), "Tensors not supported in scatter." 
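        # Scatter.apply splits the batch along `dim` across target_gpus; with
        # chunk_sizes=None the split is (approximately) even, while an explicit
        # chunk_sizes list (opt.chunk_sizes, derived from --master_batch_size in
        # opts.py) lets the master GPU take a smaller slice than the other GPUs.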
+ if isinstance(obj, tuple): + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list): + return list(map(list, zip(*map(scatter_map, obj)))) + if isinstance(obj, dict): + return list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return [obj for targets in target_gpus] + + return scatter_map(inputs) + + +def scatter_kwargs(inputs, kwargs, target_gpus, dim=0, chunk_sizes=None): + r"""Scatter with support for kwargs dictionary""" + inputs = scatter(inputs, target_gpus, dim, chunk_sizes) if inputs else [] + kwargs = scatter(kwargs, target_gpus, dim, chunk_sizes) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs diff --git a/src/lib/models/utils.py b/src/lib/models/utils.py new file mode 100644 index 0000000..8a02c23 --- /dev/null +++ b/src/lib/models/utils.py @@ -0,0 +1,55 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch + + +def _sigmoid(x): + y = torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4) + return y + +def _gather_feat(feat, ind, mask=None): + dim = feat.size(2) + ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) + feat = feat.gather(1, ind) + if mask is not None: + mask = mask.unsqueeze(2).expand_as(feat) + feat = feat[mask] + feat = feat.view(-1, dim) + return feat + +def _tranpose_and_gather_feat(feat, ind): + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.view(feat.size(0), -1, feat.size(3)) + feat = _gather_feat(feat, ind) + return feat + +def flip_tensor(x): + return torch.flip(x, [3]) + # tmp = x.detach().cpu().numpy()[..., ::-1].copy() + # return torch.from_numpy(tmp).to(x.device) + +def flip_lr(x, flip_idx): + tmp = x.detach().cpu().numpy()[..., ::-1].copy() + shape = tmp.shape + for e in flip_idx: + tmp[:, e[0], ...], tmp[:, e[1], ...] = \ + tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() + return torch.from_numpy(tmp.reshape(shape)).to(x.device) + +def flip_lr_off(x, flip_idx): + tmp = x.detach().cpu().numpy()[..., ::-1].copy() + shape = tmp.shape + tmp = tmp.reshape(tmp.shape[0], 17, 2, + tmp.shape[2], tmp.shape[3]) + tmp[:, :, 0, :, :] *= -1 + for e in flip_idx: + tmp[:, e[0], ...], tmp[:, e[1], ...] = \ + tmp[:, e[1], ...].copy(), tmp[:, e[0], ...].copy() + return torch.from_numpy(tmp.reshape(shape)).to(x.device) \ No newline at end of file diff --git a/src/lib/opts.py b/src/lib/opts.py new file mode 100644 index 0000000..421fd7e --- /dev/null +++ b/src/lib/opts.py @@ -0,0 +1,248 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
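The helpers in models/utils.py above are what the tracker uses to pull per-object vectors out of dense head outputs: _tranpose_and_gather_feat flattens a B x C x H x W map to B x HW x C and gathers the rows at the flattened centre indices produced by the decoder. A small shape sketch, assuming the default 608x1088 input, the default output stride of 4 (so the maps are 152 x 272) and K = 128 detections:

    import torch
    # from models.utils import _tranpose_and_gather_feat

    feat = torch.randn(2, 64, 152, 272)             # B x C x H x W, e.g. the 'id' head output
    ind = torch.randint(0, 152 * 272, (2, 128))     # B x K flattened centre indices
    vecs = _tranpose_and_gather_feat(feat, ind)     # -> (2, 128, 64): one embedding per object
    print(vecs.shape)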
+# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os + + +class opts(object): + def __init__(self): + self.parser = argparse.ArgumentParser() + # basic experiment setting + self.parser.add_argument('task', default='mot', help='mot') + self.parser.add_argument('--dataset', default='jde', help='jde') + self.parser.add_argument('--exp_id', default='default') + self.parser.add_argument('--test', action='store_true') + #self.parser.add_argument('--load_model', default='../models/ctdet_coco_dla_2x.pth', + #help='path to pretrained model') + self.parser.add_argument('--load_model', default='', + help='path to pretrained model') + self.parser.add_argument('--resume', action='store_true', + help='resume an experiment. ' + 'Reloaded the optimizer parameter and ' + 'set load_model to model_last.pth ' + 'in the exp dir if load_model is empty.') + + # system + self.parser.add_argument('--gpus', default='0, 1', + help='-1 for CPU, use comma for multiple gpus') + self.parser.add_argument('--num_workers', type=int, default=8, + help='dataloader threads. 0 for single-thread.') + self.parser.add_argument('--not_cuda_benchmark', action='store_true', + help='disable when the input size is not fixed.') + self.parser.add_argument('--seed', type=int, default=317, + help='random seed') # from CornerNet + + # log + self.parser.add_argument('--print_iter', type=int, default=0, + help='disable progress bar and print to screen.') + self.parser.add_argument('--hide_data_time', action='store_true', + help='not display time during training.') + self.parser.add_argument('--save_all', action='store_true', + help='save model to disk every 5 epochs.') + self.parser.add_argument('--metric', default='loss', + help='main metric to save best model') + self.parser.add_argument('--vis_thresh', type=float, default=0.5, + help='visualization threshold.') + + # model + self.parser.add_argument('--arch', default='dla_34', + help='model architecture. Currently tested' + 'resdcn_34 | resdcn_50 | resfpndcn_34 |' + 'dla_34 | hrnet_32') + self.parser.add_argument('--head_conv', type=int, default=-1, + help='conv layer channels for output head' + '0 for no conv layer' + '-1 for default setting: ' + '256 for resnets and 256 for dla.') + self.parser.add_argument('--down_ratio', type=int, default=4, + help='output stride. Currently only supports 4.') + + # input + self.parser.add_argument('--input_res', type=int, default=-1, + help='input height and width. -1 for default from ' + 'dataset. Will be overriden by input_h | input_w') + self.parser.add_argument('--input_h', type=int, default=-1, + help='input height. -1 for default from dataset.') + self.parser.add_argument('--input_w', type=int, default=-1, + help='input width. 
-1 for default from dataset.') + + # train + self.parser.add_argument('--lr', type=float, default=1e-4, + help='learning rate for batch size 32.') + self.parser.add_argument('--lr_step', type=str, default='20,27', + help='drop learning rate by 10.') + self.parser.add_argument('--num_epochs', type=int, default=30, + help='total training epochs.') + self.parser.add_argument('--batch_size', type=int, default=12, + help='batch size') + self.parser.add_argument('--master_batch_size', type=int, default=-1, + help='batch size on the master gpu.') + self.parser.add_argument('--num_iters', type=int, default=-1, + help='default: #samples / batch_size.') + self.parser.add_argument('--val_intervals', type=int, default=5, + help='number of epochs to run validation.') + self.parser.add_argument('--trainval', action='store_true', + help='include validation in training and ' + 'test on test set') + + # test + self.parser.add_argument('--K', type=int, default=128, + help='max number of output objects.') + self.parser.add_argument('--not_prefetch_test', action='store_true', + help='not use parallal data pre-processing.') + self.parser.add_argument('--fix_res', action='store_true', + help='fix testing resolution or keep ' + 'the original resolution') + self.parser.add_argument('--keep_res', action='store_true', + help='keep the original resolution' + ' during validation.') + # tracking + self.parser.add_argument('--test_mot16', default=False, help='test mot16') + self.parser.add_argument('--val_mot15', default=False, help='val mot15') + self.parser.add_argument('--test_mot15', default=False, help='test mot15') + self.parser.add_argument('--val_mot16', default=False, help='val mot16 or mot15') + self.parser.add_argument('--test_mot17', default=False, help='test mot17') + self.parser.add_argument('--val_mot17', default=False, help='val mot17') + self.parser.add_argument('--val_mot20', default=False, help='val mot20') + self.parser.add_argument('--test_mot20', default=False, help='test mot20') + self.parser.add_argument('--conf_thres', type=float, default=0.6, help='confidence thresh for tracking') + self.parser.add_argument('--det_thres', type=float, default=0.3, help='confidence thresh for detection') + self.parser.add_argument('--nms_thres', type=float, default=0.4, help='iou thresh for nms') + self.parser.add_argument('--track_buffer', type=int, default=30, help='tracking buffer') + self.parser.add_argument('--min-box-area', type=float, default=200, help='filter out tiny boxes') + self.parser.add_argument('--input-video', type=str, default='../videos/MOT16-03.mp4', help='path to the input video') + self.parser.add_argument('--output-format', type=str, default='video', help='video or text') + self.parser.add_argument('--output-root', type=str, default='../results', help='expected output root path') + + # mot + self.parser.add_argument('--data_cfg', type=str, + default='../src/lib/cfg/data.json', + help='load data from cfg') + self.parser.add_argument('--data_dir', type=str, default='/data/yfzhang/MOT/JDE') + + # loss + self.parser.add_argument('--mse_loss', action='store_true', + help='use mse loss or focal loss to train ' + 'keypoint heatmaps.') + + self.parser.add_argument('--reg_loss', default='l1', + help='regression loss: sl1 | l1 | l2') + self.parser.add_argument('--hm_weight', type=float, default=1, + help='loss weight for keypoint heatmaps.') + self.parser.add_argument('--off_weight', type=float, default=1, + help='loss weight for keypoint local offsets.') + self.parser.add_argument('--wh_weight', 
type=float, default=0.1, + help='loss weight for bounding box size.') + self.parser.add_argument('--id_loss', default='ce', + help='reid loss: ce | triplet') + self.parser.add_argument('--id_weight', type=float, default=1, + help='loss weight for id') + self.parser.add_argument('--reid_dim', type=int, default=512, + help='feature dim for reid') + + self.parser.add_argument('--norm_wh', action='store_true', + help='L1(\hat(y) / y, 1) or L1(\hat(y), y)') + self.parser.add_argument('--dense_wh', action='store_true', + help='apply weighted regression near center or ' + 'just apply regression on center point.') + self.parser.add_argument('--cat_spec_wh', action='store_true', + help='category specific bounding box size.') + self.parser.add_argument('--not_reg_offset', action='store_true', + help='not regress local offset.') + + def parse(self, args=''): + if args == '': + opt = self.parser.parse_args() + else: + opt = self.parser.parse_args(args) + + opt.gpus_str = opt.gpus + opt.gpus = [int(gpu) for gpu in opt.gpus.split(',')] + opt.gpus = [i for i in range(len(opt.gpus))] if opt.gpus[0] >=0 else [-1] + opt.lr_step = [int(i) for i in opt.lr_step.split(',')] + + opt.fix_res = not opt.keep_res + print('Fix size testing.' if opt.fix_res else 'Keep resolution testing.') + opt.reg_offset = not opt.not_reg_offset + + if opt.head_conv == -1: # init default head_conv + opt.head_conv = 256 if 'dla' in opt.arch else 256 + opt.pad = 31 + opt.num_stacks = 1 + + if opt.trainval: + opt.val_intervals = 100000000 + + if opt.master_batch_size == -1: + opt.master_batch_size = opt.batch_size // len(opt.gpus) + rest_batch_size = (opt.batch_size - opt.master_batch_size) + opt.chunk_sizes = [opt.master_batch_size] + for i in range(len(opt.gpus) - 1): + slave_chunk_size = rest_batch_size // (len(opt.gpus) - 1) + if i < rest_batch_size % (len(opt.gpus) - 1): + slave_chunk_size += 1 + opt.chunk_sizes.append(slave_chunk_size) + print('training chunk_sizes:', opt.chunk_sizes) + + opt.root_dir = os.path.join(os.path.dirname(__file__), '..', '..') + opt.exp_dir = os.path.join(opt.root_dir, 'exp', opt.task) + opt.save_dir = os.path.join(opt.exp_dir, opt.exp_id) + opt.debug_dir = os.path.join(opt.save_dir, 'debug') + print('The output will be saved to ', opt.save_dir) + + if opt.resume and opt.load_model == '': + model_path = opt.save_dir[:-4] if opt.save_dir.endswith('TEST') \ + else opt.save_dir + opt.load_model = os.path.join(model_path, 'model_last.pth') + return opt + + def update_dataset_info_and_set_heads(self, opt, dataset): + input_h, input_w = dataset.default_resolution + opt.mean, opt.std = dataset.mean, dataset.std + opt.num_classes = dataset.num_classes + + # input_h(w): opt.input_h overrides opt.input_res overrides dataset default + input_h = opt.input_res if opt.input_res > 0 else input_h + input_w = opt.input_res if opt.input_res > 0 else input_w + opt.input_h = opt.input_h if opt.input_h > 0 else input_h + opt.input_w = opt.input_w if opt.input_w > 0 else input_w + opt.output_h = opt.input_h // opt.down_ratio + opt.output_w = opt.input_w // opt.down_ratio + opt.input_res = max(opt.input_h, opt.input_w) + opt.output_res = max(opt.output_h, opt.output_w) + + if opt.task == 'mot': + opt.heads = {'hm': opt.num_classes, + 'wh': 2 if not opt.cat_spec_wh else 2 * opt.num_classes, + 'id': opt.reid_dim} + if opt.reg_offset: + opt.heads.update({'reg': 2}) + opt.nID = dataset.nID + opt.img_size = (1088, 608) + else: + assert 0, 'task not defined!' 
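        # For the default single-class 'mot' task this builds
        # heads = {'hm': 1, 'wh': 2, 'id': 512, 'reg': 2}: a one-channel centre
        # heatmap, a two-channel box-size head, a reid_dim-channel embedding head
        # and, unless --not_reg_offset is passed, a two-channel sub-pixel offset head.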
+ print('heads', opt.heads) + return opt + + def init(self, args=''): + default_dataset_info = { + 'mot': {'default_resolution': [608, 1088], 'num_classes': 1, + 'mean': [0.408, 0.447, 0.470], 'std': [0.289, 0.274, 0.278], + 'dataset': 'jde', 'nID': 14455}, + } + class Struct: + def __init__(self, entries): + for k, v in entries.items(): + self.__setattr__(k, v) + opt = self.parse(args) + dataset = Struct(default_dataset_info[opt.task]) + opt.dataset = dataset.dataset + opt = self.update_dataset_info_and_set_heads(opt, dataset) + return opt diff --git a/src/lib/tracker/basetrack.py b/src/lib/tracker/basetrack.py new file mode 100644 index 0000000..bd6f20d --- /dev/null +++ b/src/lib/tracker/basetrack.py @@ -0,0 +1,57 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +import numpy as np +from collections import OrderedDict + + +class TrackState(object): + New = 0 + Tracked = 1 + Lost = 2 + Removed = 3 + + +class BaseTrack(object): + _count = 0 + + track_id = 0 + is_activated = False + state = TrackState.New + + history = OrderedDict() + features = [] + curr_feature = None + score = 0 + start_frame = 0 + frame_id = 0 + time_since_update = 0 + + # multi-camera + location = (np.inf, np.inf) + + @property + def end_frame(self): + return self.frame_id + + @staticmethod + def next_id(): + BaseTrack._count += 1 + return BaseTrack._count + + def activate(self, *args): + raise NotImplementedError + + def predict(self): + raise NotImplementedError + + def update(self, *args, **kwargs): + raise NotImplementedError + + def mark_lost(self): + self.state = TrackState.Lost + + def mark_removed(self): + self.state = TrackState.Removed \ No newline at end of file diff --git a/src/lib/tracker/matching.py b/src/lib/tracker/matching.py new file mode 100644 index 0000000..f88f2cd --- /dev/null +++ b/src/lib/tracker/matching.py @@ -0,0 +1,139 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
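Tying the options module above together: opts().init(...) parses the command line, fills in the built-in JDE dataset defaults and then calls update_dataset_info_and_set_heads, so downstream code only ever sees a fully populated opt. A minimal sketch; the checkpoint path and threshold are illustrative, and it assumes src/lib is on sys.path (e.g. via _init_paths):

    from opts import opts

    opt = opts().init(['mot', '--load_model', 'path/to/model.pth', '--conf_thres', '0.4'])
    print(opt.heads)     # e.g. {'hm': 1, 'wh': 2, 'id': 512, 'reg': 2}
    print(opt.img_size)  # (1088, 608)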
+# ------------------------------------------------------------------------------ + +import lap +import numpy as np +import scipy +from cython_bbox import bbox_overlaps as bbox_ious +from scipy.spatial.distance import cdist +from tracking_utils import kalman_filter + + +def merge_matches(m1, m2, shape): + O,P,Q = shape + m1 = np.asarray(m1) + m2 = np.asarray(m2) + + M1 = scipy.sparse.coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) + M2 = scipy.sparse.coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) + + mask = M1*M2 + match = mask.nonzero() + match = list(zip(match[0], match[1])) + unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) + unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) + + return match, unmatched_O, unmatched_Q + + +def _indices_to_matches(cost_matrix, indices, thresh): + matched_cost = cost_matrix[tuple(zip(*indices))] + matched_mask = (matched_cost <= thresh) + + matches = indices[matched_mask] + unmatched_a = tuple(set(range(cost_matrix.shape[0])) - set(matches[:, 0])) + unmatched_b = tuple(set(range(cost_matrix.shape[1])) - set(matches[:, 1])) + + return matches, unmatched_a, unmatched_b + + +def linear_assignment(cost_matrix, thresh): + if cost_matrix.size == 0: + return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1])) + matches, unmatched_a, unmatched_b = [], [], [] + cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh) + for ix, mx in enumerate(x): + if mx >= 0: + matches.append([ix, mx]) + unmatched_a = np.where(x < 0)[0] + unmatched_b = np.where(y < 0)[0] + matches = np.asarray(matches) + return matches, unmatched_a, unmatched_b + + +def ious(atlbrs, btlbrs): + """ + Compute cost based on IoU + :type atlbrs: list[tlbr] | np.ndarray + :type atlbrs: list[tlbr] | np.ndarray + + :rtype ious np.ndarray + """ + ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) + if ious.size == 0: + return ious + + ious = bbox_ious( + np.ascontiguousarray(atlbrs, dtype=np.float), + np.ascontiguousarray(btlbrs, dtype=np.float) + ) + + return ious + + +def iou_distance(atracks, btracks): + """ + Compute cost based on IoU + :type atracks: list[STrack] + :type btracks: list[STrack] + + :rtype cost_matrix np.ndarray + """ + + if (len(atracks)>0 and isinstance(atracks[0], np.ndarray)) or (len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): + atlbrs = atracks + btlbrs = btracks + else: + atlbrs = [track.tlbr for track in atracks] + btlbrs = [track.tlbr for track in btracks] + _ious = ious(atlbrs, btlbrs) + cost_matrix = 1 - _ious + + return cost_matrix + +def embedding_distance(tracks, detections, metric='cosine'): + """ + :param tracks: list[STrack] + :param detections: list[BaseTrack] + :param metric: + :return: cost_matrix np.ndarray + """ + + cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) + if cost_matrix.size == 0: + return cost_matrix + det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float) + #for i, track in enumerate(tracks): + #cost_matrix[i, :] = np.maximum(0.0, cdist(track.smooth_feat.reshape(1,-1), det_features, metric)) + track_features = np.asarray([track.smooth_feat for track in tracks], dtype=np.float) + cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric)) # Nomalized features + return cost_matrix + + +def gate_cost_matrix(kf, cost_matrix, tracks, detections, only_position=False): + if cost_matrix.size == 0: + return cost_matrix + gating_dim = 2 if 
only_position else 4 + gating_threshold = kalman_filter.chi2inv95[gating_dim] + measurements = np.asarray([det.to_xyah() for det in detections]) + for row, track in enumerate(tracks): + gating_distance = kf.gating_distance( + track.mean, track.covariance, measurements, only_position) + cost_matrix[row, gating_distance > gating_threshold] = np.inf + return cost_matrix + + +def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False, lambda_=0.98): + if cost_matrix.size == 0: + return cost_matrix + gating_dim = 2 if only_position else 4 + gating_threshold = kalman_filter.chi2inv95[gating_dim] + measurements = np.asarray([det.to_xyah() for det in detections]) + for row, track in enumerate(tracks): + gating_distance = kf.gating_distance( + track.mean, track.covariance, measurements, only_position, metric='maha') + cost_matrix[row, gating_distance > gating_threshold] = np.inf + cost_matrix[row] = lambda_ * cost_matrix[row] + (1 - lambda_) * gating_distance + return cost_matrix diff --git a/src/lib/tracker/multitracker.py b/src/lib/tracker/multitracker.py new file mode 100644 index 0000000..4fc2dc5 --- /dev/null +++ b/src/lib/tracker/multitracker.py @@ -0,0 +1,419 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +from collections import deque + +import numpy as np +import torch +import torch.nn.functional as F +from models import * +from models.decode import mot_decode +from models.model import create_model, load_model +from models.utils import _tranpose_and_gather_feat +from tracker import matching +from tracking_utils.kalman_filter import KalmanFilter +from tracking_utils.log import logger +from tracking_utils.utils import * +from utils.post_process import ctdet_post_process + +from .basetrack import BaseTrack, TrackState + + +class STrack(BaseTrack): + shared_kalman = KalmanFilter() + def __init__(self, tlwh, score, temp_feat, buffer_size=30): + + # wait activate + self._tlwh = np.asarray(tlwh, dtype=np.float) + self.kalman_filter = None + self.mean, self.covariance = None, None + self.is_activated = False + + self.score = score + self.tracklet_len = 0 + + self.smooth_feat = None + self.update_features(temp_feat) + self.features = deque([], maxlen=buffer_size) + self.alpha = 0.9 + + def update_features(self, feat): + feat /= np.linalg.norm(feat) + self.curr_feat = feat + if self.smooth_feat is None: + self.smooth_feat = feat + else: + self.smooth_feat = self.alpha * self.smooth_feat + (1 - self.alpha) * feat + self.features.append(feat) + self.smooth_feat /= np.linalg.norm(self.smooth_feat) + + def predict(self): + mean_state = self.mean.copy() + if self.state != TrackState.Tracked: + mean_state[7] = 0 + self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance) + + @staticmethod + def multi_predict(stracks): + if len(stracks) > 0: + multi_mean = np.asarray([st.mean.copy() for st in stracks]) + multi_covariance = np.asarray([st.covariance for st in stracks]) + for i, st in enumerate(stracks): + if st.state != TrackState.Tracked: + multi_mean[i][7] = 0 + multi_mean, multi_covariance = STrack.shared_kalman.multi_predict(multi_mean, multi_covariance) + for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): + stracks[i].mean = mean + stracks[i].covariance = cov + + def activate(self, kalman_filter, frame_id): + """Start a new 
tracklet""" + self.kalman_filter = kalman_filter + self.track_id = self.next_id() + self.mean, self.covariance = self.kalman_filter.initiate(self.tlwh_to_xyah(self._tlwh)) + + self.tracklet_len = 0 + self.state = TrackState.Tracked + #self.is_activated = True + self.frame_id = frame_id + self.start_frame = frame_id + + def re_activate(self, new_track, frame_id, new_id=False): + self.mean, self.covariance = self.kalman_filter.update( + self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh) + ) + + self.update_features(new_track.curr_feat) + self.tracklet_len = 0 + self.state = TrackState.Tracked + self.is_activated = True + self.frame_id = frame_id + if new_id: + self.track_id = self.next_id() + + def update(self, new_track, frame_id, update_feature=True): + """ + Update a matched track + :type new_track: STrack + :type frame_id: int + :type update_feature: bool + :return: + """ + self.frame_id = frame_id + self.tracklet_len += 1 + + new_tlwh = new_track.tlwh + self.mean, self.covariance = self.kalman_filter.update( + self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh)) + self.state = TrackState.Tracked + self.is_activated = True + + self.score = new_track.score + if update_feature: + self.update_features(new_track.curr_feat) + + @property + # @jit(nopython=True) + def tlwh(self): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. + """ + if self.mean is None: + return self._tlwh.copy() + ret = self.mean[:4].copy() + ret[2] *= ret[3] + ret[:2] -= ret[2:] / 2 + return ret + + @property + # @jit(nopython=True) + def tlbr(self): + """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., + `(top left, bottom right)`. + """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + @staticmethod + # @jit(nopython=True) + def tlwh_to_xyah(tlwh): + """Convert bounding box to format `(center x, center y, aspect ratio, + height)`, where the aspect ratio is `width / height`. 
+ """ + ret = np.asarray(tlwh).copy() + ret[:2] += ret[2:] / 2 + ret[2] /= ret[3] + return ret + + def to_xyah(self): + return self.tlwh_to_xyah(self.tlwh) + + @staticmethod + # @jit(nopython=True) + def tlbr_to_tlwh(tlbr): + ret = np.asarray(tlbr).copy() + ret[2:] -= ret[:2] + return ret + + @staticmethod + # @jit(nopython=True) + def tlwh_to_tlbr(tlwh): + ret = np.asarray(tlwh).copy() + ret[2:] += ret[:2] + return ret + + def __repr__(self): + return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame, self.end_frame) + + +class JDETracker(object): + def __init__(self, opt, frame_rate=30): + self.opt = opt + if opt.gpus[0] >= 0: + opt.device = torch.device('cuda') + else: + opt.device = torch.device('cpu') + print('Creating model...') + self.model = create_model(opt.arch, opt.heads, opt.head_conv) + self.model = load_model(self.model, opt.load_model) + self.model = self.model.to(opt.device) + self.model.eval() + + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.det_thresh = opt.conf_thres + self.buffer_size = int(frame_rate / 30.0 * opt.track_buffer) + self.max_time_lost = self.buffer_size + self.max_per_image = 128 + self.mean = np.array(opt.mean, dtype=np.float32).reshape(1, 1, 3) + self.std = np.array(opt.std, dtype=np.float32).reshape(1, 1, 3) + + self.kalman_filter = KalmanFilter() + + def post_process(self, dets, meta): + dets = dets.detach().cpu().numpy() + dets = dets.reshape(1, -1, dets.shape[2]) + dets = ctdet_post_process( + dets.copy(), [meta['c']], [meta['s']], + meta['out_height'], meta['out_width'], self.opt.num_classes) + for j in range(1, self.opt.num_classes + 1): + dets[0][j] = np.array(dets[0][j], dtype=np.float32).reshape(-1, 5) + return dets[0] + + def merge_outputs(self, detections): + results = {} + for j in range(1, self.opt.num_classes + 1): + results[j] = np.concatenate( + [detection[j] for detection in detections], axis=0).astype(np.float32) + + scores = np.hstack( + [results[j][:, 4] for j in range(1, self.opt.num_classes + 1)]) + if len(scores) > self.max_per_image: + kth = len(scores) - self.max_per_image + thresh = np.partition(scores, kth)[kth] + for j in range(1, self.opt.num_classes + 1): + keep_inds = (results[j][:, 4] >= thresh) + results[j] = results[j][keep_inds] + return results + + def update(self, im_blob, img0): + self.frame_id += 1 + activated_starcks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + width = img0.shape[1] + height = img0.shape[0] + inp_height = im_blob.shape[2] + inp_width = im_blob.shape[3] + c = np.array([width / 2., height / 2.], dtype=np.float32) + s = max(float(inp_width) / float(inp_height) * height, width) * 1.0 + meta = {'c': c, 's': s, + 'out_height': inp_height // self.opt.down_ratio, + 'out_width': inp_width // self.opt.down_ratio} + + ''' Step 1: Network forward, get detections & embeddings''' + with torch.no_grad(): + output = self.model(im_blob)[-1] + hm = output['hm'].sigmoid_() + wh = output['wh'] + id_feature = output['id'] + id_feature = F.normalize(id_feature, dim=1) + + reg = output['reg'] if self.opt.reg_offset else None + dets, inds = mot_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K) + id_feature = _tranpose_and_gather_feat(id_feature, inds) + id_feature = id_feature.squeeze(0) + id_feature = id_feature.cpu().numpy() + + dets = self.post_process(dets, meta) + dets = self.merge_outputs([dets])[1] + + remain_inds = dets[:, 4] > 
self.opt.conf_thres + dets = dets[remain_inds] + id_feature = id_feature[remain_inds] + + # vis + ''' + for i in range(0, dets.shape[0]): + bbox = dets[i][0:4] + cv2.rectangle(img0, (bbox[0], bbox[1]), + (bbox[2], bbox[3]), + (0, 255, 0), 2) + cv2.imshow('dets', img0) + cv2.waitKey(0) + id0 = id0-1 + ''' + + if len(dets) > 0: + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f, 30) for + (tlbrs, f) in zip(dets[:, :5], id_feature)] + else: + detections = [] + + ''' Add newly detected tracklets to tracked_stracks''' + unconfirmed = [] + tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + + ''' Step 2: First association, with embedding''' + strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) + # Predict the current location with KF + #for strack in strack_pool: + #strack.predict() + STrack.multi_predict(strack_pool) + dists = matching.embedding_distance(strack_pool, detections) + #dists = matching.gate_cost_matrix(self.kalman_filter, dists, strack_pool, detections) + dists = matching.fuse_motion(self.kalman_filter, dists, strack_pool, detections) + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + ''' Step 3: Second association, with IOU''' + detections = [detections[i] for i in u_detection] + r_tracked_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state == TrackState.Tracked] + dists = matching.iou_distance(r_tracked_stracks, detections) + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.5) + + for itracked, idet in matches: + track = r_tracked_stracks[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(det, self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + for it in u_track: + track = r_tracked_stracks[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + + '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7) + for itracked, idet in matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + activated_starcks.append(unconfirmed[itracked]) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + """ Step 4: Init new stracks""" + for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.kalman_filter, self.frame_id) + activated_starcks.append(track) + """ Step 5: Update state""" + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + + # print('Ramained match {} s'.format(t4-t3)) + + self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] + self.tracked_stracks = joint_stracks(self.tracked_stracks, 
activated_starcks) + self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks) + # get scores of lost tracks + output_stracks = [track for track in self.tracked_stracks if track.is_activated] + + logger.debug('===========Frame {}=========='.format(self.frame_id)) + logger.debug('Activated: {}'.format([track.track_id for track in activated_starcks])) + logger.debug('Refind: {}'.format([track.track_id for track in refind_stracks])) + logger.debug('Lost: {}'.format([track.track_id for track in lost_stracks])) + logger.debug('Removed: {}'.format([track.track_id for track in removed_stracks])) + + return output_stracks + + +def joint_stracks(tlista, tlistb): + exists = {} + res = [] + for t in tlista: + exists[t.track_id] = 1 + res.append(t) + for t in tlistb: + tid = t.track_id + if not exists.get(tid, 0): + exists[tid] = 1 + res.append(t) + return res + + +def sub_stracks(tlista, tlistb): + stracks = {} + for t in tlista: + stracks[t.track_id] = t + for t in tlistb: + tid = t.track_id + if stracks.get(tid, 0): + del stracks[tid] + return list(stracks.values()) + + +def remove_duplicate_stracks(stracksa, stracksb): + pdist = matching.iou_distance(stracksa, stracksb) + pairs = np.where(pdist < 0.15) + dupa, dupb = list(), list() + for p, q in zip(*pairs): + timep = stracksa[p].frame_id - stracksa[p].start_frame + timeq = stracksb[q].frame_id - stracksb[q].start_frame + if timep > timeq: + dupb.append(q) + else: + dupa.append(p) + resa = [t for i, t in enumerate(stracksa) if not i in dupa] + resb = [t for i, t in enumerate(stracksb) if not i in dupb] + return resa, resb \ No newline at end of file diff --git a/src/lib/tracking_utils/evaluation.py b/src/lib/tracking_utils/evaluation.py new file mode 100644 index 0000000..88369c3 --- /dev/null +++ b/src/lib/tracking_utils/evaluation.py @@ -0,0 +1,117 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
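# Illustration only: joint_stracks() and sub_stracks() above treat track lists as
# sets keyed by track_id (union and difference, keeping the first copy of a
# duplicate id). A minimal sketch with a stand-in object shows the intended
# semantics; the real STrack carries far more state than a bare track_id, and the
# helpers are assumed to be in scope from the tracker module above.
from types import SimpleNamespace

a = [SimpleNamespace(track_id=1), SimpleNamespace(track_id=2)]
b = [SimpleNamespace(track_id=2), SimpleNamespace(track_id=3)]

union = joint_stracks(a, b)   # ids 1, 2, 3 -- the duplicate id 2 from b is dropped
diff = sub_stracks(a, b)      # ids that also appear in b are removed, leaving id 1

assert sorted(t.track_id for t in union) == [1, 2, 3]
assert [t.track_id for t in diff] == [1]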
+# ------------------------------------------------------------------------------ + +import os +import numpy as np +import copy +import motmetrics as mm +mm.lap.default_solver = 'lap' + +from tracking_utils.io import read_results, unzip_objs + + +class Evaluator(object): + + def __init__(self, data_root, seq_name, data_type): + self.data_root = data_root + self.seq_name = seq_name + self.data_type = data_type + + self.load_annotations() + self.reset_accumulator() + + def load_annotations(self): + assert self.data_type == 'mot' + + gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt') + self.gt_frame_dict = read_results(gt_filename, self.data_type, is_gt=True) + self.gt_ignore_frame_dict = read_results(gt_filename, self.data_type, is_ignore=True) + + def reset_accumulator(self): + self.acc = mm.MOTAccumulator(auto_id=True) + + def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): + # results + trk_tlwhs = np.copy(trk_tlwhs) + trk_ids = np.copy(trk_ids) + + # gts + gt_objs = self.gt_frame_dict.get(frame_id, []) + gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] + + # ignore boxes + ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) + ignore_tlwhs = unzip_objs(ignore_objs)[0] + + # remove ignored results + keep = np.ones(len(trk_tlwhs), dtype=bool) + iou_distance = mm.distances.iou_matrix(ignore_tlwhs, trk_tlwhs, max_iou=0.5) + if len(iou_distance) > 0: + match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) + match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) + match_ious = iou_distance[match_is, match_js] + + match_js = np.asarray(match_js, dtype=int) + match_js = match_js[np.logical_not(np.isnan(match_ious))] + keep[match_js] = False + trk_tlwhs = trk_tlwhs[keep] + trk_ids = trk_ids[keep] + #match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) + #match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) + #match_ious = iou_distance[match_is, match_js] + + #match_js = np.asarray(match_js, dtype=int) + #match_js = match_js[np.logical_not(np.isnan(match_ious))] + #keep[match_js] = False + #trk_tlwhs = trk_tlwhs[keep] + #trk_ids = trk_ids[keep] + + # get distance matrix + iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) + + # acc + self.acc.update(gt_ids, trk_ids, iou_distance) + + if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'last_mot_events'): + events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics + else: + events = None + return events + + def eval_file(self, filename): + self.reset_accumulator() + + result_frame_dict = read_results(filename, self.data_type, is_gt=False) + frames = sorted(list(set(self.gt_frame_dict.keys()) | set(result_frame_dict.keys()))) + for frame_id in frames: + trk_objs = result_frame_dict.get(frame_id, []) + trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] + self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) + + return self.acc + + @staticmethod + def get_summary(accs, names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): + names = copy.deepcopy(names) + if metrics is None: + metrics = mm.metrics.motchallenge_metrics + metrics = copy.deepcopy(metrics) + + mh = mm.metrics.create() + summary = mh.compute_many( + accs, + metrics=metrics, + names=names, + generate_overall=True + ) + + return summary + + @staticmethod + def save_summary(summary, filename): + import pandas as pd + writer = pd.ExcelWriter(filename) + 
summary.to_excel(writer) + writer.save() diff --git a/src/lib/tracking_utils/io.py b/src/lib/tracking_utils/io.py new file mode 100644 index 0000000..4944903 --- /dev/null +++ b/src/lib/tracking_utils/io.py @@ -0,0 +1,117 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +import os +from typing import Dict +import numpy as np + +from tracking_utils.log import logger + + +def write_results(filename, results_dict: Dict, data_type: str): + if not filename: + return + path = os.path.dirname(filename) + if not os.path.exists(path): + os.makedirs(path) + + if data_type in ('mot', 'mcmot', 'lab'): + save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' + elif data_type == 'kitti': + save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n' + else: + raise ValueError(data_type) + + with open(filename, 'w') as f: + for frame_id, frame_data in results_dict.items(): + if data_type == 'kitti': + frame_id -= 1 + for tlwh, track_id in frame_data: + if track_id < 0: + continue + x1, y1, w, h = tlwh + x2, y2 = x1 + w, y1 + h + line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=1.0) + f.write(line) + logger.info('Save results to {}'.format(filename)) + + +def read_results(filename, data_type: str, is_gt=False, is_ignore=False): + if data_type in ('mot', 'lab'): + read_fun = read_mot_results + else: + raise ValueError('Unknown data type: {}'.format(data_type)) + + return read_fun(filename, is_gt, is_ignore) + + +""" +labels={'ped', ... % 1 +'person_on_vhcl', ... % 2 +'car', ... % 3 +'bicycle', ... % 4 +'mbike', ... % 5 +'non_mot_vhcl', ... % 6 +'static_person', ... % 7 +'distractor', ... % 8 +'occluder', ... % 9 +'occluder_on_grnd', ... %10 +'occluder_full', ... % 11 +'reflection', ... % 12 +'crowd' ... 
% 13 +}; +""" + + +def read_mot_results(filename, is_gt, is_ignore): + valid_labels = {1} + ignore_labels = {2, 7, 8, 12} + results_dict = dict() + if os.path.isfile(filename): + with open(filename, 'r') as f: + for line in f.readlines(): + linelist = line.split(',') + if len(linelist) < 7: + continue + fid = int(linelist[0]) + if fid < 1: + continue + results_dict.setdefault(fid, list()) + + if is_gt: + if 'MOT16-' in filename or 'MOT17-' in filename: + label = int(float(linelist[7])) + mark = int(float(linelist[6])) + if mark == 0 or label not in valid_labels: + continue + score = 1 + elif is_ignore: + if 'MOT16-' in filename or 'MOT17-' in filename: + label = int(float(linelist[7])) + vis_ratio = float(linelist[8]) + if label not in ignore_labels and vis_ratio >= 0: + continue + else: + continue + score = 1 + else: + score = float(linelist[6]) + + tlwh = tuple(map(float, linelist[2:6])) + target_id = int(linelist[1]) + + results_dict[fid].append((tlwh, target_id, score)) + + return results_dict + + +def unzip_objs(objs): + if len(objs) > 0: + tlwhs, ids, scores = zip(*objs) + else: + tlwhs, ids, scores = [], [], [] + tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) + + return tlwhs, ids, scores \ No newline at end of file diff --git a/src/lib/tracking_utils/kalman_filter.py b/src/lib/tracking_utils/kalman_filter.py new file mode 100644 index 0000000..aeeead7 --- /dev/null +++ b/src/lib/tracking_utils/kalman_filter.py @@ -0,0 +1,273 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +import numpy as np +import scipy.linalg + +""" +Table for the 0.95 quantile of the chi-square distribution with N degrees of +freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv +function and used as Mahalanobis gating threshold. +""" +chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919} + + +class KalmanFilter(object): + """ + A simple Kalman filter for tracking bounding boxes in image space. + + The 8-dimensional state space + + x, y, a, h, vx, vy, va, vh + + contains the bounding box center position (x, y), aspect ratio a, height h, + and their respective velocities. + + Object motion follows a constant velocity model. The bounding box location + (x, y, a, h) is taken as direct observation of the state space (linear + observation model). + + """ + + def __init__(self): + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement): + """Create track from unassociated measurement. + + Parameters + ---------- + measurement : ndarray + Bounding box coordinates (x, y, a, h) with center position (x, y), + aspect ratio a, and height h. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector (8 dimensional) and covariance matrix (8x8 + dimensional) of the new track. Unobserved velocities are initialized + to 0 mean. 
+ + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], + 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 1e-5, + 10 * self._std_weight_velocity * measurement[3]] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean, covariance): + """Run Kalman filter prediction step. + + Parameters + ---------- + mean : ndarray + The 8 dimensional mean vector of the object state at the previous + time step. + covariance : ndarray + The 8x8 dimensional covariance matrix of the object state at the + previous time step. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. + + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-2, + self._std_weight_position * mean[3]] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], + 1e-5, + self._std_weight_velocity * mean[3]] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + #mean = np.dot(self._motion_mat, mean) + mean = np.dot(mean, self._motion_mat.T) + covariance = np.linalg.multi_dot(( + self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """Project state distribution to measurement space. + + Parameters + ---------- + mean : ndarray + The state's mean vector (8 dimensional array). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + + Returns + ------- + (ndarray, ndarray) + Returns the projected mean and covariance matrix of the given state + estimate. + + """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-1, + self._std_weight_position * mean[3]] + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot(( + self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def multi_predict(self, mean, covariance): + """Run Kalman filter prediction step (Vectorized version). + Parameters + ---------- + mean : ndarray + The Nx8 dimensional mean matrix of the object states at the previous + time step. + covariance : ndarray + The Nx8x8 dimensional covariance matrics of the object states at the + previous time step. + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. 
+ """ + std_pos = [ + self._std_weight_position * mean[:, 3], + self._std_weight_position * mean[:, 3], + 1e-2 * np.ones_like(mean[:, 3]), + self._std_weight_position * mean[:, 3]] + std_vel = [ + self._std_weight_velocity * mean[:, 3], + self._std_weight_velocity * mean[:, 3], + 1e-5 * np.ones_like(mean[:, 3]), + self._std_weight_velocity * mean[:, 3]] + sqr = np.square(np.r_[std_pos, std_vel]).T + + motion_cov = [] + for i in range(len(mean)): + motion_cov.append(np.diag(sqr[i])) + motion_cov = np.asarray(motion_cov) + + mean = np.dot(mean, self._motion_mat.T) + left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2)) + covariance = np.dot(left, self._motion_mat.T) + motion_cov + + return mean, covariance + + def update(self, mean, covariance, measurement): + """Run Kalman filter correction step. + + Parameters + ---------- + mean : ndarray + The predicted state's mean vector (8 dimensional). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + measurement : ndarray + The 4 dimensional measurement vector (x, y, a, h), where (x, y) + is the center position, a the aspect ratio, and h the height of the + bounding box. + + Returns + ------- + (ndarray, ndarray) + Returns the measurement-corrected state distribution. + + """ + projected_mean, projected_cov = self.project(mean, covariance) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve( + (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot(( + kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, mean, covariance, measurements, + only_position=False, metric='maha'): + """Compute gating distance between state distribution and measurements. + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + Parameters + ---------- + mean : ndarray + Mean vector over the state distribution (8 dimensional). + covariance : ndarray + Covariance of the state distribution (8x8 dimensional). + measurements : ndarray + An Nx4 dimensional matrix of N measurements, each in + format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position : Optional[bool] + If True, distance computation is done with respect to the bounding + box center position only. + Returns + ------- + ndarray + Returns an array of length N, where the i-th element contains the + squared Mahalanobis distance between (mean, covariance) and + `measurements[i]`. 
+ """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + d = measurements - mean + if metric == 'gaussian': + return np.sum(d * d, axis=1) + elif metric == 'maha': + cholesky_factor = np.linalg.cholesky(covariance) + z = scipy.linalg.solve_triangular( + cholesky_factor, d.T, lower=True, check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha + else: + raise ValueError('invalid distance metric') \ No newline at end of file diff --git a/src/lib/tracking_utils/log.py b/src/lib/tracking_utils/log.py new file mode 100644 index 0000000..b91d0e6 --- /dev/null +++ b/src/lib/tracking_utils/log.py @@ -0,0 +1,23 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +import logging + + +def get_logger(name='root'): + formatter = logging.Formatter( + # fmt='%(asctime)s [%(levelname)s]: %(filename)s(%(funcName)s:%(lineno)s) >> %(message)s') + fmt='%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') + + handler = logging.StreamHandler() + handler.setFormatter(formatter) + + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + logger.addHandler(handler) + return logger + + +logger = get_logger('root') diff --git a/src/lib/tracking_utils/nms.py b/src/lib/tracking_utils/nms.py new file mode 100644 index 0000000..78321af --- /dev/null +++ b/src/lib/tracking_utils/nms.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# from ._utils import _C +from tracking_utils import _C + +nms = _C.nms +# nms.__doc__ = """ +# This function performs Non-maximum suppresion""" diff --git a/src/lib/tracking_utils/parse_config.py b/src/lib/tracking_utils/parse_config.py new file mode 100644 index 0000000..00ae7af --- /dev/null +++ b/src/lib/tracking_utils/parse_config.py @@ -0,0 +1,40 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# ------------------------------------------------------------------------------ + +def parse_model_cfg(path): + """Parses the yolo-v3 layer configuration file and returns module definitions""" + file = open(path, 'r') + lines = file.read().split('\n') + lines = [x for x in lines if x and not x.startswith('#')] + lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces + module_defs = [] + for line in lines: + if line.startswith('['): # This marks the start of a new block + module_defs.append({}) + module_defs[-1]['type'] = line[1:-1].rstrip() + if module_defs[-1]['type'] == 'convolutional': + module_defs[-1]['batch_normalize'] = 0 + else: + key, value = line.split("=") + value = value.strip() + module_defs[-1][key.rstrip()] = value.strip() + + return module_defs + + +def parse_data_cfg(path): + """Parses the data configuration file""" + options = dict() + options['gpus'] = '0' + options['num_workers'] = '10' + with open(path, 'r') as fp: + lines = fp.readlines() + for line in lines: + line = line.strip() + if line == '' or line.startswith('#'): + continue + key, value = line.split('=') + options[key.strip()] = value.strip() + return options diff --git a/src/lib/tracking_utils/timer.py b/src/lib/tracking_utils/timer.py new file mode 100644 index 0000000..e79f1a3 --- /dev/null +++ b/src/lib/tracking_utils/timer.py @@ -0,0 +1,45 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import time + + +class Timer(object): + """A simple timer.""" + def __init__(self): + self.total_time = 0. + self.calls = 0 + self.start_time = 0. + self.diff = 0. + self.average_time = 0. + + self.duration = 0. + + def tic(self): + # using time.time instead of time.clock because time time.clock + # does not normalize for multithreading + self.start_time = time.time() + + def toc(self, average=True): + self.diff = time.time() - self.start_time + self.total_time += self.diff + self.calls += 1 + self.average_time = self.total_time / self.calls + if average: + self.duration = self.average_time + else: + self.duration = self.diff + return self.duration + + def clear(self): + self.total_time = 0. + self.calls = 0 + self.start_time = 0. + self.diff = 0. + self.average_time = 0. + self.duration = 0. + diff --git a/src/lib/tracking_utils/utils.py b/src/lib/tracking_utils/utils.py new file mode 100644 index 0000000..0f6fe11 --- /dev/null +++ b/src/lib/tracking_utils/utils.py @@ -0,0 +1,437 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
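# Illustration only: parse_data_cfg() above reads plain key=value files and layers
# the parsed keys over its 'gpus' / 'num_workers' defaults. The keys used below
# (root, train, test) are hypothetical; only the file format comes from the parser,
# which is assumed to be in scope.
import os
import tempfile

cfg_text = "root=/data/MOT\ntrain=./data/mot.train\ntest=./data/mot.test\n"
with tempfile.NamedTemporaryFile('w', suffix='.data', delete=False) as f:
    f.write(cfg_text)
    path = f.name

opts = parse_data_cfg(path)
os.remove(path)
# opts == {'gpus': '0', 'num_workers': '10', 'root': '/data/MOT',
#          'train': './data/mot.train', 'test': './data/mot.test'}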
+# ------------------------------------------------------------------------------ + +import glob +import os +import os.path as osp +import random + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.nn.functional as F +from torchvision.ops import nms + +# import maskrcnn_benchmark.layers.nms as nms +# Set printoptions +torch.set_printoptions(linewidth=1320, precision=5, profile='long') +np.set_printoptions(linewidth=320, formatter={'float_kind': '{:11.5g}'.format}) # format short g, %precision=5 + +def mkdir_if_missing(d): + if not osp.exists(d): + os.makedirs(d) + + +def float3(x): # format floats to 3 decimals + return float(format(x, '.3f')) + + +def init_seeds(seed=0): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def load_classes(path): + """ + Loads class labels at 'path' + """ + fp = open(path, 'r') + names = fp.read().split('\n') + return list(filter(None, names)) # filter removes empty strings (such as last line) + + +def model_info(model): # Plots a line-by-line description of a PyTorch model + n_p = sum(x.numel() for x in model.parameters()) # number parameters + n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients + print('\n%5s %50s %9s %12s %20s %12s %12s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) + for i, (name, p) in enumerate(model.named_parameters()): + name = name.replace('module_list.', '') + print('%5g %50s %9s %12g %20s %12.3g %12.3g' % ( + i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) + print('Model Summary: %g layers, %g parameters, %g gradients\n' % (i + 1, n_p, n_g)) + + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): # Plots one bounding box on image img + tl = line_thickness or round(0.0004 * max(img.shape[0:2])) + 1 # line thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1) # filled + cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA) + + +def weights_init_normal(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + torch.nn.init.normal_(m.weight.data, 0.0, 0.03) + elif classname.find('BatchNorm2d') != -1: + torch.nn.init.normal_(m.weight.data, 1.0, 0.03) + torch.nn.init.constant_(m.bias.data, 0.0) + + +def xyxy2xywh(x): + # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] + x[:, 2]) / 2 + y[:, 1] = (x[:, 1] + x[:, 3]) / 2 + y[:, 2] = x[:, 2] - x[:, 0] + y[:, 3] = x[:, 3] - x[:, 1] + return y + + +def xywh2xyxy(x): + # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] - x[:, 2] / 2) + y[:, 1] = (x[:, 1] - x[:, 3] / 2) + y[:, 2] = (x[:, 0] + x[:, 2] / 2) + y[:, 3] = (x[:, 1] + x[:, 3] / 2) + return y + + +def scale_coords(img_size, coords, img0_shape): + # Rescale x1, y1, x2, y2 from 416 to image size + gain_w = float(img_size[0]) / img0_shape[1] # gain = old / new + gain_h = 
float(img_size[1]) / img0_shape[0] + gain = min(gain_w, gain_h) + pad_x = (img_size[0] - img0_shape[1] * gain) / 2 # width padding + pad_y = (img_size[1] - img0_shape[0] * gain) / 2 # height padding + coords[:, [0, 2]] -= pad_x + coords[:, [1, 3]] -= pad_y + coords[:, 0:4] /= gain + coords[:, :4] = torch.clamp(coords[:, :4], min=0) + return coords + + +def ap_per_class(tp, conf, pred_cls, target_cls): + """ Compute the average precision, given the recall and precision curves. + Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (list). + conf: Objectness value from 0-1 (list). + pred_cls: Predicted object classes (list). + target_cls: True object classes (list). + # Returns + The average precision as computed in py-faster-rcnn. + """ + + # lists/pytorch to numpy + tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array(pred_cls), np.array(target_cls) + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0)) + + # Create Precision-Recall curve and compute AP for each class + ap, p, r = [], [], [] + for c in unique_classes: + i = pred_cls == c + n_gt = sum(target_cls == c) # Number of ground truth objects + n_p = sum(i) # Number of predicted objects + + if (n_p == 0) and (n_gt == 0): + continue + elif (n_p == 0) or (n_gt == 0): + ap.append(0) + r.append(0) + p.append(0) + else: + # Accumulate FPs and TPs + fpc = np.cumsum(1 - tp[i]) + tpc = np.cumsum(tp[i]) + + # Recall + recall_curve = tpc / (n_gt + 1e-16) + r.append(tpc[-1] / (n_gt + 1e-16)) + + # Precision + precision_curve = tpc / (tpc + fpc) + p.append(tpc[-1] / (tpc[-1] + fpc[-1])) + + # AP from recall-precision curve + ap.append(compute_ap(recall_curve, precision_curve)) + + return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(p) + + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves. + Code originally from https://github.com/rbgirshick/py-faster-rcnn. + # Arguments + recall: The recall curve (list). + precision: The precision curve (list). + # Returns + The average precision as computed in py-faster-rcnn. 
+ """ + # correct AP calculation + # first append sentinel values at the end + + mrec = np.concatenate(([0.], recall, [1.])) + mpre = np.concatenate(([0.], precision, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def bbox_iou(box1, box2, x1y1x2y2=False): + """ + Returns the IoU of two bounding boxes + """ + N, M = len(box1), len(box2) + if x1y1x2y2: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + else: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + + # get the coordinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1.unsqueeze(1), b2_x1) + inter_rect_y1 = torch.max(b1_y1.unsqueeze(1), b2_y1) + inter_rect_x2 = torch.min(b1_x2.unsqueeze(1), b2_x2) + inter_rect_y2 = torch.min(b1_y2.unsqueeze(1), b2_y2) + # Intersection area + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0) + # Union Area + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)) + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).view(-1,1).expand(N,M) + b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).view(1,-1).expand(N,M) + + return inter_area / (b1_area + b2_area - inter_area + 1e-16) + + +def build_targets_max(target, anchor_wh, nA, nC, nGh, nGw): + """ + returns nT, nCorrect, tx, ty, tw, th, tconf, tcls + """ + nB = len(target) # number of images in batch + + txy = torch.zeros(nB, nA, nGh, nGw, 2).cuda() # batch size, anchors, grid size + twh = torch.zeros(nB, nA, nGh, nGw, 2).cuda() + tconf = torch.LongTensor(nB, nA, nGh, nGw).fill_(0).cuda() + tcls = torch.ByteTensor(nB, nA, nGh, nGw, nC).fill_(0).cuda() # nC = number of classes + tid = torch.LongTensor(nB, nA, nGh, nGw, 1).fill_(-1).cuda() + for b in range(nB): + t = target[b] + t_id = t[:, 1].clone().long().cuda() + t = t[:,[0,2,3,4,5]] + nTb = len(t) # number of targets + if nTb == 0: + continue + + #gxy, gwh = t[:, 1:3] * nG, t[:, 3:5] * nG + gxy, gwh = t[: , 1:3].clone() , t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + gi = torch.clamp(gxy[:, 0], min=0, max=nGw -1).long() + gj = torch.clamp(gxy[:, 1], min=0, max=nGh -1).long() + + # Get grid box indices and prevent overflows (i.e. 
13.01 on 13 anchors) + #gi, gj = torch.clamp(gxy.long(), min=0, max=nG - 1).t() + #gi, gj = gxy.long().t() + + # iou of targets-anchors (using wh only) + box1 = gwh + box2 = anchor_wh.unsqueeze(1) + inter_area = torch.min(box1, box2).prod(2) + iou = inter_area / (box1.prod(1) + box2.prod(2) - inter_area + 1e-16) + + # Select best iou_pred and anchor + iou_best, a = iou.max(0) # best anchor [0-2] for each target + + # Select best unique target-anchor combinations + if nTb > 1: + _, iou_order = torch.sort(-iou_best) # best to worst + + # Unique anchor selection + u = torch.stack((gi, gj, a), 0)[:, iou_order] + # _, first_unique = np.unique(u, axis=1, return_index=True) # first unique indices + first_unique = return_torch_unique_index(u, torch.unique(u, dim=1)) # torch alternative + i = iou_order[first_unique] + # best anchor must share significant commonality (iou) with target + i = i[iou_best[i] > 0.60] # TODO: examine arbitrary threshold + if len(i) == 0: + continue + + a, gj, gi, t = a[i], gj[i], gi[i], t[i] + t_id = t_id[i] + if len(t.shape) == 1: + t = t.view(1, 5) + else: + if iou_best < 0.60: + continue + + tc, gxy, gwh = t[:, 0].long(), t[:, 1:3].clone(), t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + + # XY coordinates + txy[b, a, gj, gi] = gxy - gxy.floor() + + # Width and height + twh[b, a, gj, gi] = torch.log(gwh / anchor_wh[a]) # yolo method + # twh[b, a, gj, gi] = torch.sqrt(gwh / anchor_wh[a]) / 2 # power method + + # One-hot encoding of label + tcls[b, a, gj, gi, tc] = 1 + tconf[b, a, gj, gi] = 1 + tid[b, a, gj, gi] = t_id.unsqueeze(1) + tbox = torch.cat([txy, twh], -1) + return tconf, tbox, tid + + + + +def generate_anchor(nGh, nGw, anchor_wh): + nA = len(anchor_wh) + yy, xx =torch.meshgrid(torch.arange(nGh), torch.arange(nGw)) + xx, yy = xx.cuda(), yy.cuda() + + mesh = torch.stack([xx, yy], dim=0) # Shape 2, nGh, nGw + mesh = mesh.unsqueeze(0).repeat(nA,1,1,1).float() # Shape nA x 2 x nGh x nGw + anchor_offset_mesh = anchor_wh.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, nGh,nGw) # Shape nA x 2 x nGh x nGw + anchor_mesh = torch.cat([mesh, anchor_offset_mesh], dim=1) # Shape nA x 4 x nGh x nGw + return anchor_mesh + +def encode_delta(gt_box_list, fg_anchor_list): + px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ + fg_anchor_list[:, 2], fg_anchor_list[:,3] + gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \ + gt_box_list[:, 2], gt_box_list[:, 3] + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw/pw) + dh = torch.log(gh/ph) + return torch.stack([dx, dy, dw, dh], dim=1) + +def decode_delta(delta, fg_anchor_list): + px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ + fg_anchor_list[:, 2], fg_anchor_list[:,3] + dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] + gx = pw * dx + px + gy = ph * dy + py + gw = pw * torch.exp(dw) + gh = ph * torch.exp(dh) + return torch.stack([gx, gy, gw, gh], dim=1) + +def decode_delta_map(delta_map, anchors): + ''' + :param: delta_map, shape (nB, nA, nGh, nGw, 4) + :param: anchors, shape (nA,4) + ''' + nB, nA, nGh, nGw, _ = delta_map.shape + anchor_mesh = generate_anchor(nGh, nGw, anchors) + anchor_mesh = anchor_mesh.permute(0,2,3,1).contiguous() # Shpae (nA x nGh x nGw) x 4 + anchor_mesh = anchor_mesh.unsqueeze(0).repeat(nB,1,1,1,1) + pred_list = decode_delta(delta_map.view(-1,4), anchor_mesh.view(-1,4)) + pred_map = pred_list.view(nB, nA, nGh, nGw, 4) + return pred_map + + +def 
pooling_nms(heatmap, kernel=1): + pad = (kernel -1 ) // 2 + hmax = F.max_pool2d(heatmap, (kernel, kernel), stride=1, padding=pad) + keep = (hmax == heatmap).float() + return keep * heatmap + + +def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.2): + """ + Removes detections with lower object confidence score than 'conf_thres' + Non-Maximum Suppression to further filter detections. + Returns detections with shape: + (x1, y1, x2, y2, object_conf, class_score, class_pred) + """ + + output = [None for _ in range(len(prediction))] + for image_i, pred in enumerate(prediction): + # Filter out confidence scores below threshold + # Get score and class with highest confidence + + v = pred[:, 4] > conf_thres + v = v.nonzero().squeeze() + if len(v.shape) == 0: + v = v.unsqueeze(0) + + pred = pred[v] + + # If none are remaining => process next image + nP = pred.shape[0] + if not nP: + continue + # From (center x, center y, width, height) to (x1, y1, x2, y2) + pred[:, :4] = xywh2xyxy(pred[:, :4]) + nms_indices = nms(pred[:, :4], pred[:, 4], nms_thres) + det_max = pred[nms_indices] + + if len(det_max) > 0: + # Add max detections to outputs + output[image_i] = det_max if output[image_i] is None else torch.cat((output[image_i], det_max)) + + return output + + +def return_torch_unique_index(u, uv): + n = uv.shape[1] # number of columns + first_unique = torch.zeros(n, device=u.device).long() + for j in range(n): + first_unique[j] = (uv[:, j:j + 1] == u).all(0).nonzero()[0] + + return first_unique + + +def strip_optimizer_from_checkpoint(filename='weights/best.pt'): + # Strip optimizer from *.pt files for lighter files (reduced by 2/3 size) + + a = torch.load(filename, map_location='cpu') + a['optimizer'] = [] + torch.save(a, filename.replace('.pt', '_lite.pt')) + + +def plot_results(): + # Plot YOLO training results file 'results.txt' + # import os; os.system('wget https://storage.googleapis.com/ultralytics/yolov3/results_v1.txt') + + plt.figure(figsize=(14, 7)) + s = ['X + Y', 'Width + Height', 'Confidence', 'Classification', 'Total Loss', 'mAP', 'Recall', 'Precision'] + files = sorted(glob.glob('results*.txt')) + for f in files: + results = np.loadtxt(f, usecols=[2, 3, 4, 5, 6, 9, 10, 11]).T # column 11 is mAP + x = range(1, results.shape[1]) + for i in range(8): + plt.subplot(2, 4, i + 1) + plt.plot(x, results[i, x], marker='.', label=f) + plt.title(s[i]) + if i == 0: + plt.legend() diff --git a/src/lib/tracking_utils/visualization.py b/src/lib/tracking_utils/visualization.py new file mode 100644 index 0000000..b2ca803 --- /dev/null +++ b/src/lib/tracking_utils/visualization.py @@ -0,0 +1,95 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
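# Illustration only: pooling_nms() above keeps a heatmap cell only where it equals
# the max-pooled response around it, i.e. it zeroes out non-peak cells. kernel=3 is
# chosen here to make the effect visible (the default kernel=1 is a no-op), and
# pooling_nms is assumed to be in scope from the utilities above.
import torch

hm = torch.tensor([[[[0.1, 0.2, 0.1],
                     [0.2, 0.9, 0.3],
                     [0.1, 0.3, 0.2]]]])
peaks = pooling_nms(hm, kernel=3)

expected = torch.zeros_like(hm)
expected[0, 0, 1, 1] = 0.9      # only the local maximum survives
assert torch.allclose(peaks, expected)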
+# ------------------------------------------------------------------------------ + +import numpy as np +import cv2 + + +def tlwhs_to_tlbrs(tlwhs): + tlbrs = np.copy(tlwhs) + if len(tlbrs) == 0: + return tlbrs + tlbrs[:, 2] += tlwhs[:, 0] + tlbrs[:, 3] += tlwhs[:, 1] + return tlbrs + + +def get_color(idx): + idx = idx * 3 + color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) + + return color + + +def resize_image(image, max_size=800): + if max(image.shape[:2]) > max_size: + scale = float(max_size) / max(image.shape[:2]) + image = cv2.resize(image, None, fx=scale, fy=scale) + return image + + +def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0., ids2=None): + im = np.ascontiguousarray(np.copy(image)) + im_h, im_w = im.shape[:2] + + top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 + + text_scale = max(1, image.shape[1] / 1600.) + text_thickness = 1 if text_scale > 1.1 else 1 + line_thickness = max(1, int(image.shape[1] / 500.)) + + radius = max(5, int(im_w/140.)) + cv2.putText(im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), + (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2) + + for i, tlwh in enumerate(tlwhs): + x1, y1, w, h = tlwh + intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) + obj_id = int(obj_ids[i]) + id_text = '{}'.format(int(obj_id)) + if ids2 is not None: + id_text = id_text + ', {}'.format(int(ids2[i])) + _line_thickness = 1 if obj_id <= 0 else line_thickness + color = get_color(abs(obj_id)) + cv2.rectangle(im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) + cv2.putText(im, id_text, (intbox[0], intbox[1] + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), + thickness=text_thickness) + return im + + +def plot_trajectory(image, tlwhs, track_ids): + image = image.copy() + for one_tlwhs, track_id in zip(tlwhs, track_ids): + color = get_color(int(track_id)) + for tlwh in one_tlwhs: + x1, y1, w, h = tuple(map(int, tlwh)) + cv2.circle(image, (int(x1 + 0.5 * w), int(y1 + h)), 2, color, thickness=2) + + return image + + +def plot_detections(image, tlbrs, scores=None, color=(255, 0, 0), ids=None): + im = np.copy(image) + text_scale = max(1, image.shape[1] / 800.) + thickness = 2 if text_scale > 1.3 else 1 + for i, det in enumerate(tlbrs): + x1, y1, x2, y2 = np.asarray(det[:4], dtype=np.int) + if len(det) >= 7: + label = 'det' if det[5] > 0 else 'trk' + if ids is not None: + text = '{}# {:.2f}: {:d}'.format(label, det[6], ids[i]) + cv2.putText(im, text, (x1, y1 + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), + thickness=thickness) + else: + text = '{}# {:.2f}'.format(label, det[6]) + + if scores is not None: + text = '{:.2f}'.format(scores[i]) + cv2.putText(im, text, (x1, y1 + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), + thickness=thickness) + + cv2.rectangle(im, (x1, y1), (x2, y2), color, 2) + + return im diff --git a/src/lib/trains/base_trainer.py b/src/lib/trains/base_trainer.py new file mode 100644 index 0000000..3e59eb3 --- /dev/null +++ b/src/lib/trains/base_trainer.py @@ -0,0 +1,124 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
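# Illustration only: get_color() above derives a fixed BGR colour from a track id,
# so the same identity keeps the same colour across frames, and plot_tracking()
# draws one rectangle plus id label per (tlwh, obj_id) pair. The blank frame below
# is hypothetical, and both helpers are assumed to be in scope from the module above.
import numpy as np

assert get_color(1) == (111, 51, 87)    # (37*3 % 255, 17*3 % 255, 29*3 % 255)
assert get_color(1) != get_color(2)     # different ids map to different colours

frame = np.zeros((480, 640, 3), dtype=np.uint8)
vis = plot_tracking(frame, tlwhs=[(50, 60, 80, 160)], obj_ids=[1],
                    frame_id=0, fps=25.0)   # returns an annotated copy of `frame`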
+# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time +import torch +from progress.bar import Bar +from models.data_parallel import DataParallel +from utils.utils import AverageMeter + + +class ModleWithLoss(torch.nn.Module): + def __init__(self, model, loss): + super(ModleWithLoss, self).__init__() + self.model = model + self.loss = loss + + def forward(self, batch): + outputs = self.model(batch['input']) + loss, loss_stats = self.loss(outputs, batch) + return outputs[-1], loss, loss_stats + +class BaseTrainer(object): + def __init__( + self, opt, model, optimizer=None): + self.opt = opt + self.optimizer = optimizer + self.loss_stats, self.loss = self._get_losses(opt) + self.model_with_loss = ModleWithLoss(model, self.loss) + #self.optimizer.add_param_group({'params': self.loss.parameters()}) + + def set_device(self, gpus, chunk_sizes, device): + if len(gpus) > 1: + self.model_with_loss = DataParallel( + self.model_with_loss, device_ids=gpus, + chunk_sizes=chunk_sizes).to(device) + else: + self.model_with_loss = self.model_with_loss.to(device) + + for state in self.optimizer.state.values(): + for k, v in state.items(): + if isinstance(v, torch.Tensor): + state[k] = v.to(device=device, non_blocking=True) + + def run_epoch(self, phase, epoch, data_loader): + model_with_loss = self.model_with_loss + if phase == 'train': + model_with_loss.train() + else: + if len(self.opt.gpus) > 1: + model_with_loss = self.model_with_loss.module + model_with_loss.eval() + torch.cuda.empty_cache() + + opt = self.opt + results = {} + data_time, batch_time = AverageMeter(), AverageMeter() + avg_loss_stats = {l: AverageMeter() for l in self.loss_stats} + num_iters = len(data_loader) if opt.num_iters < 0 else opt.num_iters + bar = Bar('{}/{}'.format(opt.task, opt.exp_id), max=num_iters) + end = time.time() + for iter_id, batch in enumerate(data_loader): + if iter_id >= num_iters: + break + data_time.update(time.time() - end) + + for k in batch: + if k != 'meta': + batch[k] = batch[k].to(device=opt.device, non_blocking=True) + + output, loss, loss_stats = model_with_loss(batch) + loss = loss.mean() + if phase == 'train': + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + batch_time.update(time.time() - end) + end = time.time() + + Bar.suffix = '{phase}: [{0}][{1}/{2}]|Tot: {total:} |ETA: {eta:} '.format( + epoch, iter_id, num_iters, phase=phase, + total=bar.elapsed_td, eta=bar.eta_td) + for l in avg_loss_stats: + avg_loss_stats[l].update( + loss_stats[l].mean().item(), batch['input'].size(0)) + Bar.suffix = Bar.suffix + '|{} {:.4f} '.format(l, avg_loss_stats[l].avg) + if not opt.hide_data_time: + Bar.suffix = Bar.suffix + '|Data {dt.val:.3f}s({dt.avg:.3f}s) ' \ + '|Net {bt.avg:.3f}s'.format(dt=data_time, bt=batch_time) + if opt.print_iter > 0: + if iter_id % opt.print_iter == 0: + print('{}/{}| {}'.format(opt.task, opt.exp_id, Bar.suffix)) + else: + bar.next() + + if opt.test: + self.save_result(output, batch, results) + del output, loss, loss_stats, batch + + bar.finish() + ret = {k: v.avg for k, v in avg_loss_stats.items()} + ret['time'] = bar.elapsed_td.total_seconds() / 60. 
+ return ret, results + + + def debug(self, batch, output, iter_id): + raise NotImplementedError + + def save_result(self, output, batch, results): + raise NotImplementedError + + def _get_losses(self, opt): + raise NotImplementedError + + def val(self, epoch, data_loader): + return self.run_epoch('val', epoch, data_loader) + + def train(self, epoch, data_loader): + return self.run_epoch('train', epoch, data_loader) diff --git a/src/lib/trains/mot.py b/src/lib/trains/mot.py new file mode 100644 index 0000000..920423a --- /dev/null +++ b/src/lib/trains/mot.py @@ -0,0 +1,110 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from models.decode import mot_decode +from models.losses import FocalLoss +from models.losses import RegL1Loss, RegLoss, NormRegL1Loss, RegWeightedL1Loss +from models.utils import _sigmoid, _tranpose_and_gather_feat +from utils.post_process import ctdet_post_process + +from .base_trainer import BaseTrainer + + +class MotLoss(torch.nn.Module): + def __init__(self, opt): + super(MotLoss, self).__init__() + self.crit = torch.nn.MSELoss() if opt.mse_loss else FocalLoss() + self.crit_reg = RegL1Loss() if opt.reg_loss == 'l1' else \ + RegLoss() if opt.reg_loss == 'sl1' else None + self.crit_wh = torch.nn.L1Loss(reduction='sum') if opt.dense_wh else \ + NormRegL1Loss() if opt.norm_wh else \ + RegWeightedL1Loss() if opt.cat_spec_wh else self.crit_reg + self.opt = opt + self.emb_dim = opt.reid_dim + self.nID = opt.nID + self.classifier = nn.Linear(self.emb_dim, self.nID) + self.IDLoss = nn.CrossEntropyLoss(ignore_index=-1) + #self.TriLoss = TripletLoss() + self.emb_scale = math.sqrt(2) * math.log(self.nID - 1) + self.s_det = nn.Parameter(-1.85 * torch.ones(1)) + self.s_id = nn.Parameter(-1.05 * torch.ones(1)) + + def forward(self, outputs, batch): + opt = self.opt + hm_loss, wh_loss, off_loss, id_loss = 0, 0, 0, 0 + for s in range(opt.num_stacks): + output = outputs[s] + if not opt.mse_loss: + output['hm'] = _sigmoid(output['hm']) + + hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks + if opt.wh_weight > 0: + if opt.dense_wh: + mask_weight = batch['dense_wh_mask'].sum() + 1e-4 + wh_loss += ( + self.crit_wh(output['wh'] * batch['dense_wh_mask'], + batch['dense_wh'] * batch['dense_wh_mask']) / + mask_weight) / opt.num_stacks + else: + wh_loss += self.crit_reg( + output['wh'], batch['reg_mask'], + batch['ind'], batch['wh']) / opt.num_stacks + + if opt.reg_offset and opt.off_weight > 0: + off_loss += self.crit_reg(output['reg'], batch['reg_mask'], + batch['ind'], batch['reg']) / opt.num_stacks + + if opt.id_weight > 0: + id_head = _tranpose_and_gather_feat(output['id'], batch['ind']) + id_head = id_head[batch['reg_mask'] > 0].contiguous() + id_head = self.emb_scale * F.normalize(id_head) + id_target = batch['ids'][batch['reg_mask'] > 0] + id_output = self.classifier(id_head).contiguous() + id_loss += self.IDLoss(id_output, id_target) + #id_loss += self.IDLoss(id_output, id_target) + self.TriLoss(id_head, id_target) + + #loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + opt.off_weight * off_loss + opt.id_weight * id_loss + + det_loss = opt.hm_weight * 
hm_loss + opt.wh_weight * wh_loss + opt.off_weight * off_loss + + loss = torch.exp(-self.s_det) * det_loss + torch.exp(-self.s_id) * id_loss + (self.s_det + self.s_id) + loss *= 0.5 + + #print(loss, hm_loss, wh_loss, off_loss, id_loss) + + loss_stats = {'loss': loss, 'hm_loss': hm_loss, + 'wh_loss': wh_loss, 'off_loss': off_loss, 'id_loss': id_loss} + return loss, loss_stats + + +class MotTrainer(BaseTrainer): + def __init__(self, opt, model, optimizer=None): + super(MotTrainer, self).__init__(opt, model, optimizer=optimizer) + + def _get_losses(self, opt): + loss_states = ['loss', 'hm_loss', 'wh_loss', 'off_loss', 'id_loss'] + loss = MotLoss(opt) + return loss_states, loss + + def save_result(self, output, batch, results): + reg = output['reg'] if self.opt.reg_offset else None + dets = mot_decode( + output['hm'], output['wh'], reg=reg, + cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K) + dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2]) + dets_out = ctdet_post_process( + dets.copy(), batch['meta']['c'].cpu().numpy(), + batch['meta']['s'].cpu().numpy(), + output['hm'].shape[2], output['hm'].shape[3], output['hm'].shape[1]) + results[batch['meta']['img_id'].cpu().numpy()[0]] = dets_out[0] \ No newline at end of file diff --git a/src/lib/trains/train_factory.py b/src/lib/trains/train_factory.py new file mode 100644 index 0000000..a1919fa --- /dev/null +++ b/src/lib/trains/train_factory.py @@ -0,0 +1,15 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from .mot import MotTrainer + + +train_factory = { + 'mot': MotTrainer, +} diff --git a/src/lib/utils/image.py b/src/lib/utils/image.py new file mode 100644 index 0000000..9967a85 --- /dev/null +++ b/src/lib/utils/image.py @@ -0,0 +1,230 @@ +# ------------------------------------------------------------------------------ +# Copyright (c) Microsoft +# Licensed under the MIT License. 
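# Illustration only: the final loss in MotLoss.forward() above combines the
# detection and re-ID terms with learnable log-variance weights s_det / s_id
# (an uncertainty-weighting scheme in the spirit of Kendall et al.). A detached
# numeric sketch with made-up per-batch loss values:
import torch

s_det = torch.tensor(-1.85)    # initial values from MotLoss.__init__
s_id = torch.tensor(-1.05)
det_loss = torch.tensor(2.0)   # hypothetical detection loss for one batch
id_loss = torch.tensor(4.0)    # hypothetical re-ID loss for one batch

loss = torch.exp(-s_det) * det_loss + torch.exp(-s_id) * id_loss + (s_det + s_id)
loss = loss * 0.5
# exp(1.85) ~ 6.36 and exp(1.05) ~ 2.86, so the detection branch starts out with
# the larger weight; both weights adapt during training because s_det and s_id
# are nn.Parameters inside the loss module.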
+# Written by Bin Xiao (Bin.Xiao@microsoft.com) +# Modified by Xingyi Zhou +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 +import random + +def flip(img): + return img[:, :, ::-1].copy() + +def transform_preds(coords, center, scale, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale, 0, output_size, inv=1) + for p in range(coords.shape[0]): + target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) + return target_coords + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=np.array([0, 0], dtype=np.float32), + inv=0): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + scale = np.array([scale, scale], dtype=np.float32) + + scale_tmp = scale + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5], np.float32) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + + +def crop(img, center, scale, output_size, rot=0): + trans = get_affine_transform(center, scale, rot, output_size) + + dst_img = cv2.warpAffine(img, + trans, + (int(output_size[0]), int(output_size[1])), + flags=cv2.INTER_LINEAR) + + return dst_img + + +def gaussian_radius(det_size, min_overlap=0.7): + height, width = det_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = np.sqrt(b1 ** 2 - 4 * a1 * c1) + r1 = (b1 + sq1) / 2 + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = np.sqrt(b2 ** 2 - 4 * a2 * c2) + r2 = (b2 + sq2) / 2 + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = np.sqrt(b3 ** 2 - 4 * a3 * c3) + r3 = (b3 + sq3) / 2 + return min(r1, r2, r3) + + +def gaussian2D(shape, sigma=1): + m, n = [(ss - 1.) / 2. 
for ss in shape] + y, x = np.ogrid[-m:m+1,-n:n+1] + + h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) + h[h < np.finfo(h.dtype).eps * h.max()] = 0 + return h + +def draw_umich_gaussian(heatmap, center, radius, k=1): + diameter = 2 * radius + 1 + gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6) + + x, y = int(center[0]), int(center[1]) + + height, width = heatmap.shape[0:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right] + if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: # TODO debug + np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) + return heatmap + +def draw_dense_reg(regmap, heatmap, center, value, radius, is_offset=False): + diameter = 2 * radius + 1 + gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6) + value = np.array(value, dtype=np.float32).reshape(-1, 1, 1) + dim = value.shape[0] + reg = np.ones((dim, diameter*2+1, diameter*2+1), dtype=np.float32) * value + if is_offset and dim == 2: + delta = np.arange(diameter*2+1) - radius + reg[0] = reg[0] - delta.reshape(1, -1) + reg[1] = reg[1] - delta.reshape(-1, 1) + + x, y = int(center[0]), int(center[1]) + + height, width = heatmap.shape[0:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_regmap = regmap[:, y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian[radius - top:radius + bottom, + radius - left:radius + right] + masked_reg = reg[:, radius - top:radius + bottom, + radius - left:radius + right] + if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: # TODO debug + idx = (masked_gaussian >= masked_heatmap).reshape( + 1, masked_gaussian.shape[0], masked_gaussian.shape[1]) + masked_regmap = (1-idx) * masked_regmap + idx * masked_reg + regmap[:, y - top:y + bottom, x - left:x + right] = masked_regmap + return regmap + + +def draw_msra_gaussian(heatmap, center, sigma): + tmp_size = sigma * 3 + mu_x = int(center[0] + 0.5) + mu_y = int(center[1] + 0.5) + w, h = heatmap.shape[0], heatmap.shape[1] + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= h or ul[1] >= w or br[0] < 0 or br[1] < 0: + return heatmap + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, np.newaxis] + x0 = y0 = size // 2 + g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2)) + g_x = max(0, -ul[0]), min(br[0], h) - ul[0] + g_y = max(0, -ul[1]), min(br[1], w) - ul[1] + img_x = max(0, ul[0]), min(br[0], h) + img_y = max(0, ul[1]), min(br[1], w) + heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]] = np.maximum( + heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]], + g[g_y[0]:g_y[1], g_x[0]:g_x[1]]) + return heatmap + +def grayscale(image): + return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + +def lighting_(data_rng, image, alphastd, eigval, eigvec): + alpha = data_rng.normal(scale=alphastd, size=(3, )) + image += np.dot(eigvec, eigval * alpha) + +def blend_(alpha, image1, image2): + image1 *= alpha + image2 *= (1 - alpha) + image1 += image2 + +def saturation_(data_rng, image, gs, gs_mean, var): + alpha = 1. 
+ data_rng.uniform(low=-var, high=var) + blend_(alpha, image, gs[:, :, None]) + +def brightness_(data_rng, image, gs, gs_mean, var): + alpha = 1. + data_rng.uniform(low=-var, high=var) + image *= alpha + +def contrast_(data_rng, image, gs, gs_mean, var): + alpha = 1. + data_rng.uniform(low=-var, high=var) + blend_(alpha, image, gs_mean) + +def color_aug(data_rng, image, eig_val, eig_vec): + functions = [brightness_, contrast_, saturation_] + random.shuffle(functions) + + gs = grayscale(image) + gs_mean = gs.mean() + for f in functions: + f(data_rng, image, gs, gs_mean, 0.4) + lighting_(data_rng, image, 0.1, eig_val, eig_vec) diff --git a/src/lib/utils/post_process.py b/src/lib/utils/post_process.py new file mode 100644 index 0000000..eb39796 --- /dev/null +++ b/src/lib/utils/post_process.py @@ -0,0 +1,27 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from .image import transform_preds + + +def ctdet_post_process(dets, c, s, h, w, num_classes): + # dets: batch x max_dets x dim + # return 1-based class det dict + ret = [] + for i in range(dets.shape[0]): + top_preds = {} + dets[i, :, :2] = transform_preds( + dets[i, :, 0:2], c[i], s[i], (w, h)) + dets[i, :, 2:4] = transform_preds( + dets[i, :, 2:4], c[i], s[i], (w, h)) + classes = dets[i, :, -1] + for j in range(num_classes): + inds = (classes == j) + top_preds[j + 1] = np.concatenate([ + dets[i, inds, :4].astype(np.float32), + dets[i, inds, 4:5].astype(np.float32)], axis=1).tolist() + ret.append(top_preds) + return ret + diff --git a/src/lib/utils/utils.py b/src/lib/utils/utils.py new file mode 100644 index 0000000..34f7cd5 --- /dev/null +++ b/src/lib/utils/utils.py @@ -0,0 +1,179 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import numpy as np + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + if self.count > 0: + self.avg = self.sum / self.count + + +def xyxy2xywh(x): + # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] + x[:, 2]) / 2 + y[:, 1] = (x[:, 1] + x[:, 3]) / 2 + y[:, 2] = x[:, 2] - x[:, 0] + y[:, 3] = x[:, 3] - x[:, 1] + return y + + +def xywh2xyxy(x): + # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] - x[:, 2] / 2) + y[:, 1] = (x[:, 1] - x[:, 3] / 2) + y[:, 2] = (x[:, 0] + x[:, 2] / 2) + y[:, 3] = (x[:, 1] + x[:, 3] / 2) + return y + +def ap_per_class(tp, conf, pred_cls, target_cls): + """ Compute the average precision, given the recall and precision curves. + Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (list). + conf: Objectness value from 0-1 (list). + pred_cls: Predicted object classes (list). + target_cls: True object classes (list). + # Returns + The average precision as computed in py-faster-rcnn. 
+ """ + + # lists/pytorch to numpy + tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array(pred_cls), np.array(target_cls) + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0)) + + # Create Precision-Recall curve and compute AP for each class + ap, p, r = [], [], [] + for c in unique_classes: + i = pred_cls == c + n_gt = sum(target_cls == c) # Number of ground truth objects + n_p = sum(i) # Number of predicted objects + + if (n_p == 0) and (n_gt == 0): + continue + elif (n_p == 0) or (n_gt == 0): + ap.append(0) + r.append(0) + p.append(0) + else: + # Accumulate FPs and TPs + fpc = np.cumsum(1 - tp[i]) + tpc = np.cumsum(tp[i]) + + # Recall + recall_curve = tpc / (n_gt + 1e-16) + r.append(tpc[-1] / (n_gt + 1e-16)) + + # Precision + precision_curve = tpc / (tpc + fpc) + p.append(tpc[-1] / (tpc[-1] + fpc[-1])) + + # AP from recall-precision curve + ap.append(compute_ap(recall_curve, precision_curve)) + + return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(p) + + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves. + Code originally from https://github.com/rbgirshick/py-faster-rcnn. + # Arguments + recall: The recall curve (list). + precision: The precision curve (list). + # Returns + The average precision as computed in py-faster-rcnn. + """ + # correct AP calculation + # first append sentinel values at the end + + mrec = np.concatenate(([0.], recall, [1.])) + mpre = np.concatenate(([0.], precision, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def bbox_iou(box1, box2, x1y1x2y2=False): + """ + Returns the IoU of two bounding boxes + """ + N, M = len(box1), len(box2) + if x1y1x2y2: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + else: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + + # get the coordinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1.unsqueeze(1), b2_x1) + inter_rect_y1 = torch.max(b1_y1.unsqueeze(1), b2_y1) + inter_rect_x2 = torch.min(b1_x2.unsqueeze(1), b2_x2) + inter_rect_y2 = torch.min(b1_y2.unsqueeze(1), b2_y2) + # Intersection area + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0) + # Union Area + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)) + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).view(-1,1).expand(N,M) + b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).view(1,-1).expand(N,M) + + return inter_area / (b1_area + b2_area - inter_area + 1e-16) + + +def generate_anchors(nGh, nGw, anchor_wh): + nA = len(anchor_wh) + yy, xx = np.meshgrid(np.arange(nGh), np.arange(nGw), indexing='ij') 
+ + mesh = np.stack([xx, yy], axis=0) # Shape 2, nGh, nGw + mesh = np.tile(np.expand_dims(mesh, axis=0), (nA, 1, 1, 1)) # Shape nA x 2 x nGh x nGw + anchor_offset_mesh = np.tile(np.expand_dims(np.expand_dims(anchor_wh, -1), -1), (1, 1, nGh, nGw)) # Shape nA x 2 x nGh x nGw + anchor_mesh = np.concatenate((mesh, anchor_offset_mesh), axis=1) # Shape nA x 4 x nGh x nGw + return anchor_mesh + + +def encode_delta(gt_box_list, fg_anchor_list): + px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ + fg_anchor_list[:, 2], fg_anchor_list[:,3] + gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \ + gt_box_list[:, 2], gt_box_list[:, 3] + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = np.log(gw/pw) + dh = np.log(gh/ph) + return np.stack((dx, dy, dw, dh), axis=1) diff --git a/src/test_det.py b/src/test_det.py index de49c52..7419758 100644 --- a/src/test_det.py +++ b/src/test_det.py @@ -1,6 +1,6 @@ # ------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation -# Licensed under MIT License +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. # ------------------------------------------------------------------------------ from __future__ import absolute_import diff --git a/src/test_emb.py b/src/test_emb.py index 67f2a6f..1195326 100644 --- a/src/test_emb.py +++ b/src/test_emb.py @@ -1,9 +1,8 @@ # ------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation -# Licensed under MIT License +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. # ------------------------------------------------------------------------------ - from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/src/track.py b/src/track.py index 488f8d6..034b22a 100644 --- a/src/track.py +++ b/src/track.py @@ -1,6 +1,6 @@ # ------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation -# Licensed under MIT License +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. # ------------------------------------------------------------------------------ from __future__ import absolute_import diff --git a/src/train.py b/src/train.py index ef3dcc8..30c4dce 100644 --- a/src/train.py +++ b/src/train.py @@ -1,7 +1,8 @@ # ------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation -# Licensed under MIT License +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. # ------------------------------------------------------------------------------ + from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/videos/MOT16-03.mp4 b/videos/MOT16-03.mp4 new file mode 100644 index 0000000..f01a157 Binary files /dev/null and b/videos/MOT16-03.mp4 differ
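The Gaussian drawing helpers in src/lib/utils/image.py splat a clipped 2-D Gaussian around each object center and keep the element-wise maximum where neighboring objects overlap. A minimal usage sketch, not part of the patch (the sys.path line, the map size, and the fixed radius are assumptions; in the training code the radius is normally derived from the ground-truth box size):

    import sys
    import numpy as np

    sys.path.insert(0, 'src/lib')                        # assumption: run from the repository root
    from utils.image import draw_umich_gaussian

    heatmap = np.zeros((152, 272), dtype=np.float32)     # one class channel at output resolution (H x W)
    center = np.array([136.0, 76.0], dtype=np.float32)   # object center (x, y) in output coordinates
    radius = 4                                           # assumed fixed here for illustration

    draw_umich_gaussian(heatmap, center, radius)
    print(heatmap[76, 136])                              # 1.0 at the peak, decaying with distance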
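color_aug applies brightness, contrast, and saturation jitter in a random order and then adds PCA lighting noise, all in place on a float image. A rough sketch of a call with made-up PCA statistics (the real dataset code supplies eigenvalues and eigenvectors computed from the training images):

    import sys
    import numpy as np

    sys.path.insert(0, 'src/lib')                               # assumption: run from the repository root
    from utils.image import color_aug

    data_rng = np.random.RandomState(123)
    eig_val = np.array([0.2, 0.02, 0.003], dtype=np.float32)    # placeholder eigenvalues
    eig_vec = np.eye(3, dtype=np.float32)                       # placeholder eigenvectors

    img = np.random.uniform(size=(480, 640, 3)).astype(np.float32)  # BGR image scaled to [0, 1]
    color_aug(data_rng, img, eig_val, eig_vec)                  # modifies img in place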
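ap_per_class sorts detections by confidence, accumulates true and false positives per class, and integrates the interpolated precision-recall curve through compute_ap. A toy call with invented numbers, just to show the input and output layout:

    import sys
    import numpy as np

    sys.path.insert(0, 'src/lib')        # assumption: run from the repository root
    from utils.utils import ap_per_class

    # Four detections over two classes; tp marks whether each one matched a ground truth.
    tp         = [1, 0, 1, 1]
    conf       = [0.9, 0.8, 0.7, 0.6]
    pred_cls   = [0, 0, 0, 1]
    target_cls = [0, 0, 0, 1]            # three ground-truth objects of class 0, one of class 1

    ap, classes, recall, precision = ap_per_class(tp, conf, pred_cls, target_cls)
    print(classes)                       # [0 1]
    print(ap)                            # roughly [0.56 1.0]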
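bbox_iou returns a full N x M IoU matrix between two box sets; with the default x1y1x2y2=False both inputs are read as (cx, cy, w, h) and converted to corner form internally. A quick sanity check with two invented boxes:

    import sys
    import torch

    sys.path.insert(0, 'src/lib')                    # assumption: run from the repository root
    from utils.utils import bbox_iou

    boxes1 = torch.tensor([[5.0, 5.0, 4.0, 4.0]])    # (cx, cy, w, h)
    boxes2 = torch.tensor([[5.0, 5.0, 4.0, 4.0],
                           [7.0, 5.0, 4.0, 4.0]])
    print(bbox_iou(boxes1, boxes2))                  # roughly [[1.0000, 0.3333]]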
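generate_anchors tiles every anchor shape over the feature-map grid as an nA x 4 x nGh x nGw array of (x, y, w, h), and encode_delta converts matched ground-truth boxes into the usual (dx, dy, log dw, log dh) regression targets relative to their anchors. A small shape check with invented anchor sizes:

    import sys
    import numpy as np

    sys.path.insert(0, 'src/lib')    # assumption: run from the repository root
    from utils.utils import generate_anchors, encode_delta

    anchor_wh = np.array([[8.0, 24.0], [16.0, 48.0]], dtype=np.float32)  # two invented anchor shapes
    anchors = generate_anchors(4, 6, anchor_wh)                          # 4 x 6 feature grid
    print(anchors.shape)             # (2, 4, 4, 6): nA x (x, y, w, h) x nGh x nGw

    fg_anchor = np.array([[3.0, 2.0, 8.0, 24.0]], dtype=np.float32)      # matched anchor (cx, cy, w, h)
    gt_box    = np.array([[3.5, 2.5, 10.0, 30.0]], dtype=np.float32)     # its ground-truth box
    print(encode_delta(gt_box, fg_anchor))
    # [[(3.5 - 3)/8, (2.5 - 2)/24, log(10/8), log(30/24)]]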