Merge pull request #26 from microsoft/torch17

Torch17
Xiaotian Han 2021-06-28 17:55:14 -07:00 committed by GitHub
Parent 2df1912d1e 462a68f3e1
Commit a93180e85c
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
79 changed files: 2829 additions and 391 deletions

View file

@ -1,10 +1,11 @@
## Installation
### Requirements:
- PyTorch 1.4
- PyTorch 1.7
- torchvision
- cocoapi
- yacs
- yacs>=0.1.8
- numpy>=1.19.5
- matplotlib
- GCC >= 4.9
- OpenCV
@ -25,9 +26,10 @@ conda activate sg_benchmark
conda install ipython h5py nltk joblib jupyter pandas scipy
# maskrcnn_benchmark and coco api dependencies
pip install ninja yacs==0.1.8 cython matplotlib tqdm opencv-python numpy=1.19.5
pip install ninja yacs>=0.1.8 cython matplotlib tqdm opencv-python numpy>=1.19.5
conda install pytorch==1.4.0 torchvision==0.5.0 cudatoolkit=10.1 -c pytorch
conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.1 -c pytorch
conda install -c conda-forge timm einops
# install pycocotools
conda install -c conda-forge pycocotools
@ -35,9 +37,6 @@ conda install -c conda-forge pycocotools
# install cityscapesScripts
python -m pip install cityscapesscripts
# install apex
conda install -c conda-forge nvidia-apex
# install Scene Graph Detection
git clone https://github.com/microsoft/scene_graph_benchmark
cd scene_graph_benchmark
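Once the dependencies above are installed, a quick sanity check can confirm that the upgraded PyTorch 1.7 / CUDA 10.1 stack is the one actually in use. This is a minimal illustrative sketch, not part of the repository's scripts:

```python
import torch
import torchvision

# Illustrative environment check for the PyTorch 1.7 upgrade (not repository code).
print("torch:", torch.__version__)                 # expect 1.7.x
print("torchvision:", torchvision.__version__)     # expect 0.8.x
print("CUDA available:", torch.cuda.is_available())
print("CUDA toolkit (build):", torch.version.cuda) # expect "10.1" per the conda command above
```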

View file

@ -1,15 +1,12 @@
# Scene Graph Benchmark in PyTorch 1.4
# Scene Graph Benchmark in PyTorch 1.7
**This project is based on [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark)**
This project aims at providing the necessary building blocks for easily
creating detection and segmentation models using PyTorch 1.0.
![alt text](demo/R152FPN_demo.png "from https://storage.googleapis.com/openimages/web/index.html")
## Highlights
- **Upgrad to pytorch 1.4 (can also upgrade to 1.7)**
- **Upgrade to PyTorch 1.7**
- **Multi-GPU training and inference**
- **Batched inference:** can perform inference using multiple images per batch per GPU.
- **Fast and flexible tsv dataset format**
@ -35,15 +32,16 @@ Here is how we would do it. Run the following commands:
# visualize VinVL object detection
# pretrained models at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/vinvl_vg_x152c4.pth
# the associated labelmap at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/VG-SGG-dicts-vgoi6-clipped.json
python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file ../maskrcnn-benchmark-1/datasets1/imgs/woman_fish.jpg --save_file output/woman_fish_x152c4.obj.jpg MODEL.WEIGHT models/vinvl/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 DATA_DIR "../maskrcnn-benchmark-1/datasets1" TEST.IGNORE_BOX_REGRESSION False
python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file demo/woman_fish.jpg --save_file output/woman_fish_x152c4.obj.jpg MODEL.WEIGHT pretrained_model/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 TEST.IGNORE_BOX_REGRESSION False
# visualize VinVL object-attribute detection
# pretrained models at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/vinvl_vg_x152c4.pth
# the associated labelmap at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/VG-SGG-dicts-vgoi6-clipped.json
python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file ../maskrcnn-benchmark-1/datasets1/imgs/woman_fish.jpg --save_file output/woman_fish_x152c4.attr.jpg --visualize_attr MODEL.WEIGHT models/vinvl/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 DATA_DIR "../maskrcnn-benchmark-1/datasets1" TEST.IGNORE_BOX_REGRESSION False
python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file demo/woman_fish.jpg --save_file output/woman_fish_x152c4.attr.jpg --visualize_attr MODEL.WEIGHT pretrained_model/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 TEST.IGNORE_BOX_REGRESSION False
# visualize OpenImage scene graph generation by RelDN
python tools/demo/demo_image.py --config_file sgg_configs/vrd/R152FPN_vrd_reldn.yaml --img_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.jpg --save_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa_output.jpg --visualize_relation MODEL.ROI_RELATION_HEAD.DETECTOR_PRE_CALCULATED False
# pretrained models at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/sgg_model_zoo/sgg_oi_vrd_model_zoo/RX152FPN_reldn_oi_best.pth
python tools/demo/demo_image.py --config_file sgg_configs/vrd/R152FPN_vrd_reldn.yaml --img_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.jpg --save_file output/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.reldn_relation.jpg --visualize_relation MODEL.ROI_RELATION_HEAD.DETECTOR_PRE_CALCULATED False
# visualize Visual Genome scene graph generation by neural motif
python tools/demo/demo_image.py --config_file sgg_configs/vg_vrd/rel_danfeiX_FPN50_nm.yaml --img_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.jpg --save_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa_vgnm.jpg --visualize_relation MODEL.ROI_RELATION_HEAD.DETECTOR_PRE_CALCULATED False DATASETS.LABELMAP_FILE "visualgenome/VG-SGG-dicts-danfeiX-clipped.json" DATA_DIR /home/penzhan/GitHub/maskrcnn-benchmark-1/datasets1 MODEL.ROI_RELATION_HEAD.USE_BIAS True MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP True MODEL.ROI_HEADS.DETECTIONS_PER_IMG 64 MODEL.ROI_RELATION_HEAD.SHARE_BOX_FEATURE_EXTRACTOR False MODEL.ROI_RELATION_HEAD.NEURAL_MOTIF.OBJ_LSTM_NUM_LAYERS 0 MODEL.ROI_RELATION_HEAD.NEURAL_MOTIF.EDGE_LSTM_NUM_LAYERS 2 TEST.IMS_PER_BATCH 2

View file

@ -273,7 +273,7 @@ class COCODemo(object):
the BoxList via `prediction.fields()`
"""
scores = predictions.get_field("scores")
keep = torch.nonzero(scores > self.confidence_threshold).squeeze(1)
keep = torch.nonzero(scores > self.confidence_threshold, as_tuple=False).squeeze(1)
predictions = predictions[keep]
scores = predictions.get_field("scores")
_, idx = scores.sort(0, descending=True)
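The only functional change in this hunk is the explicit `as_tuple=False` argument: PyTorch 1.5+ warns when `torch.nonzero` is called without `as_tuple`, and passing `False` keeps the familiar `(N, ndim)` index tensor, so the kept indices are unchanged. A standalone sketch of the equivalence (illustrative, not repository code):

```python
import torch

# Calling torch.nonzero without as_tuple triggers a deprecation-style warning on
# PyTorch >= 1.5; as_tuple=False preserves the old single-tensor (N, ndim) result.
scores = torch.tensor([0.9, 0.1, 0.8, 0.3])
confidence_threshold = 0.5

keep_old = torch.nonzero(scores > confidence_threshold).squeeze(1)                  # warns
keep_new = torch.nonzero(scores > confidence_threshold, as_tuple=False).squeeze(1)  # silent
assert torch.equal(keep_old, keep_new)  # both are tensor([0, 2])

# as_tuple=True instead returns a tuple of index tensors, one per dimension.
(keep_tuple,) = torch.nonzero(scores > confidence_threshold, as_tuple=True)
assert torch.equal(keep_tuple, keep_new)
```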

Binary data
demo/woman_fish.jpg (new file)

Binary file not shown. Size: 1.7 MiB

View file

@ -1,7 +1,7 @@
ARG CUDA="10.1"
ARG CUDNN="7"
FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu16.04
FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu18.04
RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
@ -36,20 +36,20 @@ RUN pip --no-cache-dir install --force-reinstall -I pyyaml
RUN python -m nltk.downloader punkt
# Install latest PyTorch 1.4
# Install latest PyTorch 1.7.1
ARG CUDA
RUN conda install pytorch~=1.4.0 torchvision cudatoolkit=${CUDA} -c pytorch \
RUN conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.1 -c pytorch \
&& conda clean -ya
RUN conda install -y -c conda-forge timm einops
# install pycocotools
RUN git clone https://github.com/cocodataset/cocoapi.git \
&& cd cocoapi/PythonAPI \
&& python setup.py build_ext install
# RUN git clone https://github.com/cocodataset/cocoapi.git \
# && cd cocoapi/PythonAPI \
# && python setup.py build_ext install
RUN conda install -y -c conda-forge pycocotools
# install apex
RUN git clone https://github.com/NVIDIA/apex.git \
&& cd apex \
&& python setup.py install --cuda_ext --cpp_ext
# install cityscapesScripts
RUN python -m pip install cityscapesscripts
# install PyTorch Detection
ARG FORCE_CUDA="1"
@ -61,7 +61,7 @@ RUN echo """syntax on\nfiletype indent on\nset autoindent\nset number\ncolorsche
CMD [ "zsh" ]
# RUN git clone https://github.com/hanxiaotian/scene_graph_benchmark.git \
# RUN git clone https://github.com/microsoft/scene_graph_benchmark.git \
# && cd scene_graph_benchmark \
# && python setup.py build develop

View file

@ -56,11 +56,6 @@ RUN git clone https://github.com/cocodataset/cocoapi.git \
&& cd cocoapi/PythonAPI \
&& python setup.py build_ext install
# install apex
RUN git clone https://github.com/NVIDIA/apex.git \
&& cd apex \
&& python setup.py install --cuda_ext --cpp_ext
# install PyTorch Detection
ARG FORCE_CUDA="1"
ENV FORCE_CUDA=${FORCE_CUDA}

View file

@ -178,6 +178,8 @@ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
_C.MODEL.RPN.FPN_POST_NMS_PER_BATCH = True
# Custom rpn head, empty to use default conv or separable conv
_C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead"
# use gt target box as proposals for roi_heads (shared in training and testing)
_C.MODEL.RPN.FORCE_BOXES = False
# ---------------------------------------------------------------------------- #
@ -302,6 +304,28 @@ _C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False)
_C.MODEL.RESNETS.WITH_MODULATED_DCN = False
_C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1
# ---------------------------------------------------------------------------- #
# Vision Transformer Options
# ---------------------------------------------------------------------------- #
_C.MODEL.TRANSFORMER = CN()
_C.MODEL.TRANSFORMER.DROP = 0.0
_C.MODEL.TRANSFORMER.DROP_PATH = 0.1
_C.MODEL.TRANSFORMER.NORM_EMBED = True
_C.MODEL.TRANSFORMER.AVG_POOL = False
_C.MODEL.TRANSFORMER.VITHEADARCH = 'l4,h12,d768,n1,s0,g0,p2,f7,a0'
_C.MODEL.TRANSFORMER.MSVIT = CN()
_C.MODEL.TRANSFORMER.MSVIT.ARCH = 'l1,h3,d96,n1,s1,g1,p4,f7,a0_l2,h3,d192,n2,s1,g1,p2,f7,a0_l3,h6,d384,n8,s1,g1,p2,f7,a0_l4,h12,d768,n1,s1,g0,p2,f7,a0'
_C.MODEL.TRANSFORMER.MSVIT.SHARE_W = True
_C.MODEL.TRANSFORMER.MSVIT.ATTN_TYPE = 'longformerhand'
_C.MODEL.TRANSFORMER.MSVIT.SHARE_KV = True
_C.MODEL.TRANSFORMER.MSVIT.ONLY_GLOBAL = False
_C.MODEL.TRANSFORMER.MSVIT.SW_EXACT = 0
_C.MODEL.TRANSFORMER.MSVIT.LN_EPS = 1e-6
_C.MODEL.TRANSFORMER.MSVIT.MODE = 0
_C.MODEL.TRANSFORMER.MSVIT.REDRAW_INTERVAL = 1000
_C.MODEL.TRANSFORMER.OUT_FEATURES = []
# ---------------------------------------------------------------------------- #
# RetinaNet Options (Follow the Detectron version)
@ -430,6 +454,15 @@ _C.SOLVER.TEST_PERIOD = 0
# see 2 images per batch
_C.SOLVER.IMS_PER_BATCH = 16
_C.SOLVER.USE_AMP = False
_C.SOLVER.OPTIMIZER = 'SGD' # also support ADAMW
_C.SOLVER.CLIP_GRADIENTS = CN()
_C.SOLVER.CLIP_GRADIENTS.ENABLED = False
_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0
_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
# ---------------------------------------------------------------------------- #
# Specific test options
# ---------------------------------------------------------------------------- #
@ -487,6 +520,7 @@ _C.TEST.IGNORE_BOX_REGRESSION = False
_C.OUTPUT_DIR = "."
_C.DATA_DIR = "./datasets"
_C.DISTRIBUTED_BACKEND = "nccl" # could be "nccl", "gloo" or "mpi"
_C.LOG_LOSS_PERIOD = 20
_C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
@ -496,6 +530,3 @@ _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
# Precision of input, allowable: (float32, float16)
_C.DTYPE = "float32"
# Enable verbosity in apex.amp
_C.AMP_VERBOSE = False
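The new SOLVER options above (`USE_AMP`, `OPTIMIZER`, and the `CLIP_GRADIENTS` node) line up with PyTorch 1.7's native `torch.cuda.amp` and `torch.nn.utils.clip_grad_norm_`, consistent with the apex-related removals elsewhere in this diff. Below is a minimal, hypothetical sketch of how a training step could consume these config values; the function names, the scaler wiring, and the use of `cfg.SOLVER.BASE_LR` are assumptions for illustration, not this repository's actual trainer code:

```python
import torch
from torch.nn.utils import clip_grad_norm_

def make_optimizer(cfg, model):
    # OPTIMIZER is 'SGD' by default; 'ADAMW' is the documented alternative.
    params = [p for p in model.parameters() if p.requires_grad]
    if cfg.SOLVER.OPTIMIZER == "ADAMW":
        return torch.optim.AdamW(params, lr=cfg.SOLVER.BASE_LR)
    return torch.optim.SGD(params, lr=cfg.SOLVER.BASE_LR, momentum=0.9)

def train_step(cfg, model, optimizer, scaler, images, targets):
    optimizer.zero_grad()
    # Run the forward pass under autocast only when AMP is enabled in the config.
    with torch.cuda.amp.autocast(enabled=cfg.SOLVER.USE_AMP):
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())
    scaler.scale(losses).backward()
    if cfg.SOLVER.CLIP_GRADIENTS.ENABLED:
        # Unscale first so the clip threshold applies to the true gradient norm;
        # CLIP_TYPE == "full_model" is read here as clipping the whole model at once.
        scaler.unscale_(optimizer)
        clip_grad_norm_(model.parameters(),
                        cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE,
                        norm_type=cfg.SOLVER.CLIP_GRADIENTS.NORM_TYPE)
    scaler.step(optimizer)
    scaler.update()
    return losses.detach()

# Usage (illustrative):
#   scaler = torch.cuda.amp.GradScaler(enabled=cfg.SOLVER.USE_AMP)
#   loss = train_step(cfg, model, optimizer, scaler, images, targets)
```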

View file

@ -14,7 +14,7 @@ at::Tensor ROIAlign_forward(const at::Tensor& input,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
#else
@ -34,7 +34,7 @@ at::Tensor ROIAlign_backward(const at::Tensor& grad,
const int height,
const int width,
const int sampling_ratio) {
if (grad.type().is_cuda()) {
if (grad.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
#else

View file

@ -13,7 +13,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input,
const float spatial_scale,
const int pooled_height,
const int pooled_width) {
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
#else
@ -34,7 +34,7 @@ at::Tensor ROIPool_backward(const at::Tensor& grad,
const int channels,
const int height,
const int width) {
if (grad.type().is_cuda()) {
if (grad.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
#else

View file

@ -13,7 +13,7 @@ at::Tensor SigmoidFocalLoss_forward(
const int num_classes,
const float gamma,
const float alpha) {
if (logits.type().is_cuda()) {
if (logits.device().is_cuda()) {
#ifdef WITH_CUDA
return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
#else
@ -30,7 +30,7 @@ at::Tensor SigmoidFocalLoss_backward(
const int num_classes,
const float gamma,
const float alpha) {
if (logits.type().is_cuda()) {
if (logits.device().is_cuda()) {
#ifdef WITH_CUDA
return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
#else

View file

@ -91,7 +91,7 @@ void pre_calc_for_bilinear_interpolate(
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
// save weights and indeces
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
@ -168,8 +168,8 @@ void ROIAlignForward_cpu_kernel(
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
// we want to precalculate indeces and weights shared by all chanels,
// this is the key point of optimiation
std::vector<PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
@ -224,8 +224,8 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor");
AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor");
AT_ASSERTM(!input.device().is_cuda(), "input must be a CPU tensor");
AT_ASSERTM(!rois.device().is_cuda(), "rois must be a CPU tensor");
auto num_rois = rois.size(0);
auto channels = input.size(1);
@ -239,10 +239,10 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
return output;
}
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
ROIAlignForward_cpu_kernel<scalar_t>(
output_size,
input.data<scalar_t>(),
input.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
@ -250,8 +250,8 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
pooled_height,
pooled_width,
sampling_ratio,
rois.data<scalar_t>(),
output.data<scalar_t>());
rois.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
return output;
}

View file

@ -6,8 +6,8 @@ template <typename scalar_t>
at::Tensor nms_cpu_kernel(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold) {
AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
if (dets.numel() == 0) {
@ -26,13 +26,13 @@ at::Tensor nms_cpu_kernel(const at::Tensor& dets,
auto ndets = dets.size(0);
at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data<uint8_t>();
auto order = order_t.data<int64_t>();
auto x1 = x1_t.data<scalar_t>();
auto y1 = y1_t.data<scalar_t>();
auto x2 = x2_t.data<scalar_t>();
auto y2 = y2_t.data<scalar_t>();
auto areas = areas_t.data<scalar_t>();
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<scalar_t>();
auto y1 = y1_t.data_ptr<scalar_t>();
auto x2 = x2_t.data_ptr<scalar_t>();
auto y2 = y2_t.data_ptr<scalar_t>();
auto areas = areas_t.data_ptr<scalar_t>();
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
@ -68,7 +68,7 @@ at::Tensor nms_cpu(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold) {
at::Tensor result;
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
});
return result;

View file

@ -260,8 +260,8 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
auto num_rois = rois.size(0);
auto channels = input.size(1);
@ -272,7 +272,7 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
auto output_size = num_rois * pooled_height * pooled_width * channels;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
dim3 block(512);
if (output.numel() == 0) {
@ -280,10 +280,10 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
return output;
}
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input.contiguous().data<scalar_t>(),
input.contiguous().data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
@ -291,8 +291,8 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
pooled_height,
pooled_width,
sampling_ratio,
rois.contiguous().data<scalar_t>(),
output.data<scalar_t>());
rois.contiguous().data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
THCudaCheck(cudaGetLastError());
return output;
@ -309,15 +309,15 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
const int height,
const int width,
const int sampling_ratio) {
AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
auto num_rois = rois.size(0);
auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
dim3 block(512);
// handle possibly empty gradients
@ -326,10 +326,10 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
return grad_input;
}
AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] {
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad.contiguous().data<scalar_t>(),
grad.contiguous().data_ptr<scalar_t>(),
num_rois,
spatial_scale,
channels,
@ -338,8 +338,8 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data<scalar_t>(),
rois.contiguous().data<scalar_t>());
grad_input.data_ptr<scalar_t>(),
rois.contiguous().data_ptr<scalar_t>());
});
THCudaCheck(cudaGetLastError());
return grad_input;

View file

@ -112,8 +112,8 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
const float spatial_scale,
const int pooled_height,
const int pooled_width) {
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
auto num_rois = rois.size(0);
auto channels = input.size(1);
@ -126,7 +126,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
dim3 block(512);
if (output.numel() == 0) {
@ -134,19 +134,19 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
return std::make_tuple(output, argmax);
}
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] {
RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input.contiguous().data<scalar_t>(),
input.contiguous().data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
rois.contiguous().data<scalar_t>(),
output.data<scalar_t>(),
argmax.data<int>());
rois.contiguous().data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
argmax.data_ptr<int>());
});
THCudaCheck(cudaGetLastError());
return std::make_tuple(output, argmax);
@ -164,8 +164,8 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
const int channels,
const int height,
const int width) {
AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
// TODO add more checks
auto num_rois = rois.size(0);
@ -173,7 +173,7 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
dim3 block(512);
// handle possibly empty gradients
@ -182,11 +182,11 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
return grad_input;
}
AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] {
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] {
RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad.contiguous().data<scalar_t>(),
argmax.data<int>(),
grad.contiguous().data_ptr<scalar_t>(),
argmax.data_ptr<int>(),
num_rois,
spatial_scale,
channels,
@ -194,8 +194,8 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
width,
pooled_height,
pooled_width,
grad_input.data<scalar_t>(),
rois.contiguous().data<scalar_t>());
grad_input.data_ptr<scalar_t>(),
rois.contiguous().data_ptr<scalar_t>());
});
THCudaCheck(cudaGetLastError());
return grad_input;

View file

@ -107,8 +107,8 @@ at::Tensor SigmoidFocalLoss_forward_cuda(
const int num_classes,
const float gamma,
const float alpha) {
AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
const int num_samples = logits.size(0);
@ -117,8 +117,7 @@ at::Tensor SigmoidFocalLoss_forward_cuda(
auto losses_size = num_samples * logits.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)losses_size, 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
dim3 block(512);
if (losses.numel() == 0) {
@ -126,16 +125,16 @@ at::Tensor SigmoidFocalLoss_forward_cuda(
return losses;
}
AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] {
AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
losses_size,
logits.contiguous().data<scalar_t>(),
targets.contiguous().data<int>(),
logits.contiguous().data_ptr<scalar_t>(),
targets.contiguous().data_ptr<int>(),
num_classes,
gamma,
alpha,
num_samples,
losses.data<scalar_t>());
losses.data_ptr<scalar_t>());
});
THCudaCheck(cudaGetLastError());
return losses;
@ -149,9 +148,9 @@ at::Tensor SigmoidFocalLoss_backward_cuda(
const int num_classes,
const float gamma,
const float alpha) {
AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor");
AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor");
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
@ -162,7 +161,7 @@ at::Tensor SigmoidFocalLoss_backward_cuda(
auto d_logits_size = num_samples * logits.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)d_logits_size, 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
dim3 block(512);
if (d_logits.numel() == 0) {
@ -170,17 +169,17 @@ at::Tensor SigmoidFocalLoss_backward_cuda(
return d_logits;
}
AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] {
AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
d_logits_size,
logits.contiguous().data<scalar_t>(),
targets.contiguous().data<int>(),
d_losses.contiguous().data<scalar_t>(),
logits.contiguous().data_ptr<scalar_t>(),
targets.contiguous().data_ptr<int>(),
d_losses.contiguous().data_ptr<scalar_t>(),
num_classes,
gamma,
alpha,
num_samples,
d_logits.data<scalar_t>());
d_logits.data_ptr<scalar_t>());
});
THCudaCheck(cudaGetLastError());

View file

@ -1,10 +1,6 @@
// modify from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
#ifndef AT_CHECK
#define AT_CHECK TORCH_CHECK
#endif
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
@ -73,26 +69,26 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
int padW, int dilationH, int dilationW, int group,
int deformable_group)
{
AT_CHECK(weight.ndimension() == 4,
TORCH_CHECK(weight.ndimension() == 4,
"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
"but got: %s",
weight.ndimension());
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
AT_CHECK(kW > 0 && kH > 0,
TORCH_CHECK(kW > 0 && kH > 0,
"kernel size should be greater than zero, but got kH: %d kW: %d", kH,
kW);
AT_CHECK((weight.size(2) == kH && weight.size(3) == kW),
TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
"kernel size should be consistent with weight, ",
"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
kW, weight.size(2), weight.size(3));
AT_CHECK(dW > 0 && dH > 0,
TORCH_CHECK(dW > 0 && dH > 0,
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
AT_CHECK(
TORCH_CHECK(
dilationW > 0 && dilationH > 0,
"dilation should be greater than 0, but got dilationH: %d dilationW: %d",
dilationH, dilationW);
@ -108,7 +104,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
dimw++;
}
AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
ndim);
long nInputPlane = weight.size(1) * group;
@ -120,7 +116,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
AT_CHECK(nInputPlane % deformable_group == 0,
TORCH_CHECK(nInputPlane % deformable_group == 0,
"input channels must divide deformable group size");
if (outputWidth < 1 || outputHeight < 1)
@ -130,27 +126,27 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
outputWidth);
AT_CHECK(input.size(1) == nInputPlane,
TORCH_CHECK(input.size(1) == nInputPlane,
"invalid number of input planes, expected: %d, but got: %d",
nInputPlane, input.size(1));
AT_CHECK((inputHeight >= kH && inputWidth >= kW),
TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
"input image is smaller than kernel");
AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
"invalid spatial size of offset, expected height: %d width: %d, but "
"got height: %d width: %d",
outputHeight, outputWidth, offset.size(2), offset.size(3));
AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
"invalid number of channels of offset");
if (gradOutput != NULL) {
AT_CHECK(gradOutput->size(dimf) == nOutputPlane,
TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
"invalid number of gradOutput planes, expected: %d, but got: %d",
nOutputPlane, gradOutput->size(dimf));
AT_CHECK((gradOutput->size(dimh) == outputHeight &&
TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
gradOutput->size(dimw) == outputWidth),
"invalid size of gradOutput, expected height: %d width: %d , but "
"got height: %d width: %d",
@ -201,7 +197,7 @@ int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
outputHeight, outputWidth});
@ -308,7 +304,7 @@ int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
@ -424,7 +420,7 @@ int deform_conv_backward_parameters_cuda(
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
@ -505,8 +501,8 @@ void modulated_deform_conv_cuda_forward(
const int dilation_w, const int group, const int deformable_group,
const bool with_bias)
{
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
@ -587,8 +583,8 @@ void modulated_deform_conv_cuda_backward(
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias)
{
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);

View file

@ -264,10 +264,10 @@ void deformable_im2col(
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.type(), "deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
scalar_t *data_col_ = data_col.data<scalar_t>();
data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w,
@ -358,10 +358,10 @@ void deformable_col2im(
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
scalar_t *grad_im_ = grad_im.data<scalar_t>();
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_col_, data_offset_, channels, height, width, ksize_h,
@ -456,11 +456,11 @@ void deformable_col2im_coord(
int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data<scalar_t>();
const scalar_t *data_im_ = data_im.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data<scalar_t>();
data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_col_, data_im_, data_offset_, channels, height, width,
@ -786,11 +786,11 @@ void modulated_deformable_im2col_cuda(
const int num_kernels = channels * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.type(), "modulated_deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
const scalar_t *data_mask_ = data_mask.data<scalar_t>();
scalar_t *data_col_ = data_col.data<scalar_t>();
data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
modulated_deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w,
@ -818,11 +818,11 @@ void modulated_deformable_col2im_cuda(
const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "modulated_deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
const scalar_t *data_mask_ = data_mask.data<scalar_t>();
scalar_t *grad_im_ = grad_im.data<scalar_t>();
data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
modulated_deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im,
@ -851,13 +851,13 @@ void modulated_deformable_col2im_coord_cuda(
const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data<scalar_t>();
const scalar_t *data_im_ = data_im.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
const scalar_t *data_mask_ = data_mask.data<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data<scalar_t>();
scalar_t *grad_mask_ = grad_mask.data<scalar_t>();
data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
modulated_deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im,

View file

@ -5,10 +5,6 @@
// author: Charles Shang
// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
#ifndef AT_CHECK
#define AT_CHECK TORCH_CHECK
#endif
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
@ -43,7 +39,7 @@ void deform_psroi_pooling_cuda_forward(
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std)
{
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
@ -69,8 +65,8 @@ void deform_psroi_pooling_cuda_backward(
const int group_size, const int pooled_size, const int part_size,
const int sample_per_part, const float trans_std)
{
AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);

View file

@ -290,12 +290,12 @@ void DeformablePSROIPoolForward(const at::Tensor data,
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data.type(), "deformable_psroi_pool_forward", ([&] {
const scalar_t *bottom_data = data.data<scalar_t>();
const scalar_t *bottom_rois = bbox.data<scalar_t>();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
scalar_t *top_data = out.data<scalar_t>();
scalar_t *top_count_data = top_count.data<scalar_t>();
data.scalar_type(), "deformable_psroi_pool_forward", ([&] {
const scalar_t *bottom_data = data.data_ptr<scalar_t>();
const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
scalar_t *top_data = out.data_ptr<scalar_t>();
scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width,
@ -341,14 +341,14 @@ void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] {
const scalar_t *top_diff = out_grad.data<scalar_t>();
const scalar_t *bottom_data = data.data<scalar_t>();
const scalar_t *bottom_rois = bbox.data<scalar_t>();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
scalar_t *bottom_data_diff = in_grad.data<scalar_t>();
scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data<scalar_t>();
const scalar_t *top_count_data = top_count.data<scalar_t>();
out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] {
const scalar_t *top_diff = out_grad.data_ptr<scalar_t>();
const scalar_t *bottom_data = data.data_ptr<scalar_t>();
const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
scalar_t *bottom_data_diff = in_grad.data_ptr<scalar_t>();
scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data_ptr<scalar_t>();
const scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,

View file

@ -69,7 +69,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
// boxes is a N x 5 tensor
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
using scalar_t = float;
AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
auto scores = boxes.select(1, 4);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto boxes_sorted = boxes.index_select(0, order_t);
@ -78,7 +78,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
@ -106,7 +106,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data<int64_t>();
int64_t* keep_out = keep.data_ptr<int64_t>();
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {

View file

@ -27,7 +27,7 @@ int deform_conv_forward(
int deformable_group,
int im2col_step)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return deform_conv_forward_cuda(
input, weight, offset, output, columns, ones,
@ -62,7 +62,7 @@ int deform_conv_backward_input(
int deformable_group,
int im2col_step)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return deform_conv_backward_input_cuda(
input, offset, gradOutput, gradInput, gradOffset, weight, columns,
@ -97,7 +97,7 @@ int deform_conv_backward_parameters(
float scale,
int im2col_step)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return deform_conv_backward_parameters_cuda(
input, offset, gradOutput, gradWeight, columns, ones,
@ -133,7 +133,7 @@ void modulated_deform_conv_forward(
const int deformable_group,
const bool with_bias)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return modulated_deform_conv_cuda_forward(
input, weight, bias, ones, offset, mask, output, columns,
@ -175,7 +175,7 @@ void modulated_deform_conv_backward(
int deformable_group,
const bool with_bias)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return modulated_deform_conv_cuda_backward(
input, weight, bias, ones, offset, mask, columns,

View file

@ -23,7 +23,7 @@ void deform_psroi_pooling_forward(
const int sample_per_part,
const float trans_std)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return deform_psroi_pooling_cuda_forward(
input, bbox, trans, out, top_count,
@ -55,7 +55,7 @@ void deform_psroi_pooling_backward(
const int sample_per_part,
const float trans_std)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return deform_psroi_pooling_cuda_backward(
out_grad, input, bbox, trans, top_count, input_grad, trans_grad,

View file

@ -11,7 +11,7 @@ at::Tensor nms(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold) {
if (dets.type().is_cuda()) {
if (dets.device().is_cuda()) {
#ifdef WITH_CUDA
// TODO raise error if not compiled with CUDA
if (dets.numel() == 0)

View file

@ -14,7 +14,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward");
m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward");
m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward");
// dcn-v2
m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input");
m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters");
@ -22,4 +21,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward");
m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward");
m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward");
}
}

View file

@ -6,6 +6,8 @@ from .voc import voc_evaluation
from .cityscapes import abs_cityscapes_evaluation
from .sg import sg_evaluation
from .openimages_vrd import openimages_vrd_evaluation
from .vg import vg_evaluation
def evaluate(dataset, predictions, output_folder, **kwargs):
"""evaluate dataset using different methods based on dataset type.
@ -28,7 +30,10 @@ def evaluate(dataset, predictions, output_folder, **kwargs):
elif isinstance(dataset, datasets.OpenImagesVRDTSVDataset):
return openimages_vrd_evaluation(**args)
elif isinstance(dataset, datasets.VGTSVDataset):
return sg_evaluation(**args)
if 'sg_eval' in args and args['sg_eval']:
return sg_evaluation(**args)
else:
return vg_evaluation(**args)
elif isinstance(dataset, datasets.AbstractDataset):
return abs_cityscapes_evaluation(**args)
else:

View file

@ -120,18 +120,18 @@ def evaluate(gt_classes, gt_boxes, gt_rels,
return (None, None)
rel_sum = ((gt_rels.sum(1) > 0).int() + (gt_rels.sum(0) > 0).int())
ix_w_rel = rel_sum.nonzero().numpy().squeeze()
ix_w_rel = rel_sum.nonzero(as_tuple=False).numpy().squeeze()
# label = (((gt_rel_label.sum(1) == 0).int() + (gt_rel_label.sum(0) == 0).int()) == 2)
# change_ix = label.nonzero()
# change_ix = label.nonzero(as_tuple=False)
gt_boxes = gt_boxes.numpy()
num_gt_boxes = gt_boxes.shape[0]
gt_relations = gt_rels.nonzero().numpy()
gt_relations = gt_rels.nonzero(as_tuple=False).numpy()
gt_classes = gt_classes.view(-1, 1).numpy()
gt_rels_view = gt_rels.contiguous().view(-1)
gt_pred_labels = gt_rels_view[gt_rels_view.nonzero().squeeze()].contiguous().view(-1, 1).numpy()
gt_pred_labels = gt_rels_view[gt_rels_view.nonzero(as_tuple=False).squeeze()].contiguous().view(-1, 1).numpy()
num_gt_relations = gt_relations.shape[0]
if num_gt_relations == 0:

View file

@ -0,0 +1,119 @@
import torch
from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou
# inspired from Detectron
def evaluate_box_proposals(
predictions, dataset, thresholds=None, area="all", limit=None
):
"""Evaluate detection proposal recall metrics. This function is a much
faster alternative to the official COCO API recall evaluation code. However,
it produces slightly different results.
"""
# Record max overlap value for each gt box
# Return vector of overlap values
areas = {
"all": 0,
"small": 1,
"medium": 2,
"large": 3,
"96-128": 4,
"128-256": 5,
"256-512": 6,
"512-inf": 7,
}
area_ranges = [
[0 ** 2, 1e5 ** 2], # all
[0 ** 2, 32 ** 2], # small
[32 ** 2, 96 ** 2], # medium
[96 ** 2, 1e5 ** 2], # large
[96 ** 2, 128 ** 2], # 96-128
[128 ** 2, 256 ** 2], # 128-256
[256 ** 2, 512 ** 2], # 256-512
[512 ** 2, 1e5 ** 2],
] # 512-inf
assert area in areas, "Unknown area range: {}".format(area)
area_range = area_ranges[areas[area]]
gt_overlaps = []
num_pos = 0
for image_id, prediction in sorted(predictions.items()):
img_info = dataset.get_img_info(image_id)
image_width = img_info["width"]
image_height = img_info["height"]
prediction = prediction.resize((image_width, image_height))
# deal with ground truth
gt_boxes = dataset.get_groundtruth(image_id)
# filter out the field "relations"
gt_boxes = gt_boxes.copy_with_fields(['attributes', 'labels'])
gt_areas = gt_boxes.area()
if len(gt_boxes) == 0:
continue
valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
gt_boxes = gt_boxes[valid_gt_inds]
num_pos += len(gt_boxes)
if len(gt_boxes) == 0:
continue
# sort predictions in descending order
# TODO maybe remove this and make it explicit in the documentation
_gt_overlaps = torch.zeros(len(gt_boxes))
if len(prediction) == 0:
gt_overlaps.append(_gt_overlaps)
continue
if "objectness" in prediction.extra_fields:
inds = prediction.get_field("objectness").sort(descending=True)[1]
elif "scores" in prediction.extra_fields:
inds = prediction.get_field("scores").sort(descending=True)[1]
else:
raise ValueError("Neither objectness nor scores is in the extra_fields!")
prediction = prediction[inds]
if limit is not None and len(prediction) > limit:
prediction = prediction[:limit]
overlaps = boxlist_iou(prediction, gt_boxes)
for j in range(min(len(prediction), len(gt_boxes))):
# find which proposal box maximally covers each gt box
# and get the iou amount of coverage for each gt box
max_overlaps, argmax_overlaps = overlaps.max(dim=0)
# find which gt box is 'best' covered (i.e. 'best' = most iou)
gt_ovr, gt_ind = max_overlaps.max(dim=0)
assert gt_ovr >= 0
# find the proposal box that covers the best covered gt box
box_ind = argmax_overlaps[gt_ind]
# record the iou coverage of this gt box
_gt_overlaps[j] = overlaps[box_ind, gt_ind]
assert _gt_overlaps[j] == gt_ovr
# mark the proposal box and the gt box as used
overlaps[box_ind, :] = -1
overlaps[:, gt_ind] = -1
# append recorded iou coverage level
gt_overlaps.append(_gt_overlaps)
gt_overlaps = torch.cat(gt_overlaps, dim=0)
gt_overlaps, _ = torch.sort(gt_overlaps)
if thresholds is None:
step = 0.05
thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
recalls = torch.zeros_like(thresholds)
# compute recall for each iou threshold
for i, t in enumerate(thresholds):
recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
# ar = 2 * np.trapz(recalls, thresholds)
ar = recalls.mean()
return {
"ar": ar,
"recalls": recalls,
"thresholds": thresholds,
"gt_overlaps": gt_overlaps,
"num_pos": num_pos,
}

View file

@ -0,0 +1,16 @@
import logging
from .vg_eval import do_vg_evaluation
def vg_evaluation(dataset, predictions, output_folder, box_only, eval_attributes, **_):
logger = logging.getLogger("maskrcnn_benchmark.inference")
logger.info("performing vg evaluation, ignored iou_types.")
return do_vg_evaluation(
dataset=dataset,
predictions=predictions,
output_folder=output_folder,
box_only=box_only,
eval_attributes=eval_attributes,
logger=logger,
)

View file

@ -0,0 +1,391 @@
# A modification version from chainercv repository.
# (See https://github.com/chainer/chainercv/blob/master/chainercv/evaluations/eval_detection_voc.py)
from __future__ import division
import os
import numpy as np
import torch
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.data.datasets.evaluation.utils import evaluate_box_proposals
def do_vg_evaluation(dataset, predictions, output_folder, box_only, eval_attributes, logger, save_predictions=True):
# TODO need to make the use_07_metric format available
# for the user to choose
# we use int for box_only. 0: False, 1: box for RPN, 2: box for object detection,
if box_only:
if box_only == 1:
limits = [100, 1000]
elif box_only == 2:
limits = [36, 99]
else:
raise ValueError("box_only can be either 0/1/2, but get {0}".format(box_only))
areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
result = {}
for area, suffix in areas.items():
for limit in limits:
logger.info("Evaluating bbox proposals@{:d}".format(limit))
stats = evaluate_box_proposals(
predictions, dataset, area=area, limit=limit
)
key_ar = "AR{}@{:d}".format(suffix, limit)
key_num_pos = "num_pos{}@{:d}".format(suffix, limit)
result[key_num_pos] = stats["num_pos"]
result[key_ar] = stats["ar"].item()
key_recalls = "Recalls{}@{:d}".format(suffix, limit)
# result[key_recalls] = stats["recalls"]
print(key_recalls, stats["recalls"])
print(key_ar, "ar={:.4f}".format(result[key_ar]))
print(key_num_pos, "num_pos={:d}".format(result[key_num_pos]))
logger.info(result)
logger.info(result)
# check_expected_results(result, expected_results, expected_results_sigma_tol)
if output_folder and save_predictions:
if box_only == 1:
torch.save(result, os.path.join(output_folder, "rpn_proposals.pth"))
elif box_only == 2:
torch.save(result, os.path.join(output_folder, "box_proposals.pth"))
else:
raise ValueError("box_only can be either 0/1/2, but get {0}".format(box_only))
return {"box_proposal": result}
pred_boxlists = []
gt_boxlists = []
for image_id, prediction in sorted(predictions.items()):
img_info = dataset.get_img_info(image_id)
if len(prediction) == 0:
continue
image_width = img_info["width"]
image_height = img_info["height"]
prediction = prediction.resize((image_width, image_height))
pred_boxlists.append(prediction)
gt_boxlist = dataset.get_groundtruth(image_id)
gt_boxlists.append(gt_boxlist)
if eval_attributes:
classes = dataset.attributes
else:
classes = dataset.classes
result = eval_detection_voc(
pred_boxlists=pred_boxlists,
gt_boxlists=gt_boxlists,
classes=classes,
iou_thresh=0.5,
eval_attributes=eval_attributes,
use_07_metric=False,
)
result_str = "mAP: {:.4f}\n".format(result["map"])
for i, ap in enumerate(result["ap"]):
# if i == 0: # skip background
# continue
# we skipped background in result['ap'], so we need to use i+1
if eval_attributes:
result_str += "{:<16}: {:.4f}\n".format(
dataset.map_attribute_id_to_attribute_name(i+1), ap
)
else:
result_str += "{:<16}: {:.4f}\n".format(
dataset.map_class_id_to_class_name(i+1), ap
)
logger.info(result_str)
# return mAP and weighted mAP
if eval_attributes:
if output_folder and save_predictions:
with open(os.path.join(output_folder, "result_attr.txt"), "w") as fid:
fid.write(result_str)
return {"attr": {"map": result["map"], "weighted map": result["weighted map"]}}
else:
if output_folder and save_predictions:
with open(os.path.join(output_folder, "result_obj.txt"), "w") as fid:
fid.write(result_str)
return {"obj": {"map": result["map"], "weighted map": result["weighted map"]}}
def eval_detection_voc(pred_boxlists, gt_boxlists, classes, iou_thresh=0.5, eval_attributes=False, use_07_metric=False):
"""Evaluate on voc dataset.
Args:
pred_boxlists(list[BoxList]): pred boxlist, has labels and scores fields.
gt_boxlists(list[BoxList]): ground truth boxlist, has labels field.
iou_thresh: iou thresh
use_07_metric: boolean
Returns:
dict represents the results
"""
assert len(gt_boxlists) == len(
pred_boxlists
), "Length of gt and pred lists need to be same."
aps = []
nposs = []
thresh = []
for i, classname in enumerate(classes):
if classname == "__background__" or classname == "__no_attribute__":
continue
rec, prec, ap, scores, npos = calc_detection_voc_prec_rec(pred_boxlists=pred_boxlists, gt_boxlists=gt_boxlists, \
classindex=i, iou_thresh=iou_thresh,
eval_attributes=eval_attributes,
use_07_metric=use_07_metric)
# Determine per class detection thresholds that maximise f score
# if npos > 1:
if npos > 1 and type(scores) != np.int:
f = np.nan_to_num((prec * rec) / (prec + rec))
thresh += [scores[np.argmax(f)]]
else:
thresh += [0]
aps += [ap]
nposs += [float(npos)]
print('AP for {} = {:.4f} (npos={:,})'.format(classname, ap, npos))
# if pickle:
# with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f:
# cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap,
# 'scores': scores, 'npos':npos}, f)
# Set thresh to mean for classes with poor results
thresh = np.array(thresh)
avg_thresh = np.mean(thresh[thresh != 0])
thresh[thresh == 0] = avg_thresh
# if eval_attributes:
# filename = 'attribute_thresholds_' + self._image_set + '.txt'
# else:
# filename = 'object_thresholds_' + self._image_set + '.txt'
# path = os.path.join(output_dir, filename)
# with open(path, 'wt') as f:
# for i, cls in enumerate(classes[1:]):
# f.write('{:s} {:.3f}\n'.format(cls, thresh[i]))
weights = np.array(nposs)
weights /= weights.sum()
print('Mean AP = {:.4f}'.format(np.mean(aps)))
print('Weighted Mean AP = {:.4f}'.format(np.average(aps, weights=weights)))
print('Mean Detection Threshold = {:.3f}'.format(avg_thresh))
print('~~~~~~~~')
print('Results:')
for ap, npos in zip(aps, nposs):
print('{:.3f}\t{:.3f}'.format(ap, npos))
print('{:.3f}'.format(np.mean(aps)))
print('~~~~~~~~')
print('')
print('--------------------------------------------------------------')
print('Results computed with the **unofficial** PASCAL VOC Python eval code.')
print('--------------------------------------------------------------')
# pdb.set_trace()
return {"ap": aps, "map": np.mean(aps), "weighted map": np.average(aps, weights=weights)}
def calc_detection_voc_prec_rec(pred_boxlists, gt_boxlists, classindex, iou_thresh=0.5, eval_attributes=False,
use_07_metric=False):
"""Calculate precision and recall based on evaluation code of PASCAL VOC.
This function calculates precision and recall of
predicted bounding boxes obtained from a dataset which has :math:`N`
images.
The code is based on the evaluation code used in PASCAL VOC Challenge.
"""
class_recs = {}
npos = 0
image_ids = []
confidence = []
BB = []
for image_index, (gt_boxlist, pred_boxlist) in enumerate(zip(gt_boxlists, pred_boxlists)):
pred_bbox = pred_boxlist.bbox.numpy()
gt_bbox = gt_boxlist.bbox.numpy()
if eval_attributes:
gt_label = gt_boxlist.get_field("attributes").numpy()
pred_label = pred_boxlist.get_field("attr_labels").numpy()
pred_score = pred_boxlist.get_field("attr_scores").numpy()
else:
gt_label = gt_boxlist.get_field("labels").numpy()
pred_label = pred_boxlist.get_field("labels").numpy()
pred_score = pred_boxlist.get_field("scores").numpy()
# get the ground truth information for this class
if eval_attributes:
gt_mask_l = np.array([classindex in i for i in gt_label])
else:
gt_mask_l = gt_label == classindex
gt_bbox_l = gt_bbox[gt_mask_l]
gt_difficult_l = np.zeros(gt_bbox_l.shape[0], dtype=bool)
det = [False] * gt_bbox_l.shape[0]
npos = npos + sum(~gt_difficult_l)
class_recs[image_index] = {'bbox': gt_bbox_l,
'difficult': gt_difficult_l,
'det': det}
# prediction output for each class
# pdb.set_trace()
if eval_attributes:
pred_mask_l = np.logical_and(pred_label == classindex, np.not_equal(pred_score, 0.0)).nonzero()
pred_bbox_l = pred_bbox[pred_mask_l[0]]
pred_score_l = pred_score[pred_mask_l]
else:
pred_mask_l = pred_label == classindex
pred_bbox_l = pred_bbox[pred_mask_l]
pred_score_l = pred_score[pred_mask_l]
for bbox_tmp, score_tmp in zip(pred_bbox_l, pred_score_l):
image_ids.append(image_index)
confidence.append(float(score_tmp))
BB.append([float(z) for z in bbox_tmp])
if npos == 0:
# No ground truth examples
return 0, 0, 0, 0, npos
if len(confidence) == 0:
# No detection examples
return 0, 0, 0, 0, npos
confidence = np.array(confidence)
BB = np.array(BB)
# sort by confidence
sorted_ind = np.argsort(-confidence)
sorted_scores = -np.sort(-confidence)
BB = BB[sorted_ind, :]
image_ids = [image_ids[x] for x in sorted_ind]
# go down dets and mark TPs and FPs
nd = len(image_ids)
tp = np.zeros(nd)
fp = np.zeros(nd)
for d in range(nd):
R = class_recs[image_ids[d]]
bb = BB[d, :].astype(float)
ovmax = -np.inf
BBGT = R['bbox'].astype(float)
if BBGT.size > 0:
# compute overlaps
# intersection
ixmin = np.maximum(BBGT[:, 0], bb[0])
iymin = np.maximum(BBGT[:, 1], bb[1])
ixmax = np.minimum(BBGT[:, 2], bb[2])
iymax = np.minimum(BBGT[:, 3], bb[3])
iw = np.maximum(ixmax - ixmin + 1., 0.)
ih = np.maximum(iymax - iymin + 1., 0.)
inters = iw * ih
# union
uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
(BBGT[:, 2] - BBGT[:, 0] + 1.) *
(BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
overlaps = inters / uni
ovmax = np.max(overlaps)
jmax = np.argmax(overlaps)
if ovmax > iou_thresh:
if not R['difficult'][jmax]:
if not R['det'][jmax]:
tp[d] = 1.
R['det'][jmax] = 1
else:
fp[d] = 1.
else:
fp[d] = 1.
# compute precision recall
fp = np.cumsum(fp)
tp = np.cumsum(tp)
rec = tp / float(npos)
# avoid divide by zero in case the first detection matches a difficult
# ground truth
prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
ap = voc_ap(rec, prec, use_07_metric)
return rec, prec, ap, sorted_scores, npos
def voc_ap(rec, prec, use_07_metric=False):
""" ap = voc_ap(rec, prec, [use_07_metric])
Compute VOC AP given precision and recall.
If use_07_metric is true, uses the
VOC 07 11 point method (default:False).
"""
if use_07_metric:
# 11 point metric
ap = 0.
for t in np.arange(0., 1.1, 0.1):
if np.sum(rec >= t) == 0:
p = 0
else:
p = np.max(prec[rec >= t])
ap = ap + p / 11.
else:
# correct AP calculation
# first append sentinel values at the end
mrec = np.concatenate(([0.], rec, [1.]))
mpre = np.concatenate(([0.], prec, [0.]))
# compute the precision envelope
for i in range(mpre.size - 1, 0, -1):
mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
# to calculate area under PR curve, look for points
# where X axis (recall) changes value
i = np.where(mrec[1:] != mrec[:-1])[0]
# and sum (\Delta recall) * prec
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
return ap
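# Worked example (illustrative comment, not part of the original code): for the toy
# PR curve rec = [0.5, 1.0], prec = [1.0, 0.5], the precision envelope is [1.0, 0.5],
# so the area-under-curve AP is 0.5 * 1.0 + 0.5 * 0.5 = 0.75, while the VOC07
# 11-point metric averages the max precision at recall thresholds 0.0, 0.1, ..., 1.0
# and gives (6 * 1.0 + 5 * 0.5) / 11 ~= 0.7727:
#   >>> import numpy as np
#   >>> voc_ap(np.array([0.5, 1.0]), np.array([1.0, 0.5]))
#   0.75
#   >>> round(voc_ap(np.array([0.5, 1.0]), np.array([1.0, 0.5]), use_07_metric=True), 4)
#   0.7727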
def calc_detection_voc_ap(prec, rec, use_07_metric=False):
"""Calculate average precisions based on evaluation code of PASCAL VOC.
This function calculates average precisions
from given precisions and recalls.
The code is based on the evaluation code used in PASCAL VOC Challenge.
Args:
prec (list of numpy.array): A list of arrays.
:obj:`prec[l]` indicates precision for class :math:`l`.
If :obj:`prec[l]` is :obj:`None`, this function returns
:obj:`numpy.nan` for class :math:`l`.
rec (list of numpy.array): A list of arrays.
:obj:`rec[l]` indicates recall for class :math:`l`.
If :obj:`rec[l]` is :obj:`None`, this function returns
:obj:`numpy.nan` for class :math:`l`.
use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
for calculating average precision. The default value is
:obj:`False`.
Returns:
~numpy.ndarray:
This function returns an array of average precisions.
The :math:`l`-th value corresponds to the average precision
for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is
:obj:`None`, the corresponding value is set to :obj:`numpy.nan`.
"""
n_fg_class = len(prec)
ap = np.empty(n_fg_class)
for l in range(n_fg_class):
if prec[l] is None or rec[l] is None:
ap[l] = np.nan
continue
if use_07_metric:
# 11 point metric
ap[l] = 0
for t in np.arange(0.0, 1.1, 0.1):
if np.sum(rec[l] >= t) == 0:
p = 0
else:
p = np.max(np.nan_to_num(prec[l])[rec[l] >= t])
ap[l] += p / 11
else:
# correct AP calculation
# first append sentinel values at the end
mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0]))
mrec = np.concatenate(([0], rec[l], [1]))
mpre = np.maximum.accumulate(mpre[::-1])[::-1]
# to calculate area under PR curve, look for points
# where X axis (recall) changes value
i = np.where(mrec[1:] != mrec[:-1])[0]
# and sum (\Delta recall) * prec
ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
return ap

View file

@ -11,6 +11,11 @@ from .utils.label_loader import LabelLoader
from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist
def sort_key_by_val(dic):
sorted_dic = sorted(dic.items(), key=lambda kv: kv[1])
return [kv[0] for kv in sorted_dic]
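# Example (illustrative comment, added for clarity): keys come back ordered by their
# integer ids, which is how self.classes / self.attributes / self.relations below are
# built from the labelmap dicts:
#   >>> sort_key_by_val({'person': 1, '__background__': 0, 'dog': 2})
#   ['__background__', 'person', 'dog']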
class RelationTSVDataset(TSVYamlDataset):
"""
Generic TSV dataset format for Object Detection.
@ -28,7 +33,7 @@ class RelationTSVDataset(TSVYamlDataset):
self.contrastive_loss_on = kwargs['args'].MODEL.ROI_RELATION_HEAD.CONTRASTIVE_LOSS.USE_FLAG if kwargs['args'] is not None else False
# construct maps
jsondict_file = find_file_path_in_yaml(self.cfg.get("labelmap", None), self.root)
jsondict_file = find_file_path_in_yaml(self.cfg.get("labelmap", self.cfg.get("jsondict", None)), self.root) # previous version use jsondict
jsondict = json.load(open(jsondict_file, 'r'))
self.labelmap = {}
@ -37,18 +42,21 @@ class RelationTSVDataset(TSVYamlDataset):
self.class_to_ind['__background__'] = 0
self.ind_to_class = {v:k for k, v in self.class_to_ind.items()}
self.labelmap['class_to_ind'] = self.class_to_ind
self.classes = sort_key_by_val(self.class_to_ind)
if self.attribute_on:
self.attribute_to_ind = jsondict['attribute_to_idx']
self.attribute_to_ind['__no_attribute__'] = 0
self.ind_to_attribute = {v:k for k, v in self.attribute_to_ind.items()}
self.labelmap['attribute_to_ind'] = self.attribute_to_ind
self.attributes = sort_key_by_val(self.attribute_to_ind)
if self.relation_on:
self.relation_to_ind = jsondict['predicate_to_idx']
self.relation_to_ind['__no_relation__'] = 0
self.ind_to_relation = {v:k for k, v in self.relation_to_ind.items()}
self.labelmap['relation_to_ind'] = self.relation_to_ind
self.relations = sort_key_by_val(self.relation_to_ind)
if self.is_load_label or self.detector_pre_calculated:
self.label_loader = LabelLoader(

View file

@ -237,7 +237,29 @@ def inference(
total_timer = Timer()
inference_timer = Timer()
total_timer.tic()
predictions = compute_on_dataset(model, data_loader, device, bbox_aug, inference_timer)
output_pth_name = 'predictions_forcebox.pth' if eval_attributes else 'predictions.pth'
if output_folder and os.path.isfile(os.path.join(output_folder, output_pth_name)):
logger.info("Predictions.pth file exist in {}, skip computation".format(
os.path.join(output_folder, output_pth_name)))
if not is_main_process():
return
if cfg.TEST.SAVE_RESULTS_TO_TSV or not cfg.TEST.SKIP_PERFORMANCE_EVAL:
predictions = torch.load(os.path.join(output_folder, output_pth_name))
else:
if eval_attributes:
# change to force_boxes=True mode
force_boxes_model = model.force_boxes
force_boxes_box = model.roi_heads.box.post_processor.force_boxes
model.force_boxes = True
model.roi_heads.box.post_processor.force_boxes = True
predictions = compute_on_dataset(model, data_loader, device, bbox_aug,
inference_timer)
# return to the original state
model.force_boxes = force_boxes_model
model.roi_heads.box.post_processor.force_boxes = force_boxes_box
else:
predictions = compute_on_dataset(model, data_loader, device, bbox_aug, inference_timer)
# wait for all processes to complete before measuring the time
synchronize()
total_time = total_timer.toc()
@ -262,7 +284,7 @@ def inference(
return
if output_folder and save_predictions:
torch.save(predictions, os.path.join(output_folder, "predictions.pth"))
torch.save(predictions, os.path.join(output_folder, output_pth_name))
if output_folder and cfg.TEST.SAVE_RESULTS_TO_TSV:
logger.info("Convert prediction results to tsv format and save.")
@ -281,11 +303,16 @@ def inference(
extra_args = dict(
box_only=box_only,
eval_attributes=eval_attributes,
iou_types=iou_types,
expected_results=expected_results,
expected_results_sigma_tol=expected_results_sigma_tol,
save_predictions=save_predictions
)
if hasattr(cfg.MODEL, 'RELATION_ON'):
extra_args['sg_eval'] = cfg.MODEL.RELATION_ON
else:
extra_args['sg_eval'] = False
return evaluate(dataset=dataset,
predictions=predictions,

View file

@ -13,8 +13,8 @@ from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.utils.comm import get_world_size, synchronize
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
from maskrcnn_benchmark.engine.inference import inference
from maskrcnn_benchmark.utils.amp import autocast, GradScaler
from apex import amp
def reduce_loss_dict(loss_dict):
"""
@ -63,6 +63,9 @@ def do_train(
start_training_time = time.time()
end = time.time()
if cfg.SOLVER.USE_AMP:
scaler = GradScaler()
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
@ -84,7 +87,11 @@ def do_train(
images = images.to(device)
# targets = [target.to(device) for target in targets]
loss_dict = model(images, targets)
if cfg.SOLVER.USE_AMP:
with autocast():
loss_dict = model(images, targets)
else:
loss_dict = model(images, targets)
# take care of additional metric besides loss returned from model
if type(loss_dict) == tuple:
@ -101,12 +108,13 @@ def do_train(
meters.update(loss=losses_reduced, **loss_dict_reduced)
optimizer.zero_grad()
# # Note: If mixed precision is not used, this ends up doing nothing
# # Otherwise apply loss scaling for mixed-precision recipe
# with amp.scale_loss(losses, optimizer) as scaled_losses:
# scaled_losses.backward()
losses.backward()
optimizer.step()
if cfg.SOLVER.USE_AMP:
scaler.scale(losses).backward()
scaler.step(optimizer)
scaler.update()
else:
losses.backward()
optimizer.step()
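# Added note (not in the original diff): this is the standard native torch.cuda.amp
# recipe that replaces apex.amp -- scaler.scale(losses).backward() backpropagates the
# scaled loss, scaler.step(optimizer) unscales the gradients and skips the update if
# they contain inf/NaN, and scaler.update() adapts the loss scale for the next
# iteration. With cfg.SOLVER.USE_AMP off, the loop is the usual fp32 backward/step.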
scheduler.step()
batch_time = time.time() - end
@ -116,7 +124,7 @@ def do_train(
eta_seconds = meters.time.global_avg * (max_iter - iteration)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if iteration % 1 == 0 or iteration == max_iter:
if iteration % cfg.LOG_LOSS_PERIOD == 0 or iteration == max_iter:
logger.info(
meters.delimiter.join(
[

View file

@ -2,6 +2,9 @@
import torch
from torch import nn
import torch.distributed as dist
import maskrcnn_benchmark.utils.comm as comm
from torch.autograd.function import Function
class FrozenBatchNorm2d(nn.Module):
"""
@ -17,15 +20,98 @@ class FrozenBatchNorm2d(nn.Module):
self.register_buffer("running_var", torch.ones(n))
def forward(self, x):
# Cast all fixed parameters to half() if necessary
if x.dtype == torch.float16:
self.weight = self.weight.half()
self.bias = self.bias.half()
self.running_mean = self.running_mean.half()
self.running_var = self.running_var.half()
scale = self.weight * self.running_var.rsqrt()
bias = self.bias - self.running_mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
return x * scale + bias
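# Added note: the two fused lines above are algebraically the frozen affine batch norm
#   y = (x - running_mean) / sqrt(running_var) * weight + bias
# rewritten with scale = weight * rsqrt(running_var) and bias' = bias - running_mean * scale
# (no eps term, matching the buffers registered in __init__).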
class AllReduce(Function):
@staticmethod
def forward(ctx, input):
input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())]
# Use allgather instead of allreduce since I don't trust in-place operations ..
dist.all_gather(input_list, input, async_op=False)
inputs = torch.stack(input_list, dim=0)
return torch.sum(inputs, dim=0)
@staticmethod
def backward(ctx, grad_output):
dist.all_reduce(grad_output, async_op=False)
return grad_output
class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
"""
In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient
when the batch size on each worker is different.
(e.g., when scale augmentation is used, or when it is applied to mask head).
This is a slower but correct alternative to `nn.SyncBatchNorm`.
Note:
There isn't a single definition of Sync BatchNorm.
When ``stats_mode==""``, this module computes overall statistics by using
statistics of each worker with equal weight. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (N, H, W). This mode does not support inputs with zero batch size.
When ``stats_mode=="N"``, this module computes overall statistics by weighting
the statistics of each worker by their ``N``. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (H, W). It is slower than ``stats_mode==""``.
Even though the result of this module may not be the true statistics of all samples,
it may still be reasonable because it might be preferable to assign equal weights
to all workers, regardless of their (H, W) dimension, instead of putting larger weight
on larger images. From preliminary experiments, little difference is found between such
a simplified implementation and an accurate computation of overall mean & variance.
"""
def __init__(self, *args, stats_mode="", **kwargs):
super().__init__(*args, **kwargs)
assert stats_mode in ["", "N"]
self._stats_mode = stats_mode
def forward(self, input):
if comm.get_world_size() == 1 or not self.training:
return super().forward(input)
B, C = input.shape[0], input.shape[1]
mean = torch.mean(input, dim=[0, 2, 3])
meansqr = torch.mean(input * input, dim=[0, 2, 3])
if self._stats_mode == "":
assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
vec = torch.cat([mean, meansqr], dim=0)
vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
mean, meansqr = torch.split(vec, C)
momentum = self.momentum
else:
if B == 0:
vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
vec = vec + input.sum() # make sure there is gradient w.r.t input
else:
vec = torch.cat(
[mean, meansqr, torch.ones([1], device=mean.device, dtype=mean.dtype)], dim=0
)
vec = AllReduce.apply(vec * B)
total_batch = vec[-1].detach()
momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0
total_batch = torch.max(total_batch, torch.ones_like(total_batch)) # avoid div-by-zero
mean, meansqr, _ = torch.split(vec / total_batch, C)
var = meansqr - mean * mean
invstd = torch.rsqrt(var + self.eps)
scale = self.weight * invstd
bias = self.bias - mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
self.running_mean += momentum * (mean.detach() - self.running_mean)
self.running_var += momentum * (var.detach() - self.running_var)
return input * scale + bias
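# Illustrative usage sketch (assumption, not part of the original diff): the layer is a
# drop-in replacement for nn.BatchNorm2d in distributed training, e.g.
#   norm = NaiveSyncBatchNorm2d(256, stats_mode="N")
#   y = norm(torch.randn(2, 256, 32, 32))
# With world_size == 1, or in eval mode, forward() falls back to the plain BatchNorm2d
# path above, so single-GPU behaviour is unchanged.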

View file

@ -1,12 +1,12 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# from ._utils import _C
from maskrcnn_benchmark import _C
# from apex import amp
try:
import torchvision
from torchvision.ops import nms
except ImportError:
nms = _C.nms
# Only valid with fp32 inputs - give AMP the hint
# nms = amp.float_function(_C.nms)
nms = _C.nms
# nms.__doc__ = """
# This function performs Non-maximum suppresion"""

View file

@ -7,8 +7,6 @@ from torch.nn.modules.utils import _pair
from maskrcnn_benchmark import _C
# from apex import amp
class _ROIAlign(Function):
@staticmethod
def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
@ -44,8 +42,11 @@ class _ROIAlign(Function):
)
return grad_input, None, None, None, None
roi_align = _ROIAlign.apply
try:
import torchvision
from torchvision.ops import roi_align
except ImportError:
roi_align = _ROIAlign.apply
class ROIAlign(nn.Module):
def __init__(self, output_size, spatial_scale, sampling_ratio):
@ -54,7 +55,6 @@ class ROIAlign(nn.Module):
self.spatial_scale = spatial_scale
self.sampling_ratio = sampling_ratio
# @amp.float_function
def forward(self, input, rois):
return roi_align(
input, rois, self.output_size, self.spatial_scale, self.sampling_ratio

View file

@ -7,7 +7,6 @@ from torch.nn.modules.utils import _pair
from maskrcnn_benchmark import _C
# from apex import amp
class _ROIPool(Function):
@staticmethod
@ -53,7 +52,6 @@ class ROIPool(nn.Module):
self.output_size = output_size
self.spatial_scale = spatial_scale
# @amp.float_function
def forward(self, input, rois):
return roi_pool(input, rois, self.output_size, self.spatial_scale)

View file

@ -57,7 +57,6 @@ class SigmoidFocalLoss(nn.Module):
self.alpha = alpha
def forward(self, logits, targets):
device = logits.device
if logits.is_cuda:
loss_func = sigmoid_focal_loss_cuda
else:

View file

@ -8,6 +8,7 @@ from maskrcnn_benchmark.modeling import registry
from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform
from . import fpn as fpn_module
from . import resnet
from .msvit import build_msvit_backbone
@registry.BACKBONES.register("R-50-C4")
@ -73,6 +74,15 @@ def build_resnet_fpn_p3p7_backbone(cfg):
return model
@registry.BACKBONES.register("ViL-C4")
def build_vilc4_backbone(cfg):
assert len(cfg.MODEL.TRANSFORMER.OUT_FEATURES) == 1, "The number of OUT_FEATURES in ViL-C4 is not 1!"
body = build_msvit_backbone(cfg)
model = nn.Sequential(OrderedDict([("body", body)]))
model.out_channels = body.out_planes
return model
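# Illustrative config sketch (assumption; the exact defaults live elsewhere in the cfg):
# selecting this backbone would look like
#   MODEL.BACKBONE.CONV_BODY: "ViL-C4"
#   MODEL.TRANSFORMER.OUT_FEATURES: ["layer3"]   # exactly one stage, per the assert above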
def build_backbone(cfg):
assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \
"cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format(

View file

@ -0,0 +1,286 @@
# Copyright (c) 2021 Microsoft Corporation. Licensed under the MIT license.
# Written by Pengchuan Zhang, penzhan@microsoft.com
import random
import torch
from torch import nn, einsum
import torch.nn.functional as F
from einops import rearrange
from timm.models.layers import trunc_normal_
from .slidingchunk_2d import slidingchunk_2d, mask_invalid_locations, slidingchunk_2dautograd
class Long2DSCSelfAttention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., w=7, d=1,
autoregressive=False, sharew=False, nglo=1, only_glo=False, exact=0, autograd=False, rpe=False,
mode=0):
super().__init__()
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = qk_scale or self.head_dim ** -0.5
self.Nglo = nglo
self.only_glo = only_glo
if self.only_glo:
assert self.Nglo >= 1, "Nglo == 0 in the only global mode!"
self.query = nn.Linear(dim, dim, bias=qkv_bias)
self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
self.proj = nn.Linear(dim, dim)
if nglo >= 1:
if sharew:
self.query_global = self.query
self.kv_global = self.kv
self.proj_global = self.proj
else:
self.query_global = nn.Linear(dim, dim, bias=qkv_bias)
self.kv_global = nn.Linear(dim, dim * 2, bias=qkv_bias)
self.proj_global = nn.Linear(dim, dim)
self.attn_drop = nn.Dropout(attn_drop)
self.proj_drop = nn.Dropout(proj_drop)
self.attention_window = w
self.attention_dilation = d
self.autoregressive = autoregressive
assert self.attention_dilation == 1, "Dilation is not supported!"
assert not self.autoregressive, "Autoregressive is not supported yet!"
self.exact = exact
# use autograd or handgrad
self.longform2d_mm = slidingchunk_2dautograd if autograd else slidingchunk_2d
# Inspired by swin transformer:
# https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py#L88-L103
# define parameter tables for local and global relative position bias
self.rpe = rpe
if rpe:
self.local_relative_position_bias_table = nn.Parameter(
torch.zeros((2 * 2 * w - 1) * (2 * 2 * w - 1), num_heads)) # (4*w-1, 4*w-1, nH)
trunc_normal_(self.local_relative_position_bias_table, std=.02)
if nglo >= 1:
self.g2l_relative_position_bias = nn.Parameter(
torch.zeros(2, num_heads, nglo)) # (2, nH, nglo)
self.g2g_relative_position_bias = nn.Parameter(
torch.zeros(num_heads, nglo, nglo)) # (nH, nglo, nglo)
trunc_normal_(self.g2l_relative_position_bias, std=.02)
trunc_normal_(self.g2g_relative_position_bias, std=.02)
# get pair-wise relative position index
coords_h = torch.arange(-w, 2*w)
coords_w = torch.arange(-w, 2*w)
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, 3w, 3w
coords_unfold = rearrange(
coords, 'c (m x) (n y) -> c m n (x y)', x=w, y=w
) # 2, 3, 3, 9w^2
q_coords = coords_unfold[:, 1, 1, :] # 2, w^2
relative_coords = torch.cat([
# -1, -1
q_coords[:, :, None] - coords_unfold[:, 0, 0, :][:, None, :],
# -1, 0
q_coords[:, :, None] - coords_unfold[:, 0, 1, :][:, None, :],
# -1, 1
q_coords[:, :, None] - coords_unfold[:, 0, 2, :][:, None, :],
# 0,-1
q_coords[:, :, None] - coords_unfold[:, 1, 0, :][:, None, :],
# 0,0
q_coords[:, :, None] - q_coords[:, None, :],
# 0,1
q_coords[:, :, None] - coords_unfold[:, 1, 2, :][:, None, :],
# 1, -1
q_coords[:, :, None] - coords_unfold[:, 2, 0, :][:, None, :],
# 1, 0
q_coords[:, :, None] - coords_unfold[:, 2, 1, :][:, None, :],
# 1, 1
q_coords[:, :, None] - coords_unfold[:, 2, 2, :][:, None, :],
], dim=-1) # 2, w^2, 9w^2
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # w^2, 9w^2, 2
relative_coords[:, :, 0] += 2 * w - 1 # shift to start from 0
relative_coords[:, :, 1] += 2 * w - 1
relative_coords[:, :, 0] *= 2 * 2 * w - 1
relative_position_index = relative_coords.sum(-1) # w^2, 9w^2
self.register_buffer("relative_position_index", relative_position_index)
# mode to control the sampling strategy of neighbor blocks
# 0: all 8 blocks; -1: no neighbor block; >0: random sample one block
self.mode = mode
def forward(self, x, nx, ny):
B, N, C = x.shape
Nloc = nx * ny
Nglo, H, M, W = self.Nglo, self.num_heads, self.head_dim, self.attention_window
W2 = W ** 2
assert Nglo + Nloc == N, "Global dimension does not match!"
# get the mode of the longformer attention
mode = self.mode
kv_nums = 9 * W2
if self.mode > 0:
if self.training:
mode = random.randrange(1, 9) # 1 <= mode <= 8
kv_nums = 2 * W2
else:
mode = 0 # full during evaluation
elif mode == -1:
kv_nums = W2
# compute the local attention
q = self.scale * self.query(x[:, Nglo:]).reshape(B, Nloc, H, M).transpose(1, 2).contiguous()
kv = self.kv(x).reshape(B, N, 2, H, M).permute(2, 0, 3, 1, 4)
k, v = kv[0], kv[1] # make torchscript happy (cannot use tensor as tuple)
if self.only_glo:
# local to global attn10: (B, self.num_heads, Nloc, Nglo)
attn1 = torch.bmm(q.view(B*H, Nloc, M), k[:, :, :Nglo].reshape(B*H, Nglo, M).transpose(-2, -1)).view(B, H, Nloc, Nglo)
else:
(q_img, k_img, v_img) = map(
lambda t: rearrange(t, 'b h (x y) c -> (b h) c x y', x=nx),
(q, k[:, :, Nglo:], v[:, :, Nglo:]))
# pad 0's to make sure that nx % W == 0, ny % W == 0
(padx, pady) = map(lambda t: (W - t % W) % W, (nx, ny))
(mx, my) = map(lambda t: (t[0] + t[1]) // W,
((nx, padx), (ny, pady)))
if padx > 0 or pady > 0:
(q_img, k_img, v_img) = map(
lambda t: F.pad(t, (0, pady, 0, padx)), (q_img, k_img, v_img)
)
# unfold the padded tensor
(q_img, k_img, v_img) = map(
lambda t: rearrange(t, 'b c (m x) (n y) -> b c m n (x y)', x=W, y=W),
(q_img, k_img, v_img)
)
# local to global attn10: (B*H, mx, my, w^2, Nglo)
attn10 = einsum('b c m n l, b t c -> b m n l t', q_img,
k[:, :, :Nglo].reshape(B*H, Nglo, M))
# local to local attn11 (B*H, mx, my, W**2, 9*W**2), mode = 0
# attn11 (B*H, mx, my, W**2, W**2), mode = -1
# attn11 (B*H, mx, my, W**2, 2*W**2), mode > 0
attn11 = self.longform2d_mm(q_img, k_img, False, mode)
if self.rpe:
if Nglo >= 1:
# local to global bias
attn10 = attn10 + self.g2l_relative_position_bias[1].unsqueeze(0).expand(B, -1, -1).reshape(B*H, Nglo)[:, None, None, None, :]
# local to local bias
if mode == -1:
relative_position_index = self.relative_position_index[:, 4 * W2:5 * W2].contiguous()
elif mode == 0:
relative_position_index = self.relative_position_index
else: # mode > 0
chunk_id = mode if mode > 4 else mode - 1
relative_position_index = torch.cat([
self.relative_position_index[:, 4 * W2:5 * W2],
self.relative_position_index[:, chunk_id * W2:(chunk_id+1) * W2],
], dim=-1)
local_relative_position_bias = self.local_relative_position_bias_table[
relative_position_index.view(-1)].view(1, W2, kv_nums, -1) # w^2, kv_nums,H
local_relative_position_bias = local_relative_position_bias.permute(
0, 3, 1, 2).expand(B, -1, -1, -1).contiguous().view(B*H, W2, kv_nums) # B*H, w^2, kv_nums
attn11 = attn11 + local_relative_position_bias[:, None, None, :, :]
num_invalid = mask_invalid_locations(
attn11, mx, my, padx, pady, W, exact=self.exact, mode=mode
)
attn1 = torch.cat((attn10, attn11), dim=-1)
attn1 = (attn1 - torch.max(attn1, dim=-1, keepdim=True)[0]).softmax(dim=-1)
attn1 = self.attn_drop(attn1)
# update x1: (B, self.num_heads, Nloc, self.head_dim)
if self.only_glo:
x1 = torch.bmm(
attn1.view(B * H, Nloc, Nglo), v[:, :, :Nglo].reshape(B * H, Nglo, M)
).view(B, H, Nloc, M)
else:
attnl2g = attn1[:, :, :, :, :Nglo]
x1 = self.longform2d_mm(attn1[:, :, :, :, Nglo:Nglo+kv_nums], v_img, True, mode)
if Nglo >= 1:
x1 = x1 + einsum(
'b m n l t, b t c -> b c m n l', attnl2g,
v[:, :, :Nglo].reshape(B * H, Nglo, M)
)
x1 = rearrange(x1, 'b c m n (x y) -> b (m x) (n y) c', x=W)
x1 = x1[:, :nx, :ny].reshape(B, H, Nloc, M)
x1 = x1.transpose(1, 2).reshape(B, Nloc, C)
try:
x1 = self.proj(x1)
except RuntimeError as e:
# guard against possible half vs float error
x1 = self.proj(x1.float())
if Nglo == 0:
return self.proj_drop(x1)
# compute the global attention; same as vanilla multi-head attention
q_global = self.scale * self.query_global(x[:, :Nglo]).reshape(B, Nglo, H, M).transpose(1, 2)
kv_global = self.kv_global(x).reshape(B, N, 2, H, M).permute(2, 0, 3, 1, 4)
k_global, v_global = kv_global[0], kv_global[1] # make torchscript happy (cannot use tensor as tuple)
# attention matrix
attn0 = torch.bmm(q_global.reshape(B*H, Nglo, M), k_global.reshape(B*H, N, M).transpose(-2, -1))
if self.rpe:
# relative position embedding of global tokens
global_relative_position_bias = torch.cat([
self.g2g_relative_position_bias,
self.g2l_relative_position_bias[0].unsqueeze(-1).expand(-1, -1, Nloc)
], dim=-1) # nH, nglo, N
attn0 = attn0 + global_relative_position_bias.unsqueeze(0).expand(B, -1, -1, -1).reshape(B*H, Nglo, N)
attn0 = (attn0 - torch.max(attn0, dim=-1, keepdim=True)[0]).softmax(dim=-1)
attn0 = self.attn_drop(attn0)
# context vector
x0 = torch.bmm(attn0, v_global.reshape(B*H, N, M)).view(B, H, Nglo, M).transpose(1, 2).reshape(B, Nglo, C)
x0 = self.proj_global(x0)
return self.proj_drop(torch.cat((x0, x1), dim=1))
@staticmethod
def compute_macs(module, input, output):
# T: num_token
# S: num_token
input = input[0]
_, T, C = input.shape
S = T
Nglo, H, M, W = module.Nglo, module.num_heads, module.head_dim, module.attention_window
macs = 0
n_params = 0
# Sliding window scaled-dot-product macs
if module.only_glo:
# local to global
# [B x T x (C-Nglo)] x [B x C x Nglo] --> [B x T x Nglo]
num_macs_kq = (C - Nglo) * Nglo * C
else:
# local to local
# [B x T x (C-Nglo)] x [B x C x (S-Nglo)] --> [B x (C-Nglo) x (9 * W**2)]
num_macs_kq = (C-Nglo) * (9 * W**2) * C
# local to global
# [B x T x (C-Nglo)] x [B x C x Nglo] --> [B x T x Nglo]
num_macs_kq += (C-Nglo) * Nglo * C
# global to all
# [B x T x Nglo] x [B x C x S] --> [B x Nglo x S]
num_macs_kq += Nglo * S * C
# same computational cost for attn * v -> context
num_macs_v = num_macs_kq
macs += num_macs_kq + num_macs_v
# print('macs att', macs / 1e8)
# self attention: T should be equal to S
assert T == S
# by default, we share weights for local and global tokens
q_params = sum([p.numel() for p in module.query.parameters()])
kv_params = sum([p.numel() for p in module.kv.parameters()])
n_params += q_params + kv_params
# multiply by Seq length
macs += (q_params + kv_params) * T
# print('macs qkv', qkv_params * T / 1e8)
# by default, we share weights for local and global tokens
proj_params = sum([p.numel() for p in module.proj.parameters()])
n_params += proj_params
macs += (proj_params * T)
# print('macs proj', proj_params * T / 1e8)
module.__flops__ += macs
# return n_params, macs

View file

@ -0,0 +1,657 @@
import math
from functools import partial
import logging
import torch
from torch import nn
from timm.models.layers import DropPath, trunc_normal_, to_2tuple
from .longformer2d import Long2DSCSelfAttention
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None,
act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None,
attn_drop=0., proj_drop=0.,
rpe=False, wx=14, wy=14, nglo=1):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = qk_scale or head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
# Inspired by swin transformer:
# https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py#L88-L103
# define parameter tables for local and global relative position bias
self.rpe = rpe
if rpe:
self.wx = wx
self.wy = wy
self.nglo = nglo
self.local_relative_position_bias_table = nn.Parameter(
torch.zeros((2 * wx - 1) * (2 * wy - 1),
num_heads)) # (2*wx-1, 2*wy-1, nH)
trunc_normal_(self.local_relative_position_bias_table, std=.02)
if nglo >= 1:
self.g2l_relative_position_bias = nn.Parameter(
torch.zeros(2, num_heads, nglo)) # (2, nH, nglo)
self.g2g_relative_position_bias = nn.Parameter(
torch.zeros(num_heads, nglo, nglo)) # (nH, nglo, nglo)
trunc_normal_(self.g2l_relative_position_bias, std=.02)
trunc_normal_(self.g2g_relative_position_bias, std=.02)
# get pair-wise relative position index
coords_h = torch.arange(wx)
coords_w = torch.arange(wy)
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, wx, wy
coords_flatten = torch.flatten(coords, 1) # 2, Wx*Wy
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wx*Wy, Wx*Wy
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wx*Wy, Wx*Wy, 2
relative_coords[:, :, 0] += wx - 1 # shift to start from 0
relative_coords[:, :, 1] += wy - 1
relative_coords[:, :, 0] *= 2 * wy - 1
relative_position_index = relative_coords.sum(-1) # Wx*Wy, Wx*Wy
self.register_buffer("relative_position_index", relative_position_index)
def forward(self, x, nx=None, ny=None):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
attn = (q @ k.transpose(-2, -1)) * self.scale
if self.rpe:
assert N == self.nglo + self.wx*self.wy, "For relative position, N != self.nglo + self.wx*self.wy!"
local_relative_position_bias = self.local_relative_position_bias_table[
self.relative_position_index.view(-1)].view(
self.wx*self.wy, self.wx*self.wy, -1) # Wh*Ww, Wh*Ww,nH
relative_position_bias = local_relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
if self.nglo > 0:
# relative position embedding of global tokens
global_relative_position_bias = torch.cat([
self.g2g_relative_position_bias,
self.g2l_relative_position_bias[0].unsqueeze(-1).expand(-1, -1, self.wx*self.wy)
], dim=-1) # nH, nglo, N
# relative position embedding of local tokens
local_relative_position_bias = torch.cat([
self.g2l_relative_position_bias[1].unsqueeze(1).expand(-1, self.wx*self.wy, -1),
relative_position_bias,
], dim=-1) # nH, Wh*Ww, N
relative_position_bias = torch.cat([
global_relative_position_bias,
local_relative_position_bias,
], dim=1) # nH, N, N
attn = attn + relative_position_bias.unsqueeze(0)
attn = (attn - torch.max(attn, dim=-1, keepdim=True)[0]).softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
@staticmethod
def compute_macs(module, input, output):
# T: num_token
# S: num_token
input = input[0]
_, T, C = input.shape
S = T
macs = 0
n_params = 0
# Scaled-dot-product macs
# [B x T x C] x [B x C x S] --> [B x T x S]
# multiplication-addition is counted as 1 because operations can be fused
num_macs_kq = T * S * C
# [B x T x S] x [B x S x C] --> [B x T x C]
num_macs_v = T * C * S
macs += num_macs_kq + num_macs_v
# print('macs att', macs / 1e8)
# self attention: T should be equal to S
assert T == S
qkv_params = sum([p.numel() for p in module.qkv.parameters()])
n_params += qkv_params
# multiply by Seq length
macs += qkv_params * T
# print('macs qkv', qkv_params * T / 1e8)
proj_params = sum([p.numel() for p in module.proj.parameters()])
n_params += proj_params
macs += (proj_params * T)
# print('macs proj', proj_params * T / 1e8)
module.__flops__ += macs
# return n_params, macs
class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""
def __init__(self, patch_size, nx, ny, in_chans=3, embed_dim=768, nglo=1,
norm_layer=partial(nn.LayerNorm, eps=1e-6), norm_embed=True,
drop_rate=0.0, ape=True):
# maximal global/x-direction/y-direction tokens: nglo, nx, ny
super().__init__()
patch_size = to_2tuple(patch_size)
self.patch_size = patch_size
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size,
stride=patch_size)
self.norm_embed = norm_layer(embed_dim) if norm_embed else None
self.nx = nx
self.ny = ny
self.Nglo = nglo
if nglo >= 1:
self.cls_token = nn.Parameter(torch.zeros(1, nglo, embed_dim))
trunc_normal_(self.cls_token, std=.02)
else:
self.cls_token = None
self.ape = ape
if ape:
self.cls_pos_embed = nn.Parameter(torch.zeros(1, nglo, embed_dim))
self.x_pos_embed = nn.Parameter(torch.zeros(1, nx, embed_dim // 2))
self.y_pos_embed = nn.Parameter(torch.zeros(1, ny, embed_dim // 2))
trunc_normal_(self.cls_pos_embed, std=.02)
trunc_normal_(self.x_pos_embed, std=.02)
trunc_normal_(self.y_pos_embed, std=.02)
self.pos_drop = nn.Dropout(p=drop_rate)
def forward(self, xtuple):
x, nx, ny = xtuple
B = x.shape[0]
x = self.proj(x)
nx, ny = x.shape[-2:]
x = x.flatten(2).transpose(1, 2)
assert nx <= self.nx and ny <= self.ny, "Input size {} {} should <= {} {}!".format(nx, ny, self.nx, self.ny)
if self.norm_embed:
x = self.norm_embed(x)
# concat cls_token
if self.cls_token is not None:
cls_tokens = self.cls_token.expand(
B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
x = torch.cat((cls_tokens, x), dim=1)
if self.ape:
# add position embedding
i = torch.arange(nx, device=x.device)
j = torch.arange(ny, device=x.device)
x_emb = self.x_pos_embed[:, i, :]
y_emb = self.y_pos_embed[:, j, :]
pos_embed_2d = torch.cat([
x_emb.unsqueeze(2).expand(-1, -1, ny, -1),
y_emb.unsqueeze(1).expand(-1, nx, -1, -1),
], dim=-1).flatten(start_dim=1, end_dim=2)
x = x + torch.cat([self.cls_pos_embed, pos_embed_2d], dim=1).expand(
B, -1, -1)
x = self.pos_drop(x)
return x, nx, ny
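# Shape sketch (illustrative comment, added for clarity): with patch_size=4, nglo=1 and
# embed_dim=96, a (2, 3, 64, 64) input becomes 16x16 patch tokens plus one global token:
#   embed = PatchEmbed(4, 16, 16, in_chans=3, embed_dim=96, nglo=1)
#   tokens, nx, ny = embed((torch.randn(2, 3, 64, 64), None, None))
#   # tokens: (2, 1 + 16 * 16, 96), nx == ny == 16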
def init_(tensor):
dim = tensor.shape[-1]
std = 1 / math.sqrt(dim)
tensor.uniform_(-std, std)
return tensor
# for Performer, start
def get_module_device(module):
return next(module.parameters()).device
def find_modules(nn_module, type):
return [module for module in nn_module.modules() if isinstance(module, type)]
# for Performer, end
class AttnBlock(nn.Module):
""" Meta Attn Block
"""
def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, drop=0.,
attn_drop=0.,
drop_path=0., norm_layer=nn.LayerNorm,
attn_type='full', w=7, d=1, sharew=False, nglo=1,
only_glo=False,
seq_len=None, num_feats=256, share_kv=False, sw_exact=0,
rratio=2, rpe=False, wx=14, wy=14,
mode=0):
super().__init__()
self.norm = norm_layer(dim)
if attn_type == 'full':
self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias,
qk_scale=qk_scale, attn_drop=attn_drop,
proj_drop=drop,
rpe=rpe, wx=wx, wy=wy, nglo=nglo)
elif attn_type == 'longformerhand':
self.attn = Long2DSCSelfAttention(
dim, exact=sw_exact, num_heads=num_heads, qkv_bias=qkv_bias,
qk_scale=qk_scale, attn_drop=attn_drop,
proj_drop=drop, w=w, d=d, sharew=sharew,
nglo=nglo, only_glo=only_glo, autograd=False,
rpe=rpe, mode=mode
)
elif attn_type == 'longformerauto':
self.attn = Long2DSCSelfAttention(
dim, exact=sw_exact, num_heads=num_heads, qkv_bias=qkv_bias,
qk_scale=qk_scale, attn_drop=attn_drop,
proj_drop=drop, w=w, d=d, sharew=sharew,
nglo=nglo, only_glo=only_glo, autograd=True,
rpe=rpe, mode=mode
)
else:
raise ValueError(
"Not supported attention type {}".format(attn_type))
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
def forward(self, xtuple):
x, nx, ny = xtuple
x = x + self.drop_path(self.attn(self.norm(x), nx, ny))
return x, nx, ny
class MlpBlock(nn.Module):
""" Meta MLP Block
"""
def __init__(self, dim, out_dim=None, mlp_ratio=4., drop=0., drop_path=0.,
act_layer=nn.GELU, norm_layer=nn.LayerNorm):
super().__init__()
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
out_features=out_dim, act_layer=act_layer, drop=drop)
self.shortcut = nn.Identity()
if out_dim is not None and out_dim != dim:
self.shortcut = nn.Sequential(nn.Linear(dim, out_dim),
nn.Dropout(drop))
def forward(self, xtuple):
x, nx, ny = xtuple
x = self.shortcut(x) + self.drop_path(self.mlp(self.norm(x)))
return x, nx, ny
def parse_arch(layer_cfgstr):
layer_cfg = {'l': 1, 'h': 3, 'd': 192, 'n': 1, 's': 1, 'g': 1,
'p': 2, 'f': 7, 'a': 0} # defaults
for attr in layer_cfgstr.split(','):
layer_cfg[attr[0]] = int(attr[1:])
return layer_cfg
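# Example (illustrative comment, added for clarity): each '_'-separated stage string of
# the MsViT arch overrides these defaults, so
#   >>> parse_arch('l1,h3,d96,n2,p4')
#   {'l': 1, 'h': 3, 'd': 96, 'n': 2, 's': 1, 'g': 1, 'p': 4, 'f': 7, 'a': 0}
# i.e. stage 1 with 3 heads, dim 96, 2 blocks and patch stride 4, with the remaining
# fields (s, g, f, a) left at their defaults.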
class MsViT(nn.Module):
""" Multiscale Vision Transformer with support for patch or hybrid CNN input stage
"""
def __init__(self, arch, img_size=512, in_chans=3,
num_classes=1000,
qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
norm_embed=False, w=7, d=1, sharew=False, only_glo=False,
share_kv=False,
attn_type='longformerhand', sw_exact=0, mode=0,
out_features=None,
freeze_at=0, #detectron2
**args):
super().__init__()
self.num_classes = num_classes
if 'ln_eps' in args:
ln_eps = args['ln_eps']
self.norm_layer = partial(nn.LayerNorm, eps=ln_eps)
logging.info("Customized LayerNorm EPS: {}".format(ln_eps))
else:
self.norm_layer = norm_layer
self.drop_path_rate = drop_path_rate
self.attn_type = attn_type
self.attn_args = dict({
'attn_type': attn_type,
'qkv_bias': qkv_bias,
'qk_scale': qk_scale,
'drop': drop_rate,
'attn_drop': attn_drop_rate,
'w': w,
'd': d,
'sharew': sharew,
'only_glo': only_glo,
'share_kv': share_kv,
'sw_exact': sw_exact,
'norm_layer': norm_layer,
'mode': mode,
})
self.patch_embed_args = dict({
'norm_layer': norm_layer,
'norm_embed': norm_embed,
'drop_rate': drop_rate,
})
self.mlp_args = dict({
'mlp_ratio': 4.0,
'norm_layer': norm_layer,
'act_layer': nn.GELU,
'drop': drop_rate,
})
# Attributes for maskrcnn
assert out_features, "out_features is empty!"
self._out_feature_strides = []
self._out_feature_channels = []
self._out_features = out_features
self.frozen_stages = freeze_at
self.layer_cfgs = [parse_arch(layer) for layer in arch.split('_')]
self.num_layers = len(self.layer_cfgs)
self.depth = sum([cfg['n'] for cfg in self.layer_cfgs])
self.out_planes = self.layer_cfgs[-1]['d']
self.Nglos = [cfg['g'] for cfg in self.layer_cfgs]
self.avg_pool = args['avg_pool'] if 'avg_pool' in args else False
# ensure divisibility
stride = 1
down_strides = []
for cfg in self.layer_cfgs:
stride *= cfg['p']
down_strides.append(stride)
self._size_divisibility = stride
self.Nx = (img_size + (stride - 1)) // stride * stride
self.Ny = (img_size + (stride - 1)) // stride * stride
dprs = torch.linspace(0, drop_path_rate, self.depth).split(
[cfg['n'] for cfg in self.layer_cfgs]
) # stochastic depth decay rule
self.layer1 = self._make_layer(in_chans, self.layer_cfgs[0],
dprs=dprs[0], layerid=1)
if "layer1" in self._out_features:
self._out_feature_strides.append(down_strides[0])
self._out_feature_channels.append(self.layer_cfgs[0]['d'])
self.layer2 = self._make_layer(self.layer_cfgs[0]['d'],
self.layer_cfgs[1], dprs=dprs[1],
layerid=2)
if "layer2" in self._out_features:
self._out_feature_strides.append(down_strides[1])
self._out_feature_channels.append(self.layer_cfgs[1]['d'])
self.layer3 = self._make_layer(self.layer_cfgs[1]['d'],
self.layer_cfgs[2], dprs=dprs[2],
layerid=3)
if "layer3" in self._out_features:
self._out_feature_strides.append(down_strides[2])
self._out_feature_channels.append(self.layer_cfgs[2]['d'])
if self.num_layers == 3:
self.layer4 = None
elif self.num_layers == 4:
self.layer4 = self._make_layer(self.layer_cfgs[2]['d'],
self.layer_cfgs[3], dprs=dprs[3],
layerid=4)
if "layer4" in self._out_features:
self._out_feature_strides.append(down_strides[3])
self._out_feature_channels.append(self.layer_cfgs[3]['d'])
else:
raise ValueError("Numer of layers {} not implemented yet!".format(self.num_layers))
assert self._size_divisibility==stride, "Some stride down layer has been ignored!"
self.apply(self._init_weights)
def _freeze_stages(self):
if self.frozen_stages <= 0:
return
if self.frozen_stages >= 1:
# freeze the first patch embedding layer
self.layer1[0].eval()
for param in self.layer1[0].parameters():
param.requires_grad = False
if self.frozen_stages >= 2:
# freeze layer1 to layer{frozen_stages-1}
for i in range(1, self.frozen_stages):
m = getattr(self, "layer" + str(i))
m.eval()
for param in m.parameters():
param.requires_grad = False
def train(self, mode=True):
"""Convert the model into training mode while keep layers freezed."""
super(MsViT, self).train(mode)
self._freeze_stages()
def reset_vil_mode(self, mode):
longformer_attentions = find_modules(self, Long2DSCSelfAttention)
for longformer_attention in longformer_attentions:
mode_old = longformer_attention.mode
if mode_old != mode:
longformer_attention.mode = mode
logging.info(
"Change vil attention mode from {} to {} in " "layer {}"
.format(mode_old, mode, longformer_attention))
return
@property
def size_divisibility(self):
return self._size_divisibility
def _make_layer(self, in_dim, layer_cfg, dprs, layerid=0):
layer_id, num_heads, dim, num_block, is_sparse_attn, nglo, patch_size, num_feats, ape \
= layer_cfg['l'], layer_cfg['h'], layer_cfg['d'], layer_cfg['n'], \
layer_cfg['s'], layer_cfg['g'], layer_cfg['p'], layer_cfg['f'], \
layer_cfg['a']
assert layerid == layer_id, "Error in _make_layer: layerid {} does not equal layer_id {}".format(layerid, layer_id)
self.Nx = nx = self.Nx // patch_size
self.Ny = ny = self.Ny // patch_size
seq_len = nx * ny + nglo
self.attn_args['nglo'] = nglo
self.patch_embed_args['nglo'] = nglo
self.attn_args['num_feats'] = num_feats # shared for linformer and performer
self.attn_args['rratio'] = num_feats # srformer reuses this parameter
self.attn_args['w'] = num_feats # longformer reuses this parameter
if is_sparse_attn == 0:
self.attn_args['attn_type'] = 'full'
# patch embedding
layers = [
PatchEmbed(patch_size, nx, ny, in_chans=in_dim, embed_dim=dim, ape=ape,
**self.patch_embed_args)
]
for dpr in dprs:
layers.append(AttnBlock(
dim, num_heads, drop_path=dpr, seq_len=seq_len, rpe=not ape,
wx=nx, wy=ny,
**self.attn_args
))
layers.append(MlpBlock(dim, drop_path=dpr, **self.mlp_args))
return nn.Sequential(*layers)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay(self):
no_decay = {'pos_embed', 'cls_token',
'norm.weight', 'norm.bias',
'norm_embed', 'head.bias',
'relative_position'}
return no_decay
def get_classifier(self):
return self.head
def forward(self, x):
B = x.shape[0]
outputs = []
x, nx, ny = self.layer1((x, None, None))
if "layer1" in self._out_features:
outputs.append(
x[:, self.Nglos[0]:].transpose(-2, -1).reshape(B, -1, nx, ny)
)
x = x[:, self.Nglos[0]:].transpose(-2, -1).reshape(B, -1, nx, ny)
x, nx, ny = self.layer2((x, nx, ny))
if "layer2" in self._out_features:
outputs.append(
x[:, self.Nglos[1]:].transpose(-2, -1).reshape(B, -1, nx, ny)
)
x = x[:, self.Nglos[1]:].transpose(-2, -1).reshape(B, -1, nx, ny)
x, nx, ny = self.layer3((x, nx, ny))
if "layer3" in self._out_features:
outputs.append(
x[:, self.Nglos[2]:].transpose(-2, -1).reshape(B, -1, nx, ny)
)
if self.layer4 is not None:
x = x[:, self.Nglos[2]:].transpose(-2, -1).reshape(B, -1, nx, ny)
x, nx, ny = self.layer4((x, nx, ny))
if "layer4" in self._out_features:
outputs.append(
x[:, self.Nglos[3]:].transpose(-2, -1).reshape(B, -1, nx, ny)
)
return outputs
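# Shape note (illustrative comment): stage strides multiply, so for a 4-stage arch with
# patch strides p = 4, 2, 2, 2 and out_features = ["layer4"], a (B, 3, 512, 512) input
# yields a single stride-32 feature map of shape (B, d4, 16, 16), where d4 is the last
# stage's 'd' value (exposed as self.out_planes).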
def build_msvit_backbone(cfg):
args = dict(
img_size=cfg.INPUT.MAX_SIZE_TRAIN,
drop_rate=cfg.MODEL.TRANSFORMER.DROP,
drop_path_rate=cfg.MODEL.TRANSFORMER.DROP_PATH,
norm_embed=cfg.MODEL.TRANSFORMER.NORM_EMBED,
avg_pool=cfg.MODEL.TRANSFORMER.AVG_POOL,
freeze_at=cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT,
out_features=cfg.MODEL.TRANSFORMER.OUT_FEATURES
)
args['arch'] = cfg.MODEL.TRANSFORMER.MSVIT.ARCH
args['sharew'] = cfg.MODEL.TRANSFORMER.MSVIT.SHARE_W
args['attn_type'] = cfg.MODEL.TRANSFORMER.MSVIT.ATTN_TYPE
args['share_kv'] = cfg.MODEL.TRANSFORMER.MSVIT.SHARE_KV
args['only_glo'] = cfg.MODEL.TRANSFORMER.MSVIT.ONLY_GLOBAL
args['sw_exact'] = cfg.MODEL.TRANSFORMER.MSVIT.SW_EXACT
args['ln_eps'] = cfg.MODEL.TRANSFORMER.MSVIT.LN_EPS
args['mode'] = cfg.MODEL.TRANSFORMER.MSVIT.MODE
return MsViT(**args)
class ViTHead(nn.Module):
def __init__(
self,
in_dim, layer_cfgstr, input_size=14,
qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
norm_embed=False, **args
):
super(ViTHead, self).__init__()
if 'ln_eps' in args:
ln_eps = args['ln_eps']
self.norm_layer = partial(nn.LayerNorm, eps=ln_eps)
logging.info("Customized LayerNorm EPS: {}".format(ln_eps))
else:
self.norm_layer = norm_layer
self.drop_path_rate = drop_path_rate
self.attn_args = dict({
'attn_type': 'full', # full attention for head
'qkv_bias': qkv_bias,
'qk_scale': qk_scale,
'drop': drop_rate,
'attn_drop': attn_drop_rate,
'norm_layer': norm_layer,
'drop_path': drop_path_rate,
})
self.patch_embed_args = dict({
'norm_layer': norm_layer,
'norm_embed': norm_embed,
'drop_rate': drop_rate,
})
self.mlp_args = dict({
'mlp_ratio': 4.0,
'norm_layer': norm_layer,
'act_layer': nn.GELU,
'drop': drop_rate,
'drop_path': drop_path_rate,
})
layer_cfg = parse_arch(layer_cfgstr)
layer_id, num_heads, dim, num_block, is_sparse_attn, nglo, patch_size, num_feats, ape \
= layer_cfg['l'], layer_cfg['h'], layer_cfg['d'], layer_cfg['n'], \
layer_cfg['s'], layer_cfg['g'], layer_cfg['p'], layer_cfg['f'], \
layer_cfg['a']
self.input_size = input_size
self.nglo = nglo
assert input_size % patch_size == 0, "Input size is not divisible by the patch size in ViTHead!"
assert nglo == 0, "Number of global tokens in ViTHead is not 0!"
nx = self.input_size // patch_size
ny = self.input_size // patch_size
seq_len = nx * ny + nglo
# patch embedding
layers = [
PatchEmbed(patch_size, nx, ny, in_chans=in_dim, embed_dim=dim,
ape=ape, nglo=nglo, **self.patch_embed_args)
]
for block_id in range(num_block):
layers.append(AttnBlock(
dim, num_heads, seq_len=seq_len, rpe=not ape,
wx=nx, wy=ny, nglo=nglo,
**self.attn_args
))
layers.append(MlpBlock(dim, **self.mlp_args))
self.layer4 = nn.Sequential(*layers)
self.norm = norm_layer(dim)
self.out_channels = dim
def forward(self, x):
B, C, nx, ny = x.shape
assert nx == ny == self.input_size, "Input size does not match the initialized size in ViTHead!"
nglo = self.nglo
x, nx, ny = self.layer4((x, None, None))
x = self.norm(x)
x = x[:, nglo:].transpose(-2, -1).reshape(B, -1, nx, ny)
return x

View file

@ -0,0 +1,366 @@
# Copyright (c) 2021 Microsoft Corporation. Licensed under the MIT license.
# Written by Pengchuan Zhang, penzhan@microsoft.com
from functools import lru_cache
import torch
from torch import einsum
from torch.cuda.amp import autocast
class SlidingChunk2D(torch.autograd.Function):
"""
Class to encapsulate for sliding chunk implementation of vision longformer
"""
mode_dict = {
1: (1, 1), # -1, -1
2: (1, 0), # -1, 0
3: (1, -1), # -1, 1
4: (0, 1), # 0, -1
5: (0, -1), # 0, 1
6: (-1, 1), # 1, -1
7: (-1, 0), # 1, 0
8: (-1, -1), # 1, 1
}
@staticmethod
def slidingchunk_qk(q_img: torch.Tensor, k_img: torch.Tensor, mode: int):
'''
q_img x k_img = attn11 ==> Useful for query x key = attention_scores
The cyclic padding strategy
q_img, k_img: (B * H, M, mx, my, W**2)
attn11 (B*H, mx, my, W**2, 9*W**2), mode=0
(B*H, mx, my, W**2, W**2), mode=-1
(B*H, mx, my, W**2, 2*W**2), mode=i>0
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
'''
if mode == 0:
return torch.cat([
# -1, -1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=(1, 1), dims=(2, 3))),
# -1, 0
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=1, dims=2)),
# -1, 1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=(1, -1), dims=(2, 3))),
# 0, -1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=1, dims=3)),
# 0, 0
einsum('b c m n l, b c m n t -> b m n l t', q_img,
k_img),
# 0, 1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=-1, dims=3)),
# 1, -1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=(-1, 1), dims=(2, 3))),
# 1, 0
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=-1, dims=2)),
# 1, 1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=(-1, -1), dims=(2, 3))),
], dim=-1)
elif mode == -1:
return einsum(
'b c m n l, b c m n t -> b m n l t', q_img, k_img
) * 1.0
else:
shift = SlidingChunk2D.mode_dict[mode]
return torch.cat([
# 0, 0
einsum('b c m n l, b c m n t -> b m n l t', q_img, k_img),
# x, x
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=shift, dims=(2, 3))),
], dim=-1)
@staticmethod
def slidingchunk_av(attn: torch.Tensor, v_img: torch.Tensor, mode: int):
'''
attn x v_img = x ==> Useful for attn x value = context
The cyclic padding strategy
v_img, context: (B * H, M, mx, my, W**2)
attn (B*H, mx, my, W**2, 9*W**2), mode=0
(B*H, mx, my, W**2, W**2), mode=-1
(B*H, mx, my, W**2, 2*W**2), mode=i>0
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
'''
w2 = v_img.shape[-1]
if mode == 0:
attnn1n1, attnn10, attnn11, attn0n1, attn00, attn01, attn1n1, attn10, attn11 = torch.split(
attn, w2, dim=-1
)
elif mode == -1:
attn00 = attn
else:
attn00, attnxx = torch.split(
attn, w2, dim=-1
)
output = einsum('b m n l t, b c m n t -> b c m n l', attn00, v_img) # 0,0
if mode == 0:
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnn1n1,
torch.roll(v_img, shifts=(1, 1), dims=(2, 3))) # -1,-1
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnn10,
torch.roll(v_img, shifts=1, dims=2)) # -1,0
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnn11,
torch.roll(v_img, shifts=(1, -1), dims=(2, 3))) # -1,1
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn0n1,
torch.roll(v_img, shifts=1, dims=3)) # 0,-1
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn01,
torch.roll(v_img, shifts=-1, dims=3)) # 0,1
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn1n1,
torch.roll(v_img, shifts=(-1, 1), dims=(2, 3))) # 1,-1
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn10,
torch.roll(v_img, shifts=-1, dims=2)) # 1,0
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn11,
torch.roll(v_img, shifts=(-1, -1), dims=(2, 3))) # 1,1
elif mode > 0:
shift = SlidingChunk2D.mode_dict[mode]
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnxx,
torch.roll(v_img, shifts=shift, dims=(2, 3)))  # the sampled neighbor block
else:
output = output * 1.0
return output
@staticmethod
def slidingchunk_agrad(attn: torch.Tensor, grad_x: torch.Tensor, mode: int):
'''
attn.t() x grad_x = grad_v ==> Useful for backpropagating the output gradient to the value
The cyclic padding strategy
grad_x, grad_v: (B * H, M, mx, my, W**2)
attn (B*H, mx, my, W**2, 9*W**2), mode=0
(B*H, mx, my, W**2, W**2), mode=-1
(B*H, mx, my, W**2, 2*W**2), mode=i>0
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
'''
w2 = grad_x.shape[-1]
if mode == 0:
attnn1n1, attnn10, attnn11, attn0n1, attn00, attn01, attn1n1, attn10, attn11 = torch.split(
attn, w2, dim=-1
)
elif mode == -1:
attn00 = attn
else:
attn00, attnxx = torch.split(
attn, w2, dim=-1
)
# 0,0
output = einsum('b m n l t, b c m n l -> b c m n t', attn00, grad_x)
if mode == 0:
# -1,-1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attnn1n1, grad_x),
shifts=(-1, -1), dims=(2, 3))
# -1,0
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attnn10, grad_x),
shifts=-1, dims=2)
# -1,1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attnn11, grad_x),
shifts=(-1, 1), dims=(2, 3))
# 0,-1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attn0n1, grad_x),
shifts=-1, dims=3)
# 0,1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attn01, grad_x),
shifts=1, dims=3)
# 1,-1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attn1n1, grad_x),
shifts=(1, -1), dims=(2, 3))
# 1,0
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attn10, grad_x),
shifts=1, dims=2)
# 1,1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attn11, grad_x),
shifts=(1, 1), dims=(2, 3))
elif mode > 0:
shift = SlidingChunk2D.mode_dict[mode]
shift = (-shift[0], -shift[1])
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attnxx, grad_x),
shifts=shift, dims=(2, 3))
else:
output = output * 1.0
return output
@staticmethod
@autocast() # comment this out if AMP is not used
def forward(ctx, t1: torch.Tensor, t2: torch.Tensor,
is_t1_diagonaled: bool = False, mode: int = 0) -> torch.Tensor:
"""Compuates sliding chunk mm of t1 and t2.
args:
t1: torch.Tensor = (B * H, M, mx, my, W**2) if is_t1_diagonaled = false,
= (B*H, mx, my, W**2, 9*W**2) if is_t1_diagonaled = true, mode=0.
= (B*H, mx, my, W**2, W**2) if is_t1_diagonaled = true, mode=-1.
= (B*H, mx, my, W**2, 2*W**2) if is_t1_diagonaled = true, mode=i>0.
t2: torch.Tensor = (B * H, M, mx, my, W**2). This is always a
non-diagonaled tensor, e.g. `key_layer` or `value_layer`
is_t1_diagonaled: is t1 a diagonaled or a regular tensor
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
returns:
is_t1_diagonaled = true:
torch.Tensor = (B * H, M, mx, my, W**2)
mode=0, is_t1_diagonaled = false:
torch.Tensor = (B*H, mx, my, W**2, 9*W**2)
mode=-1, is_t1_diagonaled = false:
torch.Tensor = (B*H, mx, my, W**2, W**2)
mode=i>0, is_t1_diagonaled = false:
torch.Tensor = (B*H, mx, my, W**2, 2*W**2)
"""
ctx.save_for_backward(t1, t2)
ctx.is_t1_diagonaled = is_t1_diagonaled
ctx.mode = mode
if is_t1_diagonaled:
return SlidingChunk2D.slidingchunk_av(t1, t2, mode)
else:
return SlidingChunk2D.slidingchunk_qk(t1, t2, mode)
@staticmethod
@autocast() # comment this out if AMP is not used
def backward(ctx, grad_output):
t1, t2 = ctx.saved_tensors
is_t1_diagonaled = ctx.is_t1_diagonaled
mode = ctx.mode
if is_t1_diagonaled:
grad_t1 = SlidingChunk2D.slidingchunk_qk(grad_output, t2, mode)
grad_t2 = SlidingChunk2D.slidingchunk_agrad(t1, grad_output, mode)
else:
grad_t1 = SlidingChunk2D.slidingchunk_av(grad_output, t2, mode)
grad_t2 = SlidingChunk2D.slidingchunk_agrad(grad_output, t1, mode)
return grad_t1, grad_t2, None, None
@lru_cache()
def _get_invalid_locations_mask_cyclic(nx: int, ny: int, padx: int, pady: int,
w: int, device: str):
w2 = w ** 2
mask = torch.BoolTensor([
[
(i // ny + (j // w2) // 3 == nx and
(nx - 1) * w + (j % w2) // w >= nx * w - padx) or
(i % ny + (j // w2) % 3 == ny and
(ny - 1) * w + (j % w2) % w >= ny * w - pady)
for j in range(9 * w2)
]
for i in range(nx * ny)
], device='cpu')
# We should count the w2 in the query here
num_invalid = w2 * mask.sum()
return mask.to(device), num_invalid.to(device)
@lru_cache()
def _get_invalid_locations_mask_zero(nx: int, ny: int, padx: int, pady: int,
w: int, device: str):
w2 = w ** 2
mask = torch.BoolTensor([
[
i // ny + (j // w2) // 3 - 1 < 0 or
i // ny + (j // w2) // 3 - 1 >= nx or
(i // ny + (j // w2) // 3 - 1) * w + (j % w2) // w >= nx * w - padx or
i % ny + (j // w2) % 3 - 1 < 0 or
i % ny + (j // w2) % 3 - 1 >= ny or
(i % ny + (j // w2) % 3 - 1) * w + (j % w2) % w >= ny * w - pady
for j in range(9 * w2)
]
for i in range(nx * ny)
], device='cpu')
# We should count the w2 in the query here
num_invalid = w2 * mask.sum()
return mask.to(device), num_invalid.to(device)
@lru_cache()
def _get_invalid_locations_mask_exact(nx: int, ny: int, padx: int, pady: int,
w: int, device: str):
w2 = w ** 2
nx_max = nx * w - 1 - padx
ny_max = ny * w - 1 - pady
mask = torch.BoolTensor([
[
[
(i // ny + (j // w2) // 3 - 1) * w + (j % w2) // w < max(0, (
i // ny - 1) * w + l // w) or
(i // ny + (j // w2) // 3 - 1) * w + (j % w2) // w > min(
nx_max, (i // ny + 1) * w + l // w) or
(i % ny + (j // w2) % 3 - 1) * w + (j % w2) % w < max(0, (
i % ny - 1) * w + l % w) or
(i % ny + (j // w2) % 3 - 1) * w + (j % w2) % w > min(
ny_max, (i % ny + 1) * w + l % w)
for j in range(9 * w2)
]
for l in range(w2)
]
for i in range(nx * ny)
], device='cpu')
num_invalid = mask.sum()
return mask.to(device), num_invalid.to(device)
def mask_invalid_locations(input_tensor: torch.Tensor, nx: int, ny: int,
padx: int, pady: int, w: int,
exact: int, mode: int = 0) -> torch.Tensor:
"""exact
1: exact sliding window
0: blockwise sliding chunk with zero padding
-1: blockwise sliding chunk with cyclic padding
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
"""
w2 = w ** 2
if exact == 1 and mode == 0:
mask, num_invalid = _get_invalid_locations_mask_exact(
nx, ny, padx, pady, w, input_tensor.device)
mask = mask.view(1, nx, ny, w2, -1).expand(input_tensor.size())
else:
if exact == 0:
mask, num_invalid = _get_invalid_locations_mask_zero(
nx, ny, padx, pady, w, input_tensor.device)
elif exact == -1:
mask, num_invalid = _get_invalid_locations_mask_cyclic(
nx, ny, padx, pady, w, input_tensor.device)
else:
raise ValueError("longsc exact should be in [0,1,-1]!")
if mode == -1:
mask = mask[:, 4 * w2:5 * w2]
num_invalid = w2 * mask.sum()
elif mode > 0:
chunk_id = mode if mode > 4 else mode - 1
mask = torch.cat([
mask[:, 4 * w2:5 * w2],
mask[:, chunk_id * w2:(chunk_id+1) * w2],
], dim=-1)
num_invalid = w2 * mask.sum()
mask = mask.view(1, nx, ny, 1, -1).expand(input_tensor.size())
input_tensor.masked_fill_(mask, -float('inf'))
return num_invalid
def slidingchunk_2dautograd(t1: torch.Tensor, t2: torch.Tensor,
is_t1_diagonaled: bool = False, mode: int = 0) -> torch.Tensor:
if is_t1_diagonaled:
return SlidingChunk2D.slidingchunk_av(t1, t2, mode)
else:
return SlidingChunk2D.slidingchunk_qk(t1, t2, mode)
slidingchunk_2d = SlidingChunk2D.apply
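# A minimal self-contained sketch of how the pieces above fit together (toy sizes;
# the exact call sequence inside the attention layer is an assumption):
# 2 = batch*heads, 8 = head dim, a 4x4 grid of chunks, window w=3 (so W**2 = 9).
if __name__ == "__main__":
    q = torch.randn(2, 8, 4, 4, 9)
    k = torch.randn(2, 8, 4, 4, 9)
    v = torch.randn(2, 8, 4, 4, 9)
    attn = slidingchunk_2dautograd(q, k, False, 0)   # scores: (2, 4, 4, 9, 9*9)
    mask_invalid_locations(attn, 4, 4, 0, 0, 3, exact=0, mode=0)  # -inf on padded keys
    attn = attn.softmax(dim=-1)
    out = slidingchunk_2dautograd(attn, v, True, 0)  # context: (2, 8, 4, 4, 9)
    # slidingchunk_2d (the autograd.Function) takes the same arguments and uses the
    # hand-written backward above instead of relying on autograd.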

Просмотреть файл

@ -35,8 +35,8 @@ class BalancedPositiveNegativeSampler(object):
pos_idx = []
neg_idx = []
for matched_idxs_per_image in matched_idxs:
positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1)
negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1)
positive = torch.nonzero(matched_idxs_per_image >= 1, as_tuple=False).squeeze(1)
negative = torch.nonzero(matched_idxs_per_image == 0, as_tuple=False).squeeze(1)
num_pos = int(self.batch_size_per_image * self.positive_fraction)
# protect against not enough positive examples

Просмотреть файл

@ -6,6 +6,7 @@ Implements the Generalized R-CNN framework
import torch
from torch import nn
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.image_list import to_image_list
from ..backbone import build_backbone
@ -29,6 +30,7 @@ class GeneralizedRCNN(nn.Module):
self.backbone = build_backbone(cfg)
self.rpn = build_rpn(cfg, self.backbone.out_channels)
self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels)
self.force_boxes = cfg.MODEL.RPN.FORCE_BOXES
def forward(self, images, targets=None):
"""
@ -45,9 +47,30 @@ class GeneralizedRCNN(nn.Module):
"""
if self.training and targets is None:
raise ValueError("In training mode, targets should be passed")
if self.force_boxes and targets is None:
# note: targets cannot be None here, but may contain 0 boxes.
raise ValueError("In force_boxes setting, targets should be passed")
images = to_image_list(images)
features = self.backbone(images.tensors)
proposals, proposal_losses = self.rpn(images, features, targets)
if targets:
targets = [target.to(self.device)
for target in targets if target is not None]
if self.force_boxes:
proposals = [BoxList(target.bbox, target.size, target.mode)
for target in targets]
if self.training:
# note: we still need to compute a loss over all rpn
# named parameters, otherwise distributed training
# raises an unused-parameters error.
null_loss = 0
for key, param in self.rpn.named_parameters():
null_loss += 0.0 * param.sum()
proposal_losses = {'rpn_null_loss': null_loss}
else:
proposals, proposal_losses = self.rpn(images, features, targets)
if self.roi_heads:
x, result, detector_losses = self.roi_heads(features, proposals, targets)
else:

Просмотреть файл

@ -101,7 +101,7 @@ class Matcher(object):
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
# Find highest quality match available, even if it is low, including ties
gt_pred_pairs_of_highest_quality = torch.nonzero(
match_quality_matrix == highest_quality_foreach_gt[:, None]
match_quality_matrix == highest_quality_foreach_gt[:, None], as_tuple=False
)
# Example gt_pred_pairs_of_highest_quality:
# tensor([[ 0, 39796],

Просмотреть файл

@ -114,7 +114,7 @@ class Pooler(nn.Module):
device=device,
)
for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)):
idx_in_level = torch.nonzero(levels == level).squeeze(1)
idx_in_level = torch.nonzero(levels == level, as_tuple=False).squeeze(1)
rois_per_level = rois[idx_in_level]
result[idx_in_level] = pooler(per_level_feature, rois_per_level).to(dtype)

Просмотреть файл

@ -201,7 +201,7 @@ class PostProcessor(nn.Module):
inds_all = scores > self.score_thresh
boxlist_empty = self.prepare_empty_boxlist(boxlist)
for j in range(1, num_classes):
inds = inds_all[:, j].nonzero().squeeze(1)
inds = inds_all[:, j].nonzero(as_tuple=False).squeeze(1)
if len(inds)>0:
scores_j = scores[inds, j]
@ -239,7 +239,7 @@ class PostProcessor(nn.Module):
cls_scores.cpu(), number_of_detections - self.detections_per_img + 1
)
keep = cls_scores >= image_thresh.item()
keep = torch.nonzero(keep).squeeze(1)
keep = torch.nonzero(keep, as_tuple=False).squeeze(1)
result = result[keep]
return result
@ -273,7 +273,7 @@ class PostProcessor(nn.Module):
# filter duplicate boxes
scores_pre, labels_pre = dists_all.max(1)
inds_pre = scores_pre.nonzero()
inds_pre = scores_pre.nonzero(as_tuple=False)
assert inds_pre.dim() != 0
inds_pre = inds_pre.squeeze(1)
@ -331,7 +331,7 @@ class PostProcessor(nn.Module):
hs = (y2 - y1).squeeze(1)
keep = (
(ws >= 0) & (hs >= 0) & (scores > self.score_thresh * 0.01)
).nonzero().squeeze(1)
).nonzero(as_tuple=False).squeeze(1)
del ws, hs
# apply nms to the previous low-thresholded results

Просмотреть файл

@ -118,7 +118,7 @@ class FastRCNNLossComputation(object):
for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
zip(sampled_pos_inds, sampled_neg_inds)
):
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img, as_tuple=False).squeeze(1)
proposals_per_image = proposals[img_idx][img_sampled_inds]
proposals[img_idx] = proposals_per_image
@ -182,7 +182,7 @@ class FastRCNNLossComputation(object):
# get indices that correspond to the regression targets for
# the corresponding ground truth labels, to be used with
# advanced indexing
sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1)
sampled_pos_inds_subset = torch.nonzero(labels > 0, as_tuple=False).squeeze(1)
labels_pos = labels[sampled_pos_inds_subset]
if self.cls_agnostic_bbox_reg:
map_inds = torch.tensor([4, 5, 6, 7], device=device)

Просмотреть файл

@ -9,6 +9,7 @@ from maskrcnn_benchmark.modeling.backbone import resnet
from maskrcnn_benchmark.modeling.poolers import Pooler
from maskrcnn_benchmark.modeling.make_layers import group_norm
from maskrcnn_benchmark.modeling.make_layers import make_fc
from maskrcnn_benchmark.modeling.backbone.msvit import ViTHead
@registry.ROI_BOX_FEATURE_EXTRACTORS.register("ResNet50Conv5ROIFeatureExtractor")
@ -158,6 +159,41 @@ class FPNXconv1fcFeatureExtractor(nn.Module):
return x
@registry.ROI_BOX_FEATURE_EXTRACTORS.register("ViTHeadFeatureExtractor")
class ViTHeadFeatureExtractor(nn.Module):
def __init__(self, config, in_channels):
super(ViTHeadFeatureExtractor, self).__init__()
resolution = config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
scales = config.MODEL.ROI_BOX_HEAD.POOLER_SCALES
sampling_ratio = config.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
pooler = Pooler(
output_size=(resolution, resolution),
scales=scales,
sampling_ratio=sampling_ratio,
)
# VIT head
args = dict(
input_size=config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION,
drop_rate=config.MODEL.TRANSFORMER.DROP,
drop_path_rate=config.MODEL.TRANSFORMER.DROP_PATH,
norm_embed=config.MODEL.TRANSFORMER.NORM_EMBED,
layer_cfgstr=config.MODEL.TRANSFORMER.VITHEADARCH,
ln_eps=config.MODEL.TRANSFORMER.MSVIT.LN_EPS,
)
head = ViTHead(in_dim=in_channels, **args)
self.pooler = pooler
self.head = head
self.out_channels = head.out_channels
def forward(self, x, proposals):
x = self.pooler(x, proposals)
x = self.head(x)
return x
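# This extractor is selected by setting
#   MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
# (and the matching ROI_ATTRIBUTE_HEAD entry) in the config, as the ViL-C4 yaml
# files in this commit do; make_roi_box_feature_extractor below then builds it with
# the backbone's out_channels, and self.out_channels feeds the box predictor.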
def make_roi_box_feature_extractor(cfg, in_channels):
func = registry.ROI_BOX_FEATURE_EXTRACTORS[
cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR

Просмотреть файл

@ -135,7 +135,7 @@ class KeypointRCNNLossComputation(object):
for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
zip(sampled_pos_inds, sampled_neg_inds)
):
img_sampled_inds = torch.nonzero(pos_inds_img).squeeze(1)
img_sampled_inds = torch.nonzero(pos_inds_img, as_tuple=False).squeeze(1)
proposals_per_image = proposals[img_idx][img_sampled_inds]
proposals[img_idx] = proposals_per_image
@ -155,7 +155,7 @@ class KeypointRCNNLossComputation(object):
keypoint_targets = cat(heatmaps, dim=0)
valid = cat(valid, dim=0).to(dtype=torch.bool)
valid = torch.nonzero(valid).squeeze(1)
valid = torch.nonzero(valid, as_tuple=False).squeeze(1)
# torch.mean (in binary_cross_entropy_with_logits) doesn't
# accept empty tensors, so handle it separately

Просмотреть файл

@ -83,7 +83,7 @@ class MaskRCNNLossComputation(object):
labels_per_image[neg_inds] = 0
# mask scores are only computed on positive samples
positive_inds = torch.nonzero(labels_per_image > 0).squeeze(1)
positive_inds = torch.nonzero(labels_per_image > 0, as_tuple=False).squeeze(1)
segmentation_masks = matched_targets.get_field("masks")
segmentation_masks = segmentation_masks[positive_inds]
@ -114,7 +114,7 @@ class MaskRCNNLossComputation(object):
labels = cat(labels, dim=0)
mask_targets = cat(mask_targets, dim=0)
positive_inds = torch.nonzero(labels > 0).squeeze(1)
positive_inds = torch.nonzero(labels > 0, as_tuple=False).squeeze(1)
labels_pos = labels[positive_inds]
# torch.mean (in binary_cross_entropy_with_logits) doesn't

Просмотреть файл

@ -27,7 +27,7 @@ def keep_only_positive_boxes(boxes):
for boxes_per_image in boxes:
labels = boxes_per_image.get_field("labels")
inds_mask = labels > 0
inds = inds_mask.nonzero().squeeze(1)
inds = inds_mask.nonzero(as_tuple=False).squeeze(1)
positive_boxes.append(boxes_per_image[inds])
positive_inds.append(inds_mask)
return positive_boxes, positive_inds

Просмотреть файл

@ -104,8 +104,8 @@ class RPNLossComputation(object):
anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors]
labels, regression_targets = self.prepare_targets(anchors, targets)
sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)
sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0), as_tuple=False).squeeze(1)
sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0), as_tuple=False).squeeze(1)
sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)

Просмотреть файл

@ -103,7 +103,7 @@ class RetinaNetPostProcessor(RPNPostProcessor):
per_box_cls.topk(per_pre_nms_top_n, sorted=False)
per_candidate_nonzeros = \
per_candidate_inds.nonzero()[top_k_indices, :]
per_candidate_inds.nonzero(as_tuple=False)[top_k_indices, :]
per_box_loc = per_candidate_nonzeros[:, 0]
per_class = per_candidate_nonzeros[:, 1]
@ -138,7 +138,7 @@ class RetinaNetPostProcessor(RPNPostProcessor):
result = []
# skip the background
for j in range(1, self.num_classes):
inds = (labels == j).nonzero().view(-1)
inds = (labels == j).nonzero(as_tuple=False).view(-1)
scores_j = scores[inds]
boxes_j = boxes[inds, :].view(-1, 4)
@ -167,7 +167,7 @@ class RetinaNetPostProcessor(RPNPostProcessor):
number_of_detections - self.fpn_post_nms_top_n + 1
)
keep = cls_scores >= image_thresh.item()
keep = torch.nonzero(keep).squeeze(1)
keep = torch.nonzero(keep, as_tuple=False).squeeze(1)
result = result[keep]
results.append(result)
return results

Просмотреть файл

@ -61,7 +61,7 @@ class RetinaNetLossComputation(RPNLossComputation):
labels = torch.cat(labels, dim=0)
regression_targets = torch.cat(regression_targets, dim=0)
pos_inds = torch.nonzero(labels > 0).squeeze(1)
pos_inds = torch.nonzero(labels > 0, as_tuple=False).squeeze(1)
retinanet_regression_loss = smooth_l1_loss(
box_regression[pos_inds],

Просмотреть файл

@ -1,4 +1,4 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from .build import make_optimizer
from .build import make_optimizer, make_optimizer_d2
from .build import make_lr_scheduler
from .lr_scheduler import WarmupMultiStepLR

Просмотреть файл

@ -1,4 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import itertools
import torch
from .lr_scheduler import WarmupMultiStepLR
@ -20,6 +21,65 @@ def make_optimizer(cfg, model):
return optimizer
def make_optimizer_d2(cfg, model):
# default no decay parameters for resnets
no_decay = ['bn.bias', 'bn.weight', 'bn1.bias', 'bn1.weight',
'bn2.bias', 'bn2.weight', 'bn3.bias', 'bn3.weight']
if hasattr(model.backbone.body, 'no_weight_decay'):
no_decay = list(model.backbone.body.no_weight_decay())
params = []
memo = set()
for key, value in model.named_parameters(recurse=True):
if not value.requires_grad:
continue
# Avoid duplicating parameters
if value in memo:
continue
memo.add(value)
lr = cfg.SOLVER.BASE_LR
weight_decay = cfg.SOLVER.WEIGHT_DECAY
if "bias" in key:
lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
if any(nd in key for nd in no_decay):
weight_decay = 0.0
params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]
def maybe_add_full_model_gradient_clipping(optim): # optim: the optimizer class
# detectron2 doesn't have full model gradient clipping now
clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
enable = (
cfg.SOLVER.CLIP_GRADIENTS.ENABLED
and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
and clip_norm_val > 0.0
)
class FullModelGradientClippingOptimizer(optim):
def step(self, closure=None):
all_params = itertools.chain(*[x["params"] for x in self.param_groups])
torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
super().step(closure=closure)
return FullModelGradientClippingOptimizer if enable else optim
optimizer_type = cfg.SOLVER.OPTIMIZER
if optimizer_type == "SGD":
optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
)
elif optimizer_type == "ADAMW":
optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
params, cfg.SOLVER.BASE_LR
)
else:
raise NotImplementedError(f"no optimizer type {optimizer_type}")
return optimizer
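# A standalone sketch of the clipping pattern used above (hypothetical toy model,
# SGD with a global-norm clip of 1.0): step() clips across *all* parameter groups
# before the update.
#   import itertools, torch
#   class ClippedSGD(torch.optim.SGD):
#       def step(self, closure=None):
#           all_params = itertools.chain(*[g["params"] for g in self.param_groups])
#           torch.nn.utils.clip_grad_norm_(all_params, max_norm=1.0)
#           super().step(closure=closure)
#   model = torch.nn.Linear(10, 2)
#   opt = ClippedSGD(model.parameters(), lr=0.01, momentum=0.9)
#   model(torch.randn(4, 10)).sum().backward()
#   opt.step()  # gradients are clipped to global norm 1.0 before the parameter update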
def make_lr_scheduler(cfg, optimizer):
return WarmupMultiStepLR(
optimizer,

Просмотреть файл

@ -45,7 +45,7 @@ def remove_small_boxes(boxlist, min_size):
_, _, ws, hs = xywh_boxes.unbind(dim=1)
keep = (
(ws >= min_size) & (hs >= min_size)
).nonzero().squeeze(1)
).nonzero(as_tuple=False).squeeze(1)
return boxlist[keep]

Просмотреть файл

@ -457,7 +457,7 @@ class PolygonList(object):
# advanced indexing on a single dimension
selected_polygons = []
if isinstance(item, torch.Tensor) and item.dtype == torch.bool:
item = item.nonzero()
item = item.nonzero(as_tuple=False)
item = item.squeeze(1) if item.numel() > 0 else item
item = item.tolist()
for i in item:

Просмотреть файл

@ -0,0 +1,14 @@
from contextlib import contextmanager
@contextmanager
def nullcontext(enter_result=None, **kwargs):
yield enter_result
try:
from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd
except ImportError:
print('[Warning] torch.cuda.amp was not found; automatic mixed precision is disabled!')
GradScaler = nullcontext
autocast = nullcontext
custom_fwd = nullcontext
custom_bwd = nullcontext
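# A sketch of the standard torch.cuda.amp training step these aliases support
# (assumes the import above succeeded and a CUDA device is available; toy model):
#   model = torch.nn.Linear(10, 2).cuda()
#   optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
#   scaler = GradScaler()
#   with autocast():                      # forward pass in mixed precision
#       loss = model(torch.randn(4, 10, device="cuda")).sum()
#   scaler.scale(loss).backward()         # scale the loss to avoid fp16 underflow
#   scaler.step(optimizer)
#   scaler.update()
# With the nullcontext fallback, autocast() degrades to a harmless no-op.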

Просмотреть файл

@ -1,13 +1,38 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from collections import OrderedDict
import logging
import math
import torch
from maskrcnn_benchmark.utils.imports import import_file
def align_and_update_state_dicts(model_state_dict, loaded_state_dict):
def resize_pos_embed_1d(posemb, shape_new):
# Rescale the grid of position embeddings when loading from state_dict.
ntok_old = posemb.shape[1]
if ntok_old > 1:
ntok_new = shape_new[1]
posemb_grid = posemb.permute(0, 2, 1).unsqueeze(dim=-1)
posemb_grid = torch.nn.functional.interpolate(posemb_grid, size=[ntok_new, 1], mode='bilinear')
posemb_grid = posemb_grid.squeeze(dim=-1).permute(0, 2, 1)
posemb = posemb_grid
return posemb
def resize_pos_embed_2d(posemb, shape_new):
# Rescale the grid of position embeddings when loading from state_dict. Adapted from
# https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
ntok_new = shape_new[0]
gs_old = int(math.sqrt(len(posemb))) # 2 * w - 1
gs_new = int(math.sqrt(ntok_new)) # 2 * w - 1
posemb_grid = posemb.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
posemb_grid = torch.nn.functional.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bilinear')
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(gs_new * gs_new, -1)
return posemb_grid
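# Shape sketch for the two helpers above (illustrative sizes): a 196-token 1-D
# embedding resized to 576 tokens, and a 13x13 relative-position table (w=7,
# (2*7-1)**2 = 169 entries) resized to 29x29 (w=15, 841 entries):
#   old_1d = torch.randn(1, 196, 384)                 # (1, tokens, dim)
#   resize_pos_embed_1d(old_1d, (1, 576, 384)).shape  # -> (1, 576, 384)
#   old_2d = torch.randn(169, 12)                     # (table entries, num_heads)
#   resize_pos_embed_2d(old_2d, (841, 12)).shape      # -> (841, 12)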
def align_and_update_state_dicts(model_state_dict, loaded_state_dict, skip_unmatched_layers=True):
"""
Strategy: suppose that the models that we will create will have prefixes appended
to each of its keys, for example due to an extra level of nesting that the original
@ -41,11 +66,47 @@ def align_and_update_state_dicts(model_state_dict, loaded_state_dict):
max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1
log_str_template = "{: <{}} loaded from {: <{}} of shape {}"
logger = logging.getLogger(__name__)
# print out no match
uninitialized_keys = [current_keys[idx_new] for idx_new, idx_old in enumerate(idxs.tolist()) if idx_old == -1]
logger.info("Parameters not initialized from checkpoint: {}\n".format(
','.join(uninitialized_keys)
))
for idx_new, idx_old in enumerate(idxs.tolist()):
if idx_old == -1:
continue
key = current_keys[idx_new]
key_old = loaded_keys[idx_old]
if model_state_dict[key].shape != loaded_state_dict[
key_old].shape and skip_unmatched_layers:
if 'x_pos_embed' in key or 'y_pos_embed' in key:
shape_old = loaded_state_dict[key_old].shape
shape_new = model_state_dict[key].shape
new_val = resize_pos_embed_1d(loaded_state_dict[key_old],
shape_new)
if shape_new == new_val.shape:
model_state_dict[key] = new_val
logger.info("[RESIZE] {} {} -> {} {}".format(
key_old, shape_old, key, shape_new))
else:
logger.info("[WARNING]", "{} {} != {} {}, skip".format(
key_old, new_val.shape, key, shape_new))
elif 'local_relative_position_bias_table' in key:
shape_old = loaded_state_dict[key_old].shape
shape_new = model_state_dict[key].shape
new_val = resize_pos_embed_2d(loaded_state_dict[key_old],
shape_new)
if shape_new == new_val.shape:
model_state_dict[key] = new_val
logger.info("[RESIZE] {} {} -> {} {}".format(
key_old, shape_old, key, shape_new))
else:
logger.info("[WARNING]", "{} {} != {} {}, skip".format(
key_old, new_val.shape, key, shape_new))
else:
# if the layer weights do not match in size, skip this layer
logger.info(
"SKIPPING LAYER {} because of size mismatch".format(key))
continue
model_state_dict[key] = loaded_state_dict[key_old]
logger.info(
log_str_template.format(

Просмотреть файл

@ -5,6 +5,7 @@ Implements the FRCNN with Attribute Head
import numpy as np
import torch
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.image_list import to_image_list
from maskrcnn_benchmark.modeling.detector.generalized_rcnn import \
GeneralizedRCNN
@ -56,6 +57,9 @@ class AttrRCNN(GeneralizedRCNN):
"""
if self.training and targets is None:
raise ValueError("In training mode, targets should be passed")
if self.force_boxes and targets is None:
# note: targets cannot be None here, but may contain 0 boxes.
raise ValueError("In force_boxes setting, targets should be passed")
images = to_image_list(images)
images = images.to(self.device)
@ -65,7 +69,20 @@ class AttrRCNN(GeneralizedRCNN):
targets = [target.to(self.device)
for target in targets if target is not None]
proposals, proposal_losses = self.rpn(images, features, targets)
if self.force_boxes:
proposals = [BoxList(target.bbox, target.size, target.mode)
for target in targets]
if self.training:
# note: we still need to compute a loss over all rpn
# named parameters, otherwise distributed training
# raises an unused-parameters error.
null_loss = 0
for key, param in self.rpn.named_parameters():
null_loss += 0.0 * param.sum()
proposal_losses = {'rpn_null_loss': null_loss}
else:
proposals, proposal_losses = self.rpn(images, features, targets)
x, predictions, detector_losses = self.roi_heads(features,
proposals, targets)

Просмотреть файл

@ -48,9 +48,9 @@ class AttributeRCNNLossComputation(object):
# prepare attribute targets
sim_attributes = attribute_logits.new(attribute_logits.size()).zero_()
for i in range(len(attributes)):
if len(torch.nonzero(attributes[i])) > 0:
sim_attributes[i][attributes[i][torch.nonzero(attributes[i])].long()] = 1.0 / len(
torch.nonzero(attributes[i]))
if len(torch.nonzero(attributes[i], as_tuple=False)) > 0:
sim_attributes[i][attributes[i][torch.nonzero(attributes[i], as_tuple=False)].long()] = 1.0 / len(
torch.nonzero(attributes[i], as_tuple=False))
# TODO: do we need to ignore the all zero vector?
attribute_loss = self.cross_entropy(attribute_logits, sim_attributes, loss_type="softmax")
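# Concretely (toy numbers): if attributes[i] = [3, 7, 0, 0] and there are 10
# attribute classes, the loop above yields a soft target with 0.5 at indices 3 and 7
# and 0 elsewhere, i.e. a uniform distribution over the annotated attributes that
# the softmax cross-entropy above is trained to match.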

Просмотреть файл

@ -15,6 +15,10 @@ registry.ROI_ATTRIBUTE_FEATURE_EXTRACTORS.register(
"FPNXconv1fcFeatureExtractor", FPNXconv1fcFeatureExtractor
)
registry.ROI_ATTRIBUTE_FEATURE_EXTRACTORS.register(
"ViTHeadFeatureExtractor", ViTHeadFeatureExtractor
)
def make_roi_attribute_feature_extractor(cfg, in_channels):
func = registry.ROI_ATTRIBUTE_FEATURE_EXTRACTORS[

Просмотреть файл

@ -34,8 +34,8 @@ class BalancedPositiveNegativePairSampler(object):
pos_idx = []
neg_idx = []
for matched_idxs_per_image in matched_idxs:
positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1)
negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1)
positive = torch.nonzero(matched_idxs_per_image >= 1, as_tuple=False).squeeze(1)
negative = torch.nonzero(matched_idxs_per_image == 0, as_tuple=False).squeeze(1)
num_pos = int(self.batch_size_per_image * self.positive_fraction)
# protect against not enough positive examples

Просмотреть файл

@ -55,7 +55,7 @@ class FastRCNNLossComputation(object):
match_j = match_quality_matrix[j].view(1, -1)
match_ij = ((match_i + match_j) / 2)
# remove duplicate index
non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero().view(-1).to(match_ij.device)
non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero(as_tuple=False).view(-1).to(match_ij.device)
match_ij = match_ij.view(-1) # [::match_quality_matrix.shape[1]] = 0
match_ij = match_ij[non_duplicate_idx]
temp.append(match_ij)
@ -79,7 +79,7 @@ class FastRCNNLossComputation(object):
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposal.bbox.device)
proposal_idx_pairs = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero()
non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False)
proposal_box_pairs = proposal_box_pairs[non_duplicate_idx.view(-1)]
proposal_idx_pairs = proposal_idx_pairs[non_duplicate_idx.view(-1)]
proposal_pairs = BoxPairList(proposal_box_pairs, proposal.size, proposal.mode)
@ -167,7 +167,7 @@ class FastRCNNLossComputation(object):
for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
zip(sampled_pos_inds, sampled_neg_inds)
):
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img, as_tuple=False).squeeze(1)
proposal_pairs_per_image = proposal_pairs[img_idx][img_sampled_inds]
proposal_pairs[img_idx] = proposal_pairs_per_image
@ -245,13 +245,13 @@ class FastRCNNLossComputation(object):
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposals[0].bbox.device)
proposal_idx_pairs_per_image = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
keep_idx = (proposal_idx_pairs_per_image[:, 0] != proposal_idx_pairs_per_image[:, 1]).nonzero().view(-1)
keep_idx = (proposal_idx_pairs_per_image[:, 0] != proposal_idx_pairs_per_image[:, 1]).nonzero(as_tuple=False).view(-1)
# if we filter non overlap bounding boxes
if cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
ious = boxlist_iou(proposals[0], proposals[0]).view(-1)
ious = ious[keep_idx]
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
# proposal_idx_pairs_per_image = proposal_idx_pairs_per_image[keep_idx]
proposal_box_pairs_per_image = proposal_box_pairs_per_image[keep_idx]
proposal_box_pairs.append(proposal_box_pairs_per_image)
@ -361,7 +361,7 @@ class FastRCNNLossComputation(object):
labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0)
# import pdb; pdb.set_trace()
rel_fg_cnt = len(labels.nonzero())
rel_fg_cnt = len(labels.nonzero(as_tuple=False))
rel_bg_cnt = labels.shape[0] - rel_fg_cnt
ce_weights = labels.new(class_logits.size(1)).fill_(1).float()
ce_weights[0] = float(rel_fg_cnt) / (rel_bg_cnt + 1e-5)

Просмотреть файл

@ -113,7 +113,7 @@ class MSDN_BASE(nn.Module):
requires_grad=True).type_as(target_features)
feature_data.append(temp)
else:
transfer_list = (select_mat.data > 0).nonzero()
transfer_list = (select_mat.data > 0).nonzero(as_tuple=False)
source_indices = Variable(transfer_list[:, 1])
target_indices = Variable(transfer_list[:, 0])
source_f = torch.index_select(source_features, 0, source_indices)
@ -122,7 +122,7 @@ class MSDN_BASE(nn.Module):
for f_id in range(target_features.size()[0]):
if select_mat[f_id, :].data.sum() > 0:
feature_indices = (transfer_list[:, 0] == f_id).nonzero()[0]
feature_indices = (transfer_list[:, 0] == f_id).nonzero(as_tuple=False)[0]
indices = Variable(feature_indices)
features = torch.index_select(transferred_features, 0,
indices).mean(0).view(-1)

Просмотреть файл

@ -93,7 +93,7 @@ class PairMatcher(object):
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
# Find highest quality match available, even if it is low, including ties
gt_pred_pairs_of_highest_quality = torch.nonzero(
match_quality_matrix == highest_quality_foreach_gt[:, None]
match_quality_matrix == highest_quality_foreach_gt[:, None], as_tuple=False
)
# Example gt_pred_pairs_of_highest_quality:
# tensor([[ 0, 39796],

Просмотреть файл

@ -98,13 +98,13 @@ class ROIRelationHead(torch.nn.Module):
proposal_label_pairs = torch.cat(
(label_subj.view(-1, 1), label_obj.view(-1, 1)), 1)
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero().view(-1)
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False).view(-1)
# if we filter non overlap bounding boxes
if self.cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
ious = boxlist_iou(proposals_per_image, proposals_per_image).view(-1)
ious = ious[keep_idx]
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
proposal_idx_pairs = proposal_idx_pairs[keep_idx]
proposal_box_pairs = proposal_box_pairs[keep_idx]
proposal_label_pairs = proposal_label_pairs[keep_idx]

Просмотреть файл

@ -46,7 +46,7 @@ class RelPN(nn.Module):
match_ij = ((match_i + match_j) / 2)
# remove duplicate index
match_ij = match_ij.view(-1) # [::match_quality_matrix.shape[1]] = 0
# non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero().view(-1).to(match_ij.device)
# non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero(as_tuple=False).view(-1).to(match_ij.device)
# match_ij = match_ij[non_duplicate_idx]
temp.append(match_ij)
boxi = target.bbox[i]; boxj = target.bbox[j]
@ -68,7 +68,7 @@ class RelPN(nn.Module):
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposal.bbox.device)
proposal_idx_pairs = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
# non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero()
# non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False)
# proposal_box_pairs = proposal_box_pairs[non_duplicate_idx.view(-1)]
# proposal_idx_pairs = proposal_idx_pairs[non_duplicate_idx.view(-1)]
@ -184,13 +184,13 @@ class RelPN(nn.Module):
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposals_per_image.bbox.device)
proposal_idx_pairs = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero().view(-1)
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False).view(-1)
# if we filter non overlap bounding boxes
if self.cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
ious = boxlist_iou(proposals_per_image, proposals_per_image).view(-1)
ious = ious[keep_idx]
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
proposal_idx_pairs = proposal_idx_pairs[keep_idx]
proposal_box_pairs = proposal_box_pairs[keep_idx]
proposal_pairs_per_image = BoxPairList(proposal_box_pairs, proposals_per_image.size, proposals_per_image.mode)
@ -212,11 +212,11 @@ class RelPN(nn.Module):
obj_logits = proposals_per_image.get_field('scores_all')
obj_bboxes = proposals_per_image.bbox
relness = self.relationshipness(obj_logits, obj_bboxes, proposals_per_image.size)
keep_idx = (1 - torch.eye(obj_logits.shape[0]).to(relness.device)).view(-1).nonzero().view(-1)
keep_idx = (1 - torch.eye(obj_logits.shape[0]).to(relness.device)).view(-1).nonzero(as_tuple=False).view(-1)
if self.cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
ious = boxlist_iou(proposals_per_image, proposals_per_image).view(-1)
ious = ious[keep_idx]
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
relness = relness.view(-1)[keep_idx]
relness_sorted, order = torch.sort(relness.view(-1), descending=True)
@ -266,7 +266,7 @@ class RelPN(nn.Module):
proposals = self._proposal_pairs
labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0)
rel_fg_cnt = len(labels.nonzero())
rel_fg_cnt = len(labels.nonzero(as_tuple=False))
rel_bg_cnt = labels.shape[0] - rel_fg_cnt
ce_weights = labels.new(class_logits.size(1)).fill_(1).float()
ce_weights[0] = float(rel_fg_cnt) / (rel_bg_cnt + 1e-5)

Просмотреть файл

@ -1,6 +1,6 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
WEIGHT: "/home/xiaothan/c/Users/xiaothan/Downloads/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
WEIGHT: "pretrained_models/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
BACKBONE:
CONV_BODY: "R-152-FPN"
RESNETS:

Просмотреть файл

@ -0,0 +1,70 @@
MODEL:
META_ARCHITECTURE: "AttrRCNN"
WEIGHT: "/mnt/model_storage/msvit/IN22kpretrained/deepbase_relative/model_best.pth"
BACKBONE:
CONV_BODY: "ViL-C4"
TRANSFORMER:
DROP: 0.0
DROP_PATH: 0.3
NORM_EMBED: True
OUT_FEATURES: ["layer3"]
VITHEADARCH: "l4,h12,d768,n1,s0,g0,p2,f7,a0"
MSVIT:
ARCH: "l1,h3,d96,n1,s1,g1,p4,f7,a0_l2,h3,d192,n8,s1,g1,p2,f7,a0_l3,h6,d384,n24,s1,g1,p2,f7,a0"
ATTN_TYPE: longformerhand
ONLY_GLOBAL: False
SHARE_KV: True
SHARE_W: True
SW_EXACT: 0
RPN:
PRE_NMS_TOP_N_TEST: 6000
POST_NMS_TOP_N_TEST: 300
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 384 # 512
POSITIVE_FRACTION: 0.5 # 0.25
SCORE_THRESH: 0.05 # 0.0001
DETECTIONS_PER_IMG: 100 # 600
MIN_DETECTIONS_PER_IMG: 10
ROI_BOX_HEAD:
NUM_CLASSES: 1595
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ROI_ATTRIBUTE_HEAD:
NUM_ATTRIBUTES: 525
POSTPROCESS_ATTRIBUTES_THRESHOLD: 0.0
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ATTRIBUTE_ON: False
INPUT:
MIN_SIZE_TEST: 600
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:
BASE_LR: 0.00008
WEIGHT_DECAY: 0.05
STEPS: (75000, 100000)
MAX_ITER: 170000
IMS_PER_BATCH: 1
CHECKPOINT_PERIOD: 5000
OPTIMIZER: "ADAMW"
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
TEST:
IMS_PER_BATCH: 1
SKIP_PERFORMANCE_EVAL: False
SAVE_PREDICTIONS: True
SAVE_RESULTS_TO_TSV: True
TSV_SAVE_SUBSET: ['rect', 'class', 'conf']
GATHER_ON_CPU: False
OUTPUT_DIR: "./output/vilc4_test"
DATA_DIR: "./datasets"
DISTRIBUTED_BACKEND: 'nccl'

Просмотреть файл

@ -0,0 +1,70 @@
MODEL:
META_ARCHITECTURE: "AttrRCNN"
WEIGHT: "/mnt/model_storage/msvit/IN22kpretrained/villarge_relative/model_best.pth"
BACKBONE:
CONV_BODY: "ViL-C4"
TRANSFORMER:
DROP: 0.0
DROP_PATH: 0.5
NORM_EMBED: True
OUT_FEATURES: ["layer3"]
VITHEADARCH: "l4,h24,d1536,n1,s0,g0,p2,f7,a0"
MSVIT:
ARCH: "l1,h3,d192,n1,s1,g1,p4,f7,a0_l2,h6,d384,n8,s1,g1,p2,f7,a0_l3,h12,d768,n24,s1,g1,p2,f7,a0"
ATTN_TYPE: longformerhand
ONLY_GLOBAL: False
SHARE_KV: True
SHARE_W: True
SW_EXACT: 0
RPN:
PRE_NMS_TOP_N_TEST: 6000
POST_NMS_TOP_N_TEST: 300
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 384 # 512
POSITIVE_FRACTION: 0.5 # 0.25
SCORE_THRESH: 0.05 # 0.0001
DETECTIONS_PER_IMG: 100 # 600
MIN_DETECTIONS_PER_IMG: 10
ROI_BOX_HEAD:
NUM_CLASSES: 1595
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ROI_ATTRIBUTE_HEAD:
NUM_ATTRIBUTES: 525
POSTPROCESS_ATTRIBUTES_THRESHOLD: 0.0
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ATTRIBUTE_ON: False
INPUT:
MIN_SIZE_TEST: 600
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:
BASE_LR: 0.00008
WEIGHT_DECAY: 0.05
STEPS: (75000, 100000)
MAX_ITER: 170000
IMS_PER_BATCH: 1
CHECKPOINT_PERIOD: 5000
OPTIMIZER: "ADAMW"
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
TEST:
IMS_PER_BATCH: 1
SKIP_PERFORMANCE_EVAL: False
SAVE_PREDICTIONS: True
SAVE_RESULTS_TO_TSV: True
TSV_SAVE_SUBSET: ['rect', 'class', 'conf']
GATHER_ON_CPU: False
OUTPUT_DIR: "./output/vilc4_test"
DATA_DIR: "./datasets"
DISTRIBUTED_BACKEND: 'nccl'

Просмотреть файл

@ -0,0 +1,70 @@
MODEL:
META_ARCHITECTURE: "AttrRCNN"
WEIGHT: "/mnt/model_storage/msvit/visionlongformer/longtiny1191_ape0_exact0_nglo1_mode1_swith075/model_best.pth"
BACKBONE:
CONV_BODY: "ViL-C4"
TRANSFORMER:
DROP: 0.0
DROP_PATH: 0.1
NORM_EMBED: True
OUT_FEATURES: ["layer3"]
VITHEADARCH: "l4,h12,d768,n1,s0,g0,p2,f7,a0"
MSVIT:
ARCH: "l1,h3,d96,n1,s1,g1,p4,f7,a0_l2,h3,d192,n2,s1,g1,p2,f7,a0_l3,h6,d384,n8,s1,g1,p2,f7,a0"
ATTN_TYPE: longformerhand
ONLY_GLOBAL: False
SHARE_KV: True
SHARE_W: True
SW_EXACT: 0
RPN:
PRE_NMS_TOP_N_TEST: 6000
POST_NMS_TOP_N_TEST: 300
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 384 # 512
POSITIVE_FRACTION: 0.5 # 0.25
SCORE_THRESH: 0.05 # 0.0001
DETECTIONS_PER_IMG: 100 # 600
MIN_DETECTIONS_PER_IMG: 10
ROI_BOX_HEAD:
NUM_CLASSES: 1595
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ROI_ATTRIBUTE_HEAD:
NUM_ATTRIBUTES: 525
POSTPROCESS_ATTRIBUTES_THRESHOLD: 0.0
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ATTRIBUTE_ON: False
INPUT:
MIN_SIZE_TEST: 600
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:
BASE_LR: 0.0001
WEIGHT_DECAY: 0.05
STEPS: (75000, 100000)
MAX_ITER: 170000
IMS_PER_BATCH: 1
CHECKPOINT_PERIOD: 5000
OPTIMIZER: "ADAMW"
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
TEST:
IMS_PER_BATCH: 1
SKIP_PERFORMANCE_EVAL: False
SAVE_PREDICTIONS: True
SAVE_RESULTS_TO_TSV: True
TSV_SAVE_SUBSET: ['rect', 'class', 'conf']
GATHER_ON_CPU: False
OUTPUT_DIR: "./output/vilc4_test"
DATA_DIR: "./datasets"
DISTRIBUTED_BACKEND: 'nccl'

Просмотреть файл

@ -28,9 +28,13 @@ INPUT:
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [103.530, 116.280, 123.675]
DATASETS:
FACTORY_TEST: ("ODTSVDataset",)
TEST: ("flickr30k/tsv/flickr30k.yaml",)
# FACTORY_TEST: ("ODTSVDataset",)
# TEST: ("flickr30k/tsv/flickr30k.yaml",)
LABELMAP_FILE: "visualgenome/VG-SGG-dicts-vgoi6-clipped.json"
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:

Просмотреть файл

@ -1,7 +1,6 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
WEIGHT: "pretrained_model/RX152FPN_reldn_oi_best.pth"
# WEIGHT: "/home/xiaothan/c/Users/xiaothan/Downloads/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
USE_FREQ_PRIOR: False
FREQ_PRIOR: "openimages_v5c/vrd/vrd_frequency_prior_include_background.npy"
BACKBONE:

Просмотреть файл

@ -1,7 +1,6 @@
MODEL:
META_ARCHITECTURE: "SceneParser"
WEIGHT: "pretrained_model/RX152FPN_reldn_oi_best.pth"
# WEIGHT: "/home/xiaothan/c/Users/xiaothan/Downloads/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
USE_FREQ_PRIOR: False
FREQ_PRIOR: "openimages_v5c/vrd/vrd_frequency_prior_include_background.npy"
BACKBONE:

Просмотреть файл

@ -18,12 +18,6 @@ from maskrcnn_benchmark.utils.comm import synchronize, get_rank
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.miscellaneous import mkdir
# Check if we can enable mixed-precision via apex.amp
try:
from apex import amp
except ImportError:
raise ImportError('Use APEX for mixed precision via apex.amp')
def main():
parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference")
@ -73,10 +67,6 @@ def main():
model = build_detection_model(cfg)
model.to(cfg.MODEL.DEVICE)
# Initialize mixed-precision if necessary
use_mixed_precision = cfg.DTYPE == 'float16'
amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)
output_dir = cfg.OUTPUT_DIR
checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt

Просмотреть файл

@ -6,6 +6,7 @@ from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:sk
import argparse
import os
import json
import torch
from maskrcnn_benchmark.config import cfg
@ -21,11 +22,112 @@ from maskrcnn_benchmark.utils.comm import synchronize, get_rank
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.miscellaneous import mkdir
# Check if we can enable mixed-precision via apex.amp
# try:
# from apex import amp
# except ImportError:
# raise ImportError('Use APEX for mixed precision via apex.amp')
def run_test(cfg, model, distributed, model_name):
if distributed and hasattr(model, 'module'):
model = model.module
torch.cuda.empty_cache() # TODO check if it helps
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
if len(dataset_names) == 1:
output_folder = os.path.join(
cfg.OUTPUT_DIR, "inference",
os.path.splitext(model_name)[0]
)
mkdir(output_folder)
output_folders = [output_folder]
else:
for idx, dataset_name in enumerate(dataset_names):
dataset_name1 = dataset_name.replace('/', '_')
output_folder = os.path.join(
cfg.OUTPUT_DIR, "inference",
dataset_name1,
os.path.splitext(model_name)[0]
)
mkdir(output_folder)
output_folders[idx] = output_folder
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
labelmap_file = config_dataset_file(cfg.DATA_DIR, cfg.DATASETS.LABELMAP_FILE)
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
results = inference(
model,
cfg,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
bbox_aug=cfg.TEST.BBOX_AUG.ENABLED,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
skip_performance_eval=cfg.TEST.SKIP_PERFORMANCE_EVAL,
labelmap_file=labelmap_file,
save_predictions=cfg.TEST.SAVE_PREDICTIONS,
)
# renaming box_proposals metric to rpn_proposals if RPN_ONLY is True
if results and 'box_proposal' in results and cfg.MODEL.RPN_ONLY:
results['rpn_proposal'] = results.pop('box_proposal')
if results and output_folder:
results_path = os.path.join(output_folder, "results.json")
# checking if this file already exists and only updating tasks
# that are already present. This is useful for including
# e.g. RPN_ONLY metrics
if os.path.isfile(results_path):
with open(results_path, 'rt') as fin:
old_results = json.load(fin)
old_results.update(results)
results = old_results
with open(results_path, 'wt') as fout:
json.dump(results, fout)
synchronize()
# evaluate attribute detection
if not cfg.MODEL.RPN_ONLY and cfg.MODEL.ATTRIBUTE_ON and (not cfg.TEST.SKIP_PERFORMANCE_EVAL):
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
for output_folder, dataset_name, data_loader_val in zip(
output_folders, dataset_names, data_loaders_val
):
results_attr = inference(
model,
cfg,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
skip_performance_eval=cfg.TEST.SKIP_PERFORMANCE_EVAL,
labelmap_file=labelmap_file,
save_predictions=cfg.TEST.SAVE_PREDICTIONS,
eval_attributes=True,
)
if results_attr and output_folder:
results_path = os.path.join(output_folder, "results.json")
# checking if this file already exists and only updating tasks
# that are already present. This is useful for including
# e.g. RPN_ONLY metrics
if os.path.isfile(results_path):
with open(results_path, 'rt') as fin:
old_results = json.load(fin)
old_results.update(results_attr)
results_attr = old_results
with open(results_path, 'wt') as fout:
json.dump(results_attr, fout)
synchronize()
def main():
@ -52,7 +154,7 @@ def main():
args = parser.parse_args()
num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
distributed = num_gpus > 1
args.distributed = num_gpus > 1
cfg.set_new_allowed(True)
cfg.merge_from_other_cfg(sg_cfg)
@ -61,7 +163,7 @@ def main():
cfg.merge_from_list(args.opts)
cfg.freeze()
if distributed:
if args.distributed:
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(
backend=cfg.DISTRIBUTED_BACKEND, init_method="env://"
@ -82,47 +184,13 @@ def main():
model = AttrRCNN(cfg)
model.to(cfg.MODEL.DEVICE)
# Initialize mixed-precision if necessary
# use_mixed_precision = cfg.DTYPE == 'float16'
# amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)
output_dir = cfg.OUTPUT_DIR
checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt
_ = checkpointer.load(ckpt, use_latest=args.ckpt is None)
model_name = os.path.basename(ckpt)
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
for idx, dataset_name in enumerate(dataset_names):
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
mkdir(output_folder)
output_folders[idx] = output_folder
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
labelmap_file = config_dataset_file(cfg.DATA_DIR, cfg.DATASETS.LABELMAP_FILE)
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
inference(
model,
cfg,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
bbox_aug=cfg.TEST.BBOX_AUG.ENABLED,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
skip_performance_eval=cfg.TEST.SKIP_PERFORMANCE_EVAL,
labelmap_file=labelmap_file,
save_predictions=cfg.TEST.SAVE_PREDICTIONS,
)
synchronize()
run_test(cfg, model, args.distributed, model_name)
if __name__ == "__main__":

Просмотреть файл

@ -15,12 +15,14 @@ import torch
from maskrcnn_benchmark.config import cfg
from scene_graph_benchmark.config import sg_cfg
from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.data.datasets.utils.load_files import config_dataset_file
from maskrcnn_benchmark.solver import make_lr_scheduler
from maskrcnn_benchmark.solver import make_optimizer
from maskrcnn_benchmark.solver import make_optimizer, make_optimizer_d2
from maskrcnn_benchmark.engine.inference import inference
from maskrcnn_benchmark.engine.trainer import do_train
from maskrcnn_benchmark.modeling.detector import build_detection_model
from scene_graph_benchmark.scene_parser import SceneParser
from scene_graph_benchmark.AttrRCNN import AttrRCNN
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.utils.collect_env import collect_env_info
from maskrcnn_benchmark.utils.comm import synchronize, get_rank
@ -28,13 +30,7 @@ from maskrcnn_benchmark.utils.imports import import_file
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
from maskrcnn_benchmark.utils.miscellaneous import mkdir, save_config
# See if we can use apex.DistributedDataParallel instead of the torch default,
# and enable mixed-precision via apex.amp
try:
from apex import amp
except ImportError:
raise ImportError('Use APEX for multi-precision via apex.amp')
from tools.test_sg_net import run_test
import random
import numpy as np
@ -50,23 +46,24 @@ torch.backends.cudnn.deterministic = True
def train(cfg, local_rank, distributed):
model = SceneParser(cfg)
if cfg.MODEL.META_ARCHITECTURE == "SceneParser":
model = SceneParser(cfg)
elif cfg.MODEL.META_ARCHITECTURE == "AttrRCNN":
model = AttrRCNN(cfg)
device = torch.device(cfg.MODEL.DEVICE)
model.to(device)
optimizer = make_optimizer(cfg, model)
if cfg.MODEL.BACKBONE.CONV_BODY.startswith("ViL"):
optimizer = make_optimizer_d2(cfg, model)
else:
optimizer = make_optimizer(cfg, model)
scheduler = make_lr_scheduler(cfg, optimizer)
# # Initialize mixed-precision training
# use_mixed_precision = cfg.DTYPE == "float16"
# amp_opt_level = 'O1' if use_mixed_precision else 'O0'
# model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)
if distributed:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[local_rank], output_device=local_rank,
# this should be removed if we update BatchNorm stats
broadcast_buffers=False,
broadcast_buffers=False, find_unused_parameters=True
)
arguments = {}
@ -116,39 +113,6 @@ def train(cfg, local_rank, distributed):
return model
def run_test(cfg, model, distributed):
if distributed:
model = model.module
torch.cuda.empty_cache() # TODO check if it helps
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
for idx, dataset_name in enumerate(dataset_names):
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
mkdir(output_folder)
output_folders[idx] = output_folder
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
inference(
model,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
bbox_aug=cfg.TEST.BBOX_AUG.ENABLED,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
)
synchronize()
def main():
parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
parser.add_argument(
@ -216,7 +180,7 @@ def main():
model = train(cfg, args.local_rank, args.distributed)
if not args.skip_test:
run_test(cfg, model, args.distributed)
run_test(cfg, model, args.distributed, model_name="model_final")
if __name__ == "__main__":