Commit a93180e85c

INSTALL.md (13 lines changed)

@@ -1,10 +1,11 @@
 ## Installation

 ### Requirements:
-- PyTorch 1.4
+- PyTorch 1.7
 - torchvision
 - cocoapi
-- yacs
+- yacs>=0.1.8
+- numpy>=1.19.5
 - matplotlib
 - GCC >= 4.9
 - OpenCV
@@ -25,9 +26,10 @@ conda activate sg_benchmark
 conda install ipython h5py nltk joblib jupyter pandas scipy

 # maskrcnn_benchmark and coco api dependencies
-pip install ninja yacs==0.1.8 cython matplotlib tqdm opencv-python numpy=1.19.5
+pip install ninja yacs>=0.1.8 cython matplotlib tqdm opencv-python numpy>=1.19.5

-conda install pytorch==1.4.0 torchvision==0.5.0 cudatoolkit=10.1 -c pytorch
+conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.1 -c pytorch
+conda install -c conda-forge timm einops

 # install pycocotools
 conda install -c conda-forge pycocotools
@@ -35,9 +37,6 @@ conda install -c conda-forge pycocotools
 # install cityscapesScripts
 python -m pip install cityscapesscripts

-# install apex
-conda install -c conda-forge nvidia-apex
-
 # install Scene Graph Detection
 git clone https://github.com/microsoft/scene_graph_benchmark
 cd scene_graph_benchmark
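As a quick check that the environment described above matches what the rest of this commit expects, something like the following can be run (a minimal sketch; it only assumes the PyTorch 1.7.1 / torchvision 0.8.2 conda install from the diff succeeded):

```python
# Post-install sanity check for the upgraded environment (illustrative, not part of the commit).
import torch
import torchvision

print("torch:", torch.__version__)               # expected to start with 1.7
print("torchvision:", torchvision.__version__)   # expected to start with 0.8
print("CUDA available:", torch.cuda.is_available())
print("CUDA toolkit:", torch.version.cuda)       # 10.1 for the cudatoolkit=10.1 build above
```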
README.md (14 lines changed)

@@ -1,15 +1,12 @@
-# Scene Graph Benchmark in PyTorch 1.4
+# Scene Graph Benchmark in PyTorch 1.7

 **This project is based on [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark)**

 This project aims at providing the necessary building blocks for easily
 creating detection and segmentation models using PyTorch 1.0.

 ![alt text](demo/R152FPN_demo.png "from https://storage.googleapis.com/openimages/web/index.html")

 ## Highlights
-- **Upgrad to pytorch 1.4 (can also upgrade to 1.7)**
+- **Upgrad to pytorch 1.7**
 - **Multi-GPU training and inference**
 - **Batched inference:** can perform inference using multiple images per batch per GPU.
 - **Fast and flexible tsv dataset format**
@@ -35,15 +32,16 @@ Here is how we would do it. Run the following commands:
 # visualize VinVL object detection
 # pretrained models at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/vinvl_vg_x152c4.pth
 # the associated labelmap at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/VG-SGG-dicts-vgoi6-clipped.json
-python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file ../maskrcnn-benchmark-1/datasets1/imgs/woman_fish.jpg --save_file output/woman_fish_x152c4.obj.jpg MODEL.WEIGHT models/vinvl/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 DATA_DIR "../maskrcnn-benchmark-1/datasets1" TEST.IGNORE_BOX_REGRESSION False
+python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file demo/woman_fish.jpg --save_file output/woman_fish_x152c4.obj.jpg MODEL.WEIGHT pretrained_model/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 TEST.IGNORE_BOX_REGRESSION False

 # visualize VinVL object-attribute detection
 # pretrained models at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/vinvl_vg_x152c4.pth
 # the associated labelmap at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/VG-SGG-dicts-vgoi6-clipped.json
-python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file ../maskrcnn-benchmark-1/datasets1/imgs/woman_fish.jpg --save_file output/woman_fish_x152c4.attr.jpg --visualize_attr MODEL.WEIGHT models/vinvl/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 DATA_DIR "../maskrcnn-benchmark-1/datasets1" TEST.IGNORE_BOX_REGRESSION False
+python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file demo/woman_fish.jpg --save_file output/woman_fish_x152c4.attr.jpg --visualize_attr MODEL.WEIGHT pretrained_model/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 TEST.IGNORE_BOX_REGRESSION False

 # visualize OpenImage scene graph generation by RelDN
-python tools/demo/demo_image.py --config_file sgg_configs/vrd/R152FPN_vrd_reldn.yaml --img_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.jpg --save_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa_output.jpg --visualize_relation MODEL.ROI_RELATION_HEAD.DETECTOR_PRE_CALCULATED False
+# pretrained models at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/sgg_model_zoo/sgg_oi_vrd_model_zoo/RX152FPN_reldn_oi_best.pth
+python tools/demo/demo_image.py --config_file sgg_configs/vrd/R152FPN_vrd_reldn.yaml --img_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.jpg --save_file output/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.reldn_relation.jpg --visualize_relation MODEL.ROI_RELATION_HEAD.DETECTOR_PRE_CALCULATED False

 # visualize Visual Genome scene graph generation by neural motif
 python tools/demo/demo_image.py --config_file sgg_configs/vg_vrd/rel_danfeiX_FPN50_nm.yaml --img_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.jpg --save_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa_vgnm.jpg --visualize_relation MODEL.ROI_RELATION_HEAD.DETECTOR_PRE_CALCULATED False DATASETS.LABELMAP_FILE "visualgenome/VG-SGG-dicts-danfeiX-clipped.json" DATA_DIR /home/penzhan/GitHub/maskrcnn-benchmark-1/datasets1 MODEL.ROI_RELATION_HEAD.USE_BIAS True MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP True MODEL.ROI_HEADS.DETECTIONS_PER_IMG 64 MODEL.ROI_RELATION_HEAD.SHARE_BOX_FEATURE_EXTRACTOR False MODEL.ROI_RELATION_HEAD.NEURAL_MOTIF.OBJ_LSTM_NUM_LAYERS 0 MODEL.ROI_RELATION_HEAD.NEURAL_MOTIF.EDGE_LSTM_NUM_LAYERS 2 TEST.IMS_PER_BATCH 2
@@ -273,7 +273,7 @@ class COCODemo(object):
             the BoxList via `prediction.fields()`
         """
         scores = predictions.get_field("scores")
-        keep = torch.nonzero(scores > self.confidence_threshold).squeeze(1)
+        keep = torch.nonzero(scores > self.confidence_threshold, as_tuple=False).squeeze(1)
         predictions = predictions[keep]
         scores = predictions.get_field("scores")
         _, idx = scores.sort(0, descending=True)
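The predictor change above follows a PyTorch API shift: recent releases warn when `torch.nonzero` is called without `as_tuple`, and passing `as_tuple=False` keeps the old single-tensor return value. A small standalone illustration (a sketch, not code from this repository):

```python
import torch

scores = torch.tensor([0.9, 0.1, 0.7])
confidence_threshold = 0.5

# Old spelling: still works, but newer PyTorch emits a deprecation warning.
keep_old = torch.nonzero(scores > confidence_threshold).squeeze(1)

# Spelling used in the diff above: explicit as_tuple=False, identical result.
keep_new = torch.nonzero(scores > confidence_threshold, as_tuple=False).squeeze(1)

assert torch.equal(keep_old, keep_new)  # both are tensor([0, 2])
```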
Binary file not shown.
After: Width | Height | Size: 1.7 MiB
|
@@ -1,7 +1,7 @@
 ARG CUDA="10.1"
 ARG CUDNN="7"

-FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu16.04
+FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu18.04

 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
@@ -36,20 +36,20 @@ RUN pip --no-cache-dir install --force-reinstall -I pyyaml

 RUN python -m nltk.downloader punkt

-# Install latest PyTorch 1.4
+# Install latest PyTorch 1.7.1
 ARG CUDA
-RUN conda install pytorch~=1.4.0 torchvision cudatoolkit=${CUDA} -c pytorch \
+RUN conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.1 -c pytorch \
  && conda clean -ya
+RUN conda install -y -c conda-forge timm einops

 # install pycocotools
-RUN git clone https://github.com/cocodataset/cocoapi.git \
- && cd cocoapi/PythonAPI \
- && python setup.py build_ext install
+# RUN git clone https://github.com/cocodataset/cocoapi.git \
+# && cd cocoapi/PythonAPI \
+# && python setup.py build_ext install
+RUN conda install -y -c conda-forge pycocotools

-# install apex
-RUN git clone https://github.com/NVIDIA/apex.git \
- && cd apex \
- && python setup.py install --cuda_ext --cpp_ext
+# install cityscapesScripts
+RUN python -m pip install cityscapesscripts

 # install PyTorch Detection
 ARG FORCE_CUDA="1"
@@ -61,7 +61,7 @@ RUN echo """syntax on\nfiletype indent on\nset autoindent\nset number\ncolorsche
 CMD [ "zsh" ]

-# RUN git clone https://github.com/hanxiaotian/scene_graph_benchmark.git \
+# RUN git clone https://github.com/microsoft/scene_graph_benchmark.git \
 # && cd scene_graph_benchmark \
 # && python setup.py build develop
@@ -56,11 +56,6 @@ RUN git clone https://github.com/cocodataset/cocoapi.git \
  && cd cocoapi/PythonAPI \
  && python setup.py build_ext install

-# install apex
-RUN git clone https://github.com/NVIDIA/apex.git \
- && cd apex \
- && python setup.py install --cuda_ext --cpp_ext
-
 # install PyTorch Detection
 ARG FORCE_CUDA="1"
 ENV FORCE_CUDA=${FORCE_CUDA}
@@ -178,6 +178,8 @@ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
 _C.MODEL.RPN.FPN_POST_NMS_PER_BATCH = True
 # Custom rpn head, empty to use default conv or separable conv
 _C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead"
+# use gt target box as proposals for roi_heads (shared in training and testing)
+_C.MODEL.RPN.FORCE_BOXES = False


 # ---------------------------------------------------------------------------- #
@@ -302,6 +304,28 @@ _C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False)
 _C.MODEL.RESNETS.WITH_MODULATED_DCN = False
 _C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1

+# ---------------------------------------------------------------------------- #
+# Vision Transformer Options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.TRANSFORMER = CN()
+_C.MODEL.TRANSFORMER.DROP = 0.0
+_C.MODEL.TRANSFORMER.DROP_PATH = 0.1
+_C.MODEL.TRANSFORMER.NORM_EMBED = True
+_C.MODEL.TRANSFORMER.AVG_POOL = False
+_C.MODEL.TRANSFORMER.VITHEADARCH = 'l4,h12,d768,n1,s0,g0,p2,f7,a0'
+
+_C.MODEL.TRANSFORMER.MSVIT = CN()
+_C.MODEL.TRANSFORMER.MSVIT.ARCH = 'l1,h3,d96,n1,s1,g1,p4,f7,a0_l2,h3,d192,n2,s1,g1,p2,f7,a0_l3,h6,d384,n8,s1,g1,p2,f7,a0_l4,h12,d768,n1,s1,g0,p2,f7,a0'
+_C.MODEL.TRANSFORMER.MSVIT.SHARE_W = True
+_C.MODEL.TRANSFORMER.MSVIT.ATTN_TYPE = 'longformerhand'
+_C.MODEL.TRANSFORMER.MSVIT.SHARE_KV = True
+_C.MODEL.TRANSFORMER.MSVIT.ONLY_GLOBAL = False
+_C.MODEL.TRANSFORMER.MSVIT.SW_EXACT = 0
+_C.MODEL.TRANSFORMER.MSVIT.LN_EPS = 1e-6
+_C.MODEL.TRANSFORMER.MSVIT.MODE = 0
+_C.MODEL.TRANSFORMER.MSVIT.REDRAW_INTERVAL = 1000
+
+_C.MODEL.TRANSFORMER.OUT_FEATURES = []
+
 # ---------------------------------------------------------------------------- #
 # RetinaNet Options (Follow the Detectron version)
@@ -430,6 +454,15 @@ _C.SOLVER.TEST_PERIOD = 0
 # see 2 images per batch
 _C.SOLVER.IMS_PER_BATCH = 16

+_C.SOLVER.USE_AMP = False
+
+_C.SOLVER.OPTIMIZER = 'SGD' # also support ADAMW
+_C.SOLVER.CLIP_GRADIENTS = CN()
+_C.SOLVER.CLIP_GRADIENTS.ENABLED = False
+_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
+_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0
+_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
+
 # ---------------------------------------------------------------------------- #
 # Specific test options
 # ---------------------------------------------------------------------------- #
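The new solver keys are ordinary yacs options, so they can be overridden from a config file or the command line. A hedged sketch of how a training script might set them (key names come from the diff above; the YAML path is hypothetical):

```python
from maskrcnn_benchmark.config import cfg

cfg.merge_from_file("sgg_configs/my_experiment.yaml")  # hypothetical experiment config
cfg.merge_from_list([
    "SOLVER.OPTIMIZER", "ADAMW",            # default is 'SGD'
    "SOLVER.USE_AMP", True,                 # native mixed precision (see sketch below)
    "SOLVER.CLIP_GRADIENTS.ENABLED", True,
    "SOLVER.CLIP_GRADIENTS.CLIP_VALUE", 1.0,
])
cfg.freeze()
```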
@@ -487,6 +520,7 @@ _C.TEST.IGNORE_BOX_REGRESSION = False
 _C.OUTPUT_DIR = "."
 _C.DATA_DIR = "./datasets"
+_C.DISTRIBUTED_BACKEND = "nccl" # could be "nccl", "gloo" or "mpi"
 _C.LOG_LOSS_PERIOD = 20

 _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
@@ -496,6 +530,3 @@ _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")

 # Precision of input, allowable: (float32, float16)
 _C.DTYPE = "float32"
-
-# Enable verbosity in apex.amp
-_C.AMP_VERBOSE = False
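Dropping the apex-related default fits the new `_C.SOLVER.USE_AMP` flag: PyTorch 1.7 ships mixed precision in `torch.cuda.amp`, so `apex.amp` is no longer needed. A rough sketch of that pattern (not the trainer from this repository; the function and argument names are placeholders):

```python
import torch

def train_one_epoch(model, optimizer, data_loader, use_amp):
    """Illustrative mixed-precision loop; use_amp would mirror cfg.SOLVER.USE_AMP."""
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    for images, targets in data_loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
        scaler.scale(losses).backward()  # backward on the scaled loss
        scaler.step(optimizer)           # unscales gradients, then steps the optimizer
        scaler.update()
```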
|
@@ -14,7 +14,7 @@ at::Tensor ROIAlign_forward(const at::Tensor& input,
 const int pooled_height,
 const int pooled_width,
 const int sampling_ratio) {
-if (input.type().is_cuda()) {
+if (input.device().is_cuda()) {
 #ifdef WITH_CUDA
 return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
 #else
@@ -34,7 +34,7 @@ at::Tensor ROIAlign_backward(const at::Tensor& grad,
 const int height,
 const int width,
 const int sampling_ratio) {
-if (grad.type().is_cuda()) {
+if (grad.device().is_cuda()) {
 #ifdef WITH_CUDA
 return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
 #else
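The C++/CUDA edits from here on track ATen API deprecations in PyTorch 1.7 (`.type()` replaced by `.device()` / `.scalar_type()`, `data<T>()` by `data_ptr<T>()`, `AT_CHECK` by `TORCH_CHECK`), so the custom extension has to be rebuilt against the new PyTorch. A hedged way to confirm the rebuilt extension loads (op names assumed to match the bindings in `vision.cpp`):

```python
# Sketch: run after `python setup.py build develop` to verify the C++/CUDA extension.
from maskrcnn_benchmark import _C  # import fails if built against an incompatible PyTorch

# nms is one of the ops bound by the extension; the other ops follow the same pattern.
print("nms available:", hasattr(_C, "nms"))
print("roi_align_forward available:", hasattr(_C, "roi_align_forward"))
```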
@ -13,7 +13,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input,
|
|||
const float spatial_scale,
|
||||
const int pooled_height,
|
||||
const int pooled_width) {
|
||||
if (input.type().is_cuda()) {
|
||||
if (input.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
|
||||
#else
|
||||
|
@ -34,7 +34,7 @@ at::Tensor ROIPool_backward(const at::Tensor& grad,
|
|||
const int channels,
|
||||
const int height,
|
||||
const int width) {
|
||||
if (grad.type().is_cuda()) {
|
||||
if (grad.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
|
||||
#else
|
||||
|
|
|
@ -13,7 +13,7 @@ at::Tensor SigmoidFocalLoss_forward(
|
|||
const int num_classes,
|
||||
const float gamma,
|
||||
const float alpha) {
|
||||
if (logits.type().is_cuda()) {
|
||||
if (logits.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
|
||||
#else
|
||||
|
@ -30,7 +30,7 @@ at::Tensor SigmoidFocalLoss_backward(
|
|||
const int num_classes,
|
||||
const float gamma,
|
||||
const float alpha) {
|
||||
if (logits.type().is_cuda()) {
|
||||
if (logits.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
|
||||
#else
|
||||
|
|
|
@ -91,7 +91,7 @@ void pre_calc_for_bilinear_interpolate(
|
|||
T hy = 1. - ly, hx = 1. - lx;
|
||||
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
|
||||
|
||||
// save weights and indices
|
||||
// save weights and indeces
|
||||
PreCalc<T> pc;
|
||||
pc.pos1 = y_low * width + x_low;
|
||||
pc.pos2 = y_low * width + x_high;
|
||||
|
@ -168,8 +168,8 @@ void ROIAlignForward_cpu_kernel(
|
|||
// We do average (integral) pooling inside a bin
|
||||
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
|
||||
|
||||
// we want to precalculate indices and weights shared by all channels,
|
||||
// this is the key point of optimization
|
||||
// we want to precalculate indeces and weights shared by all chanels,
|
||||
// this is the key point of optimiation
|
||||
std::vector<PreCalc<T>> pre_calc(
|
||||
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
|
||||
pre_calc_for_bilinear_interpolate(
|
||||
|
@ -224,8 +224,8 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
|
|||
const int pooled_height,
|
||||
const int pooled_width,
|
||||
const int sampling_ratio) {
|
||||
AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor");
|
||||
AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor");
|
||||
AT_ASSERTM(!input.device().is_cuda(), "input must be a CPU tensor");
|
||||
AT_ASSERTM(!rois.device().is_cuda(), "rois must be a CPU tensor");
|
||||
|
||||
auto num_rois = rois.size(0);
|
||||
auto channels = input.size(1);
|
||||
|
@ -239,10 +239,10 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
|
|||
return output;
|
||||
}
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
|
||||
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
|
||||
ROIAlignForward_cpu_kernel<scalar_t>(
|
||||
output_size,
|
||||
input.data<scalar_t>(),
|
||||
input.data_ptr<scalar_t>(),
|
||||
spatial_scale,
|
||||
channels,
|
||||
height,
|
||||
|
@ -250,8 +250,8 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
|
|||
pooled_height,
|
||||
pooled_width,
|
||||
sampling_ratio,
|
||||
rois.data<scalar_t>(),
|
||||
output.data<scalar_t>());
|
||||
rois.data_ptr<scalar_t>(),
|
||||
output.data_ptr<scalar_t>());
|
||||
});
|
||||
return output;
|
||||
}
|
||||
|
|
|
@ -6,8 +6,8 @@ template <typename scalar_t>
|
|||
at::Tensor nms_cpu_kernel(const at::Tensor& dets,
|
||||
const at::Tensor& scores,
|
||||
const float threshold) {
|
||||
AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
|
||||
AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
|
||||
AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
|
||||
AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
|
||||
AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
|
||||
|
||||
if (dets.numel() == 0) {
|
||||
|
@ -26,13 +26,13 @@ at::Tensor nms_cpu_kernel(const at::Tensor& dets,
|
|||
auto ndets = dets.size(0);
|
||||
at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
|
||||
|
||||
auto suppressed = suppressed_t.data<uint8_t>();
|
||||
auto order = order_t.data<int64_t>();
|
||||
auto x1 = x1_t.data<scalar_t>();
|
||||
auto y1 = y1_t.data<scalar_t>();
|
||||
auto x2 = x2_t.data<scalar_t>();
|
||||
auto y2 = y2_t.data<scalar_t>();
|
||||
auto areas = areas_t.data<scalar_t>();
|
||||
auto suppressed = suppressed_t.data_ptr<uint8_t>();
|
||||
auto order = order_t.data_ptr<int64_t>();
|
||||
auto x1 = x1_t.data_ptr<scalar_t>();
|
||||
auto y1 = y1_t.data_ptr<scalar_t>();
|
||||
auto x2 = x2_t.data_ptr<scalar_t>();
|
||||
auto y2 = y2_t.data_ptr<scalar_t>();
|
||||
auto areas = areas_t.data_ptr<scalar_t>();
|
||||
|
||||
for (int64_t _i = 0; _i < ndets; _i++) {
|
||||
auto i = order[_i];
|
||||
|
@ -68,7 +68,7 @@ at::Tensor nms_cpu(const at::Tensor& dets,
|
|||
const at::Tensor& scores,
|
||||
const float threshold) {
|
||||
at::Tensor result;
|
||||
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
|
||||
AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
|
||||
result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
|
||||
});
|
||||
return result;
|
||||
|
|
|
@ -260,8 +260,8 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
|
|||
const int pooled_height,
|
||||
const int pooled_width,
|
||||
const int sampling_ratio) {
|
||||
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
|
||||
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
|
||||
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
|
||||
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
|
||||
|
||||
auto num_rois = rois.size(0);
|
||||
auto channels = input.size(1);
|
||||
|
@ -272,7 +272,7 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
|
|||
auto output_size = num_rois * pooled_height * pooled_width * channels;
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
|
||||
dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
|
||||
dim3 block(512);
|
||||
|
||||
if (output.numel() == 0) {
|
||||
|
@ -280,10 +280,10 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
|
|||
return output;
|
||||
}
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
|
||||
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
|
||||
RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
|
||||
output_size,
|
||||
input.contiguous().data<scalar_t>(),
|
||||
input.contiguous().data_ptr<scalar_t>(),
|
||||
spatial_scale,
|
||||
channels,
|
||||
height,
|
||||
|
@ -291,8 +291,8 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
|
|||
pooled_height,
|
||||
pooled_width,
|
||||
sampling_ratio,
|
||||
rois.contiguous().data<scalar_t>(),
|
||||
output.data<scalar_t>());
|
||||
rois.contiguous().data_ptr<scalar_t>(),
|
||||
output.data_ptr<scalar_t>());
|
||||
});
|
||||
THCudaCheck(cudaGetLastError());
|
||||
return output;
|
||||
|
@ -309,15 +309,15 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
|
|||
const int height,
|
||||
const int width,
|
||||
const int sampling_ratio) {
|
||||
AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
|
||||
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
|
||||
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
|
||||
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
|
||||
|
||||
auto num_rois = rois.size(0);
|
||||
auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
|
||||
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
|
||||
dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
|
||||
dim3 block(512);
|
||||
|
||||
// handle possibly empty gradients
|
||||
|
@ -326,10 +326,10 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
|
|||
return grad_input;
|
||||
}
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] {
|
||||
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
|
||||
RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
|
||||
grad.numel(),
|
||||
grad.contiguous().data<scalar_t>(),
|
||||
grad.contiguous().data_ptr<scalar_t>(),
|
||||
num_rois,
|
||||
spatial_scale,
|
||||
channels,
|
||||
|
@ -338,8 +338,8 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
|
|||
pooled_height,
|
||||
pooled_width,
|
||||
sampling_ratio,
|
||||
grad_input.data<scalar_t>(),
|
||||
rois.contiguous().data<scalar_t>());
|
||||
grad_input.data_ptr<scalar_t>(),
|
||||
rois.contiguous().data_ptr<scalar_t>());
|
||||
});
|
||||
THCudaCheck(cudaGetLastError());
|
||||
return grad_input;
|
||||
|
|
|
@ -112,8 +112,8 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
|
|||
const float spatial_scale,
|
||||
const int pooled_height,
|
||||
const int pooled_width) {
|
||||
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
|
||||
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
|
||||
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
|
||||
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
|
||||
|
||||
auto num_rois = rois.size(0);
|
||||
auto channels = input.size(1);
|
||||
|
@ -126,7 +126,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
|
|||
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
|
||||
dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
|
||||
dim3 block(512);
|
||||
|
||||
if (output.numel() == 0) {
|
||||
|
@ -134,19 +134,19 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
|
|||
return std::make_tuple(output, argmax);
|
||||
}
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] {
|
||||
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] {
|
||||
RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
|
||||
output_size,
|
||||
input.contiguous().data<scalar_t>(),
|
||||
input.contiguous().data_ptr<scalar_t>(),
|
||||
spatial_scale,
|
||||
channels,
|
||||
height,
|
||||
width,
|
||||
pooled_height,
|
||||
pooled_width,
|
||||
rois.contiguous().data<scalar_t>(),
|
||||
output.data<scalar_t>(),
|
||||
argmax.data<int>());
|
||||
rois.contiguous().data_ptr<scalar_t>(),
|
||||
output.data_ptr<scalar_t>(),
|
||||
argmax.data_ptr<int>());
|
||||
});
|
||||
THCudaCheck(cudaGetLastError());
|
||||
return std::make_tuple(output, argmax);
|
||||
|
@ -164,8 +164,8 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
|
|||
const int channels,
|
||||
const int height,
|
||||
const int width) {
|
||||
AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
|
||||
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
|
||||
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
|
||||
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
|
||||
// TODO add more checks
|
||||
|
||||
auto num_rois = rois.size(0);
|
||||
|
@ -173,7 +173,7 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
|
|||
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
|
||||
dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
|
||||
dim3 block(512);
|
||||
|
||||
// handle possibly empty gradients
|
||||
|
@ -182,11 +182,11 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
|
|||
return grad_input;
|
||||
}
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] {
|
||||
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] {
|
||||
RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
|
||||
grad.numel(),
|
||||
grad.contiguous().data<scalar_t>(),
|
||||
argmax.data<int>(),
|
||||
grad.contiguous().data_ptr<scalar_t>(),
|
||||
argmax.data_ptr<int>(),
|
||||
num_rois,
|
||||
spatial_scale,
|
||||
channels,
|
||||
|
@ -194,8 +194,8 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
|
|||
width,
|
||||
pooled_height,
|
||||
pooled_width,
|
||||
grad_input.data<scalar_t>(),
|
||||
rois.contiguous().data<scalar_t>());
|
||||
grad_input.data_ptr<scalar_t>(),
|
||||
rois.contiguous().data_ptr<scalar_t>());
|
||||
});
|
||||
THCudaCheck(cudaGetLastError());
|
||||
return grad_input;
|
||||
|
|
|
@ -107,8 +107,8 @@ at::Tensor SigmoidFocalLoss_forward_cuda(
|
|||
const int num_classes,
|
||||
const float gamma,
|
||||
const float alpha) {
|
||||
AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
|
||||
AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
|
||||
AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
|
||||
AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
|
||||
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
|
||||
|
||||
const int num_samples = logits.size(0);
|
||||
|
@ -117,8 +117,7 @@ at::Tensor SigmoidFocalLoss_forward_cuda(
|
|||
auto losses_size = num_samples * logits.size(1);
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
dim3 grid(std::min(THCCeilDiv((long)losses_size, 512L), 4096L));
|
||||
|
||||
dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
|
||||
dim3 block(512);
|
||||
|
||||
if (losses.numel() == 0) {
|
||||
|
@ -126,16 +125,16 @@ at::Tensor SigmoidFocalLoss_forward_cuda(
|
|||
return losses;
|
||||
}
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] {
|
||||
AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
|
||||
SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
|
||||
losses_size,
|
||||
logits.contiguous().data<scalar_t>(),
|
||||
targets.contiguous().data<int>(),
|
||||
logits.contiguous().data_ptr<scalar_t>(),
|
||||
targets.contiguous().data_ptr<int>(),
|
||||
num_classes,
|
||||
gamma,
|
||||
alpha,
|
||||
num_samples,
|
||||
losses.data<scalar_t>());
|
||||
losses.data_ptr<scalar_t>());
|
||||
});
|
||||
THCudaCheck(cudaGetLastError());
|
||||
return losses;
|
||||
|
@ -149,9 +148,9 @@ at::Tensor SigmoidFocalLoss_backward_cuda(
|
|||
const int num_classes,
|
||||
const float gamma,
|
||||
const float alpha) {
|
||||
AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
|
||||
AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
|
||||
AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor");
|
||||
AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
|
||||
AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
|
||||
AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor");
|
||||
|
||||
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
|
||||
|
||||
|
@ -162,7 +161,7 @@ at::Tensor SigmoidFocalLoss_backward_cuda(
|
|||
auto d_logits_size = num_samples * logits.size(1);
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
dim3 grid(std::min(THCCeilDiv((long)d_logits_size, 512L), 4096L));
|
||||
dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
|
||||
dim3 block(512);
|
||||
|
||||
if (d_logits.numel() == 0) {
|
||||
|
@ -170,17 +169,17 @@ at::Tensor SigmoidFocalLoss_backward_cuda(
|
|||
return d_logits;
|
||||
}
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] {
|
||||
AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
|
||||
SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
|
||||
d_logits_size,
|
||||
logits.contiguous().data<scalar_t>(),
|
||||
targets.contiguous().data<int>(),
|
||||
d_losses.contiguous().data<scalar_t>(),
|
||||
logits.contiguous().data_ptr<scalar_t>(),
|
||||
targets.contiguous().data_ptr<int>(),
|
||||
d_losses.contiguous().data_ptr<scalar_t>(),
|
||||
num_classes,
|
||||
gamma,
|
||||
alpha,
|
||||
num_samples,
|
||||
d_logits.data<scalar_t>());
|
||||
d_logits.data_ptr<scalar_t>());
|
||||
});
|
||||
|
||||
THCudaCheck(cudaGetLastError());
|
||||
|
|
|
@ -1,10 +1,6 @@
|
|||
// modify from
|
||||
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
|
||||
|
||||
#ifndef AT_CHECK
|
||||
#define AT_CHECK TORCH_CHECK
|
||||
#endif
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
|
||||
|
@ -73,26 +69,26 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
|
|||
int padW, int dilationH, int dilationW, int group,
|
||||
int deformable_group)
|
||||
{
|
||||
AT_CHECK(weight.ndimension() == 4,
|
||||
TORCH_CHECK(weight.ndimension() == 4,
|
||||
"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
|
||||
"but got: %s",
|
||||
weight.ndimension());
|
||||
|
||||
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
|
||||
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
|
||||
|
||||
AT_CHECK(kW > 0 && kH > 0,
|
||||
TORCH_CHECK(kW > 0 && kH > 0,
|
||||
"kernel size should be greater than zero, but got kH: %d kW: %d", kH,
|
||||
kW);
|
||||
|
||||
AT_CHECK((weight.size(2) == kH && weight.size(3) == kW),
|
||||
TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
|
||||
"kernel size should be consistent with weight, ",
|
||||
"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
|
||||
kW, weight.size(2), weight.size(3));
|
||||
|
||||
AT_CHECK(dW > 0 && dH > 0,
|
||||
TORCH_CHECK(dW > 0 && dH > 0,
|
||||
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
|
||||
|
||||
AT_CHECK(
|
||||
TORCH_CHECK(
|
||||
dilationW > 0 && dilationH > 0,
|
||||
"dilation should be greater than 0, but got dilationH: %d dilationW: %d",
|
||||
dilationH, dilationW);
|
||||
|
@ -108,7 +104,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
|
|||
dimw++;
|
||||
}
|
||||
|
||||
AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
|
||||
TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
|
||||
ndim);
|
||||
|
||||
long nInputPlane = weight.size(1) * group;
|
||||
|
@ -120,7 +116,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
|
|||
long outputWidth =
|
||||
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
|
||||
|
||||
AT_CHECK(nInputPlane % deformable_group == 0,
|
||||
TORCH_CHECK(nInputPlane % deformable_group == 0,
|
||||
"input channels must divide deformable group size");
|
||||
|
||||
if (outputWidth < 1 || outputHeight < 1)
|
||||
|
@ -130,27 +126,27 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
|
|||
nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
|
||||
outputWidth);
|
||||
|
||||
AT_CHECK(input.size(1) == nInputPlane,
|
||||
TORCH_CHECK(input.size(1) == nInputPlane,
|
||||
"invalid number of input planes, expected: %d, but got: %d",
|
||||
nInputPlane, input.size(1));
|
||||
|
||||
AT_CHECK((inputHeight >= kH && inputWidth >= kW),
|
||||
TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
|
||||
"input image is smaller than kernel");
|
||||
|
||||
AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
|
||||
TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
|
||||
"invalid spatial size of offset, expected height: %d width: %d, but "
|
||||
"got height: %d width: %d",
|
||||
outputHeight, outputWidth, offset.size(2), offset.size(3));
|
||||
|
||||
AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
|
||||
TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
|
||||
"invalid number of channels of offset");
|
||||
|
||||
if (gradOutput != NULL) {
|
||||
AT_CHECK(gradOutput->size(dimf) == nOutputPlane,
|
||||
TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
|
||||
"invalid number of gradOutput planes, expected: %d, but got: %d",
|
||||
nOutputPlane, gradOutput->size(dimf));
|
||||
|
||||
AT_CHECK((gradOutput->size(dimh) == outputHeight &&
|
||||
TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
|
||||
gradOutput->size(dimw) == outputWidth),
|
||||
"invalid size of gradOutput, expected height: %d width: %d , but "
|
||||
"got height: %d width: %d",
|
||||
|
@ -201,7 +197,7 @@ int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
|
|||
long outputHeight =
|
||||
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
|
||||
|
||||
AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
|
||||
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
|
||||
|
||||
output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
|
||||
outputHeight, outputWidth});
|
||||
|
@ -308,7 +304,7 @@ int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
|
|||
long outputHeight =
|
||||
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
|
||||
|
||||
AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
|
||||
TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
|
||||
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
|
||||
columns = at::zeros(
|
||||
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
|
||||
|
@ -424,7 +420,7 @@ int deform_conv_backward_parameters_cuda(
|
|||
long outputHeight =
|
||||
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
|
||||
|
||||
AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
|
||||
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
|
||||
|
||||
columns = at::zeros(
|
||||
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
|
||||
|
@ -505,8 +501,8 @@ void modulated_deform_conv_cuda_forward(
|
|||
const int dilation_w, const int group, const int deformable_group,
|
||||
const bool with_bias)
|
||||
{
|
||||
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
|
||||
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
|
||||
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
|
||||
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
|
||||
|
||||
const int batch = input.size(0);
|
||||
const int channels = input.size(1);
|
||||
|
@ -587,8 +583,8 @@ void modulated_deform_conv_cuda_backward(
|
|||
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
|
||||
const bool with_bias)
|
||||
{
|
||||
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
|
||||
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
|
||||
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
|
||||
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
|
||||
|
||||
const int batch = input.size(0);
|
||||
const int channels = input.size(1);
|
||||
|
|
|
@ -264,10 +264,10 @@ void deformable_im2col(
|
|||
int channel_per_deformable_group = channels / deformable_group;
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
|
||||
data_im.type(), "deformable_im2col_gpu", ([&] {
|
||||
const scalar_t *data_im_ = data_im.data<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
|
||||
scalar_t *data_col_ = data_col.data<scalar_t>();
|
||||
data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
|
||||
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
|
||||
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
|
||||
|
||||
deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
|
||||
num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w,
|
||||
|
@ -358,10 +358,10 @@ void deformable_col2im(
|
|||
int channel_per_deformable_group = channels / deformable_group;
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
|
||||
data_col.type(), "deformable_col2im_gpu", ([&] {
|
||||
const scalar_t *data_col_ = data_col.data<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
|
||||
scalar_t *grad_im_ = grad_im.data<scalar_t>();
|
||||
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
|
||||
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
|
||||
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
|
||||
|
||||
deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
|
||||
num_kernels, data_col_, data_offset_, channels, height, width, ksize_h,
|
||||
|
@ -456,11 +456,11 @@ void deformable_col2im_coord(
|
|||
int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group;
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
|
||||
data_col.type(), "deformable_col2im_coord_gpu", ([&] {
|
||||
const scalar_t *data_col_ = data_col.data<scalar_t>();
|
||||
const scalar_t *data_im_ = data_im.data<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
|
||||
scalar_t *grad_offset_ = grad_offset.data<scalar_t>();
|
||||
data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
|
||||
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
|
||||
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
|
||||
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
|
||||
|
||||
deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
|
||||
num_kernels, data_col_, data_im_, data_offset_, channels, height, width,
|
||||
|
@ -786,11 +786,11 @@ void modulated_deformable_im2col_cuda(
|
|||
const int num_kernels = channels * batch_size * height_col * width_col;
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
|
||||
data_im.type(), "modulated_deformable_im2col_gpu", ([&] {
|
||||
const scalar_t *data_im_ = data_im.data<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
|
||||
const scalar_t *data_mask_ = data_mask.data<scalar_t>();
|
||||
scalar_t *data_col_ = data_col.data<scalar_t>();
|
||||
data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
|
||||
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
|
||||
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
|
||||
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
|
||||
|
||||
modulated_deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
|
||||
num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w,
|
||||
|
@ -818,11 +818,11 @@ void modulated_deformable_col2im_cuda(
|
|||
const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
|
||||
data_col.type(), "modulated_deformable_col2im_gpu", ([&] {
|
||||
const scalar_t *data_col_ = data_col.data<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
|
||||
const scalar_t *data_mask_ = data_mask.data<scalar_t>();
|
||||
scalar_t *grad_im_ = grad_im.data<scalar_t>();
|
||||
data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
|
||||
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
|
||||
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
|
||||
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
|
||||
|
||||
modulated_deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
|
||||
num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im,
|
||||
|
@ -851,13 +851,13 @@ void modulated_deformable_col2im_coord_cuda(
|
|||
const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
|
||||
data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] {
|
||||
const scalar_t *data_col_ = data_col.data<scalar_t>();
|
||||
const scalar_t *data_im_ = data_im.data<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
|
||||
const scalar_t *data_mask_ = data_mask.data<scalar_t>();
|
||||
scalar_t *grad_offset_ = grad_offset.data<scalar_t>();
|
||||
scalar_t *grad_mask_ = grad_mask.data<scalar_t>();
|
||||
data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
|
||||
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
|
||||
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
|
||||
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
|
||||
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
|
||||
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
|
||||
scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
|
||||
|
||||
modulated_deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
|
||||
num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im,
|
||||
|
|
|
@ -5,10 +5,6 @@
|
|||
// author: Charles Shang
|
||||
// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
|
||||
|
||||
#ifndef AT_CHECK
|
||||
#define AT_CHECK TORCH_CHECK
|
||||
#endif
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
|
||||
|
@ -43,7 +39,7 @@ void deform_psroi_pooling_cuda_forward(
|
|||
const int output_dim, const int group_size, const int pooled_size,
|
||||
const int part_size, const int sample_per_part, const float trans_std)
|
||||
{
|
||||
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
|
||||
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
|
||||
|
||||
const int batch = input.size(0);
|
||||
const int channels = input.size(1);
|
||||
|
@ -69,8 +65,8 @@ void deform_psroi_pooling_cuda_backward(
|
|||
const int group_size, const int pooled_size, const int part_size,
|
||||
const int sample_per_part, const float trans_std)
|
||||
{
|
||||
AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
|
||||
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
|
||||
TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
|
||||
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
|
||||
|
||||
const int batch = input.size(0);
|
||||
const int channels = input.size(1);
|
||||
|
|
|
@ -290,12 +290,12 @@ void DeformablePSROIPoolForward(const at::Tensor data,
|
|||
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
|
||||
data.type(), "deformable_psroi_pool_forward", ([&] {
|
||||
const scalar_t *bottom_data = data.data<scalar_t>();
|
||||
const scalar_t *bottom_rois = bbox.data<scalar_t>();
|
||||
const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
|
||||
scalar_t *top_data = out.data<scalar_t>();
|
||||
scalar_t *top_count_data = top_count.data<scalar_t>();
|
||||
data.scalar_type(), "deformable_psroi_pool_forward", ([&] {
|
||||
const scalar_t *bottom_data = data.data_ptr<scalar_t>();
|
||||
const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
|
||||
const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
|
||||
scalar_t *top_data = out.data_ptr<scalar_t>();
|
||||
scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
|
||||
|
||||
DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
|
||||
count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width,
|
||||
|
@ -341,14 +341,14 @@ void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
|
|||
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
|
||||
|
||||
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
|
||||
out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] {
|
||||
const scalar_t *top_diff = out_grad.data<scalar_t>();
|
||||
const scalar_t *bottom_data = data.data<scalar_t>();
|
||||
const scalar_t *bottom_rois = bbox.data<scalar_t>();
|
||||
const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
|
||||
scalar_t *bottom_data_diff = in_grad.data<scalar_t>();
|
||||
scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data<scalar_t>();
|
||||
const scalar_t *top_count_data = top_count.data<scalar_t>();
|
||||
out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] {
|
||||
const scalar_t *top_diff = out_grad.data_ptr<scalar_t>();
|
||||
const scalar_t *bottom_data = data.data_ptr<scalar_t>();
|
||||
const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
|
||||
const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
|
||||
scalar_t *bottom_data_diff = in_grad.data_ptr<scalar_t>();
|
||||
scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data_ptr<scalar_t>();
|
||||
const scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
|
||||
|
||||
DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
|
||||
count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,
|
||||
|
|
|
@ -69,7 +69,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
|
|||
// boxes is a N x 5 tensor
|
||||
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
|
||||
using scalar_t = float;
|
||||
AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
|
||||
AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
|
||||
auto scores = boxes.select(1, 4);
|
||||
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
|
||||
auto boxes_sorted = boxes.index_select(0, order_t);
|
||||
|
@ -78,7 +78,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
|
|||
|
||||
const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
|
||||
|
||||
scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
|
||||
scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
|
||||
|
||||
THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
|
||||
|
||||
|
@ -106,7 +106,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
|
|||
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
|
||||
|
||||
at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
|
||||
int64_t* keep_out = keep.data<int64_t>();
|
||||
int64_t* keep_out = keep.data_ptr<int64_t>();
|
||||
|
||||
int num_to_keep = 0;
|
||||
for (int i = 0; i < boxes_num; i++) {
|
||||
|
|
|
@ -27,7 +27,7 @@ int deform_conv_forward(
|
|||
int deformable_group,
|
||||
int im2col_step)
|
||||
{
|
||||
if (input.type().is_cuda()) {
|
||||
if (input.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return deform_conv_forward_cuda(
|
||||
input, weight, offset, output, columns, ones,
|
||||
|
@ -62,7 +62,7 @@ int deform_conv_backward_input(
|
|||
int deformable_group,
|
||||
int im2col_step)
|
||||
{
|
||||
if (input.type().is_cuda()) {
|
||||
if (input.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return deform_conv_backward_input_cuda(
|
||||
input, offset, gradOutput, gradInput, gradOffset, weight, columns,
|
||||
|
@ -97,7 +97,7 @@ int deform_conv_backward_parameters(
|
|||
float scale,
|
||||
int im2col_step)
|
||||
{
|
||||
if (input.type().is_cuda()) {
|
||||
if (input.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return deform_conv_backward_parameters_cuda(
|
||||
input, offset, gradOutput, gradWeight, columns, ones,
|
||||
|
@ -133,7 +133,7 @@ void modulated_deform_conv_forward(
|
|||
const int deformable_group,
|
||||
const bool with_bias)
|
||||
{
|
||||
if (input.type().is_cuda()) {
|
||||
if (input.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return modulated_deform_conv_cuda_forward(
|
||||
input, weight, bias, ones, offset, mask, output, columns,
|
||||
|
@ -175,7 +175,7 @@ void modulated_deform_conv_backward(
|
|||
int deformable_group,
|
||||
const bool with_bias)
|
||||
{
|
||||
if (input.type().is_cuda()) {
|
||||
if (input.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return modulated_deform_conv_cuda_backward(
|
||||
input, weight, bias, ones, offset, mask, columns,
|
||||
|
|
|
@ -23,7 +23,7 @@ void deform_psroi_pooling_forward(
|
|||
const int sample_per_part,
|
||||
const float trans_std)
|
||||
{
|
||||
if (input.type().is_cuda()) {
|
||||
if (input.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return deform_psroi_pooling_cuda_forward(
|
||||
input, bbox, trans, out, top_count,
|
||||
|
@ -55,7 +55,7 @@ void deform_psroi_pooling_backward(
|
|||
const int sample_per_part,
|
||||
const float trans_std)
|
||||
{
|
||||
if (input.type().is_cuda()) {
|
||||
if (input.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
return deform_psroi_pooling_cuda_backward(
|
||||
out_grad, input, bbox, trans, top_count, input_grad, trans_grad,
|
||||
|
|
|
@ -11,7 +11,7 @@ at::Tensor nms(const at::Tensor& dets,
|
|||
const at::Tensor& scores,
|
||||
const float threshold) {
|
||||
|
||||
if (dets.type().is_cuda()) {
|
||||
if (dets.device().is_cuda()) {
|
||||
#ifdef WITH_CUDA
|
||||
// TODO raise error if not compiled with CUDA
|
||||
if (dets.numel() == 0)
|
||||
|
|
|
@ -14,7 +14,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
|||
m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward");
|
||||
m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward");
|
||||
m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward");
|
||||
// dcn-v2
|
||||
m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
|
||||
m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input");
|
||||
m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters");
|
||||
|
@ -22,4 +21,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
|||
m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward");
|
||||
m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward");
|
||||
m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward");
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -6,6 +6,8 @@ from .voc import voc_evaluation
 from .cityscapes import abs_cityscapes_evaluation
 from .sg import sg_evaluation
 from .openimages_vrd import openimages_vrd_evaluation
+from .vg import vg_evaluation
+

 def evaluate(dataset, predictions, output_folder, **kwargs):
     """evaluate dataset using different methods based on dataset type.
@@ -28,7 +30,10 @@ def evaluate(dataset, predictions, output_folder, **kwargs):
     elif isinstance(dataset, datasets.OpenImagesVRDTSVDataset):
         return openimages_vrd_evaluation(**args)
     elif isinstance(dataset, datasets.VGTSVDataset):
-        return sg_evaluation(**args)
+        if 'sg_eval' in args and args['sg_eval']:
+            return sg_evaluation(**args)
+        else:
+            return vg_evaluation(**args)
     elif isinstance(dataset, datasets.AbstractDataset):
         return abs_cityscapes_evaluation(**args)
     else:
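With the dispatch above, a `VGTSVDataset` only runs the scene-graph metrics when the caller passes an `sg_eval` flag; otherwise it takes the new `vg_evaluation` path. A hedged sketch of a call site (the wrapper, flag values, and defaults are assumptions, not code from the commit):

```python
from maskrcnn_benchmark.data.datasets.evaluation import evaluate

def run_vg_eval(vg_dataset, predictions, output_folder, with_scene_graph=False):
    """Illustrative wrapper around the evaluate() dispatch shown above."""
    if with_scene_graph:
        # a truthy 'sg_eval' kwarg selects the sg_evaluation(...) branch
        return evaluate(dataset=vg_dataset, predictions=predictions,
                        output_folder=output_folder, sg_eval=True)
    # otherwise a VGTSVDataset is scored by the new vg_evaluation(...) branch
    return evaluate(dataset=vg_dataset, predictions=predictions,
                    output_folder=output_folder,
                    box_only=0, eval_attributes=False)
```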
@@ -120,18 +120,18 @@ def evaluate(gt_classes, gt_boxes, gt_rels,
         return (None, None)

     rel_sum = ((gt_rels.sum(1) > 0).int() + (gt_rels.sum(0) > 0).int())
-    ix_w_rel = rel_sum.nonzero().numpy().squeeze()
+    ix_w_rel = rel_sum.nonzero(as_tuple=False).numpy().squeeze()

     # label = (((gt_rel_label.sum(1) == 0).int() + (gt_rel_label.sum(0) == 0).int()) == 2)
-    # change_ix = label.nonzero()
+    # change_ix = label.nonzero(as_tuple=False)

     gt_boxes = gt_boxes.numpy()
     num_gt_boxes = gt_boxes.shape[0]
-    gt_relations = gt_rels.nonzero().numpy()
+    gt_relations = gt_rels.nonzero(as_tuple=False).numpy()
     gt_classes = gt_classes.view(-1, 1).numpy()

     gt_rels_view = gt_rels.contiguous().view(-1)
-    gt_pred_labels = gt_rels_view[gt_rels_view.nonzero().squeeze()].contiguous().view(-1, 1).numpy()
+    gt_pred_labels = gt_rels_view[gt_rels_view.nonzero(as_tuple=False).squeeze()].contiguous().view(-1, 1).numpy()

     num_gt_relations = gt_relations.shape[0]
     if num_gt_relations == 0:
@@ -0,0 +1,119 @@
import torch
from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou


# inspired from Detectron
def evaluate_box_proposals(
    predictions, dataset, thresholds=None, area="all", limit=None
):
    """Evaluate detection proposal recall metrics. This function is a much
    faster alternative to the official COCO API recall evaluation code. However,
    it produces slightly different results.
    """
    # Record max overlap value for each gt box
    # Return vector of overlap values
    areas = {
        "all": 0,
        "small": 1,
        "medium": 2,
        "large": 3,
        "96-128": 4,
        "128-256": 5,
        "256-512": 6,
        "512-inf": 7,
    }
    area_ranges = [
        [0 ** 2, 1e5 ** 2],    # all
        [0 ** 2, 32 ** 2],     # small
        [32 ** 2, 96 ** 2],    # medium
        [96 ** 2, 1e5 ** 2],   # large
        [96 ** 2, 128 ** 2],   # 96-128
        [128 ** 2, 256 ** 2],  # 128-256
        [256 ** 2, 512 ** 2],  # 256-512
        [512 ** 2, 1e5 ** 2],
    ]  # 512-inf
    assert area in areas, "Unknown area range: {}".format(area)
    area_range = area_ranges[areas[area]]
    gt_overlaps = []
    num_pos = 0

    for image_id, prediction in sorted(predictions.items()):
        img_info = dataset.get_img_info(image_id)
        image_width = img_info["width"]
        image_height = img_info["height"]
        prediction = prediction.resize((image_width, image_height))

        # deal with ground truth
        gt_boxes = dataset.get_groundtruth(image_id)
        # filter out the field "relations"
        gt_boxes = gt_boxes.copy_with_fields(['attributes', 'labels'])
        gt_areas = gt_boxes.area()

        if len(gt_boxes) == 0:
            continue

        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
        gt_boxes = gt_boxes[valid_gt_inds]

        num_pos += len(gt_boxes)

        if len(gt_boxes) == 0:
            continue

        # sort predictions in descending order
        # TODO maybe remove this and make it explicit in the documentation
        _gt_overlaps = torch.zeros(len(gt_boxes))
        if len(prediction) == 0:
            gt_overlaps.append(_gt_overlaps)
            continue
        if "objectness" in prediction.extra_fields:
            inds = prediction.get_field("objectness").sort(descending=True)[1]
        elif "scores" in prediction.extra_fields:
            inds = prediction.get_field("scores").sort(descending=True)[1]
        else:
            raise ValueError("Neither objectness nor scores is in the extra_fields!")
        prediction = prediction[inds]

        if limit is not None and len(prediction) > limit:
            prediction = prediction[:limit]

        overlaps = boxlist_iou(prediction, gt_boxes)

        for j in range(min(len(prediction), len(gt_boxes))):
            # find which proposal box maximally covers each gt box
            # and get the iou amount of coverage for each gt box
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)

            # find which gt box is 'best' covered (i.e. 'best' = most iou)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            assert gt_ovr >= 0
            # find the proposal box that covers the best covered gt box
            box_ind = argmax_overlaps[gt_ind]
            # record the iou coverage of this gt box
            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert _gt_overlaps[j] == gt_ovr
            # mark the proposal box and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1

        # append recorded iou coverage level
        gt_overlaps.append(_gt_overlaps)
    gt_overlaps = torch.cat(gt_overlaps, dim=0)
    gt_overlaps, _ = torch.sort(gt_overlaps)

    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    recalls = torch.zeros_like(thresholds)
    # compute recall for each iou threshold
    for i, t in enumerate(thresholds):
        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
    # ar = 2 * np.trapz(recalls, thresholds)
    ar = recalls.mean()
    return {
        "ar": ar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": num_pos,
    }
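The recall/AR computation above reduces to thresholding the per-ground-truth best-IoU values collected into `gt_overlaps`. A minimal sanity check of that final step, with made-up overlap values standing in for the function's internals:

```python
import torch

# hypothetical best-IoU value recorded for each ground-truth box
gt_overlaps = torch.tensor([0.92, 0.81, 0.55, 0.30])
num_pos = gt_overlaps.numel()

thresholds = torch.arange(0.5, 0.95 + 1e-5, 0.05, dtype=torch.float32)
recalls = torch.zeros_like(thresholds)
for i, t in enumerate(thresholds):
    recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)

ar = recalls.mean()  # average recall over the IoU 0.5:0.05:0.95 grid
print(recalls.tolist(), ar.item())
```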
@@ -0,0 +1,16 @@
import logging

from .vg_eval import do_vg_evaluation


def vg_evaluation(dataset, predictions, output_folder, box_only, eval_attributes, **_):
    logger = logging.getLogger("maskrcnn_benchmark.inference")
    logger.info("performing vg evaluation, ignored iou_types.")
    return do_vg_evaluation(
        dataset=dataset,
        predictions=predictions,
        output_folder=output_folder,
        box_only=box_only,
        eval_attributes=eval_attributes,
        logger=logger,
    )
@ -0,0 +1,391 @@
|
|||
# A modification version from chainercv repository.
|
||||
# (See https://github.com/chainer/chainercv/blob/master/chainercv/evaluations/eval_detection_voc.py)
|
||||
from __future__ import division
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import torch
|
||||
from maskrcnn_benchmark.structures.bounding_box import BoxList
|
||||
from maskrcnn_benchmark.data.datasets.evaluation.utils import evaluate_box_proposals
|
||||
|
||||
|
||||
def do_vg_evaluation(dataset, predictions, output_folder, box_only, eval_attributes, logger, save_predictions=True):
|
||||
# TODO need to make the use_07_metric format available
|
||||
# for the user to choose
|
||||
# we use int for box_only. 0: False, 1: box for RPN, 2: box for object detection,
|
||||
if box_only:
|
||||
if box_only == 1:
|
||||
limits = [100, 1000]
|
||||
elif box_only == 2:
|
||||
limits = [36, 99]
|
||||
else:
|
||||
raise ValueError("box_only can be either 0/1/2, but get {0}".format(box_only))
|
||||
areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
|
||||
result = {}
|
||||
for area, suffix in areas.items():
|
||||
for limit in limits:
|
||||
logger.info("Evaluating bbox proposals@{:d}".format(limit))
|
||||
stats = evaluate_box_proposals(
|
||||
predictions, dataset, area=area, limit=limit
|
||||
)
|
||||
key_ar = "AR{}@{:d}".format(suffix, limit)
|
||||
key_num_pos = "num_pos{}@{:d}".format(suffix, limit)
|
||||
result[key_num_pos] = stats["num_pos"]
|
||||
result[key_ar] = stats["ar"].item()
|
||||
key_recalls = "Recalls{}@{:d}".format(suffix, limit)
|
||||
# result[key_recalls] = stats["recalls"]
|
||||
print(key_recalls, stats["recalls"])
|
||||
print(key_ar, "ar={:.4f}".format(result[key_ar]))
|
||||
print(key_num_pos, "num_pos={:d}".format(result[key_num_pos]))
|
||||
logger.info(result)
|
||||
logger.info(result)
|
||||
# check_expected_results(result, expected_results, expected_results_sigma_tol)
|
||||
if output_folder and save_predictions:
|
||||
if box_only == 1:
|
||||
torch.save(result, os.path.join(output_folder, "rpn_proposals.pth"))
|
||||
elif box_only == 2:
|
||||
torch.save(result, os.path.join(output_folder, "box_proposals.pth"))
|
||||
else:
|
||||
raise ValueError("box_only can be either 0/1/2, but get {0}".format(box_only))
|
||||
return {"box_proposal": result}
|
||||
|
||||
pred_boxlists = []
|
||||
gt_boxlists = []
|
||||
for image_id, prediction in sorted(predictions.items()):
|
||||
img_info = dataset.get_img_info(image_id)
|
||||
if len(prediction) == 0:
|
||||
continue
|
||||
image_width = img_info["width"]
|
||||
image_height = img_info["height"]
|
||||
prediction = prediction.resize((image_width, image_height))
|
||||
pred_boxlists.append(prediction)
|
||||
|
||||
gt_boxlist = dataset.get_groundtruth(image_id)
|
||||
gt_boxlists.append(gt_boxlist)
|
||||
if eval_attributes:
|
||||
classes = dataset.attributes
|
||||
else:
|
||||
classes = dataset.classes
|
||||
result = eval_detection_voc(
|
||||
pred_boxlists=pred_boxlists,
|
||||
gt_boxlists=gt_boxlists,
|
||||
classes=classes,
|
||||
iou_thresh=0.5,
|
||||
eval_attributes=eval_attributes,
|
||||
use_07_metric=False,
|
||||
)
|
||||
result_str = "mAP: {:.4f}\n".format(result["map"])
|
||||
for i, ap in enumerate(result["ap"]):
|
||||
# if i == 0: # skip background
|
||||
# continue
|
||||
# we skipped background in result['ap'], so we need to use i+1
|
||||
if eval_attributes:
|
||||
result_str += "{:<16}: {:.4f}\n".format(
|
||||
dataset.map_attribute_id_to_attribute_name(i+1), ap
|
||||
)
|
||||
else:
|
||||
result_str += "{:<16}: {:.4f}\n".format(
|
||||
dataset.map_class_id_to_class_name(i+1), ap
|
||||
)
|
||||
logger.info(result_str)
|
||||
# return mAP and weighted mAP
|
||||
if eval_attributes:
|
||||
if output_folder and save_predictions:
|
||||
with open(os.path.join(output_folder, "result_attr.txt"), "w") as fid:
|
||||
fid.write(result_str)
|
||||
return {"attr": {"map": result["map"], "weighted map": result["weighted map"]}}
|
||||
else:
|
||||
if output_folder and save_predictions:
|
||||
with open(os.path.join(output_folder, "result_obj.txt"), "w") as fid:
|
||||
fid.write(result_str)
|
||||
return {"obj": {"map": result["map"], "weighted map": result["weighted map"]}}
|
||||
|
||||
|
||||
def eval_detection_voc(pred_boxlists, gt_boxlists, classes, iou_thresh=0.5, eval_attributes=False, use_07_metric=False):
|
||||
"""Evaluate on voc dataset.
|
||||
Args:
|
||||
pred_boxlists(list[BoxList]): pred boxlist, has labels and scores fields.
|
||||
gt_boxlists(list[BoxList]): ground truth boxlist, has labels field.
|
||||
iou_thresh: iou thresh
|
||||
use_07_metric: boolean
|
||||
Returns:
|
||||
dict represents the results
|
||||
"""
|
||||
assert len(gt_boxlists) == len(
|
||||
pred_boxlists
|
||||
), "Length of gt and pred lists need to be same."
|
||||
|
||||
aps = []
|
||||
nposs = []
|
||||
thresh = []
|
||||
|
||||
for i, classname in enumerate(classes):
|
||||
if classname == "__background__" or classname == "__no_attribute__":
|
||||
continue
|
||||
rec, prec, ap, scores, npos = calc_detection_voc_prec_rec(pred_boxlists=pred_boxlists, gt_boxlists=gt_boxlists, \
|
||||
classindex=i, iou_thresh=iou_thresh,
|
||||
eval_attributes=eval_attributes,
|
||||
use_07_metric=use_07_metric)
|
||||
# Determine per class detection thresholds that maximise f score
|
||||
# if npos > 1:
|
||||
if npos > 1 and not isinstance(scores, int):
|
||||
f = np.nan_to_num((prec * rec) / (prec + rec))
|
||||
thresh += [scores[np.argmax(f)]]
|
||||
else:
|
||||
thresh += [0]
|
||||
aps += [ap]
|
||||
nposs += [float(npos)]
|
||||
print('AP for {} = {:.4f} (npos={:,})'.format(classname, ap, npos))
|
||||
# if pickle:
|
||||
# with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f:
|
||||
# cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap,
|
||||
# 'scores': scores, 'npos':npos}, f)
|
||||
|
||||
# Set thresh to mean for classes with poor results
|
||||
thresh = np.array(thresh)
|
||||
avg_thresh = np.mean(thresh[thresh != 0])
|
||||
thresh[thresh == 0] = avg_thresh
|
||||
# if eval_attributes:
|
||||
# filename = 'attribute_thresholds_' + self._image_set + '.txt'
|
||||
# else:
|
||||
# filename = 'object_thresholds_' + self._image_set + '.txt'
|
||||
# path = os.path.join(output_dir, filename)
|
||||
# with open(path, 'wt') as f:
|
||||
# for i, cls in enumerate(classes[1:]):
|
||||
# f.write('{:s} {:.3f}\n'.format(cls, thresh[i]))
|
||||
|
||||
weights = np.array(nposs)
|
||||
weights /= weights.sum()
|
||||
print('Mean AP = {:.4f}'.format(np.mean(aps)))
|
||||
print('Weighted Mean AP = {:.4f}'.format(np.average(aps, weights=weights)))
|
||||
print('Mean Detection Threshold = {:.3f}'.format(avg_thresh))
|
||||
print('~~~~~~~~')
|
||||
print('Results:')
|
||||
for ap, npos in zip(aps, nposs):
|
||||
print('{:.3f}\t{:.3f}'.format(ap, npos))
|
||||
print('{:.3f}'.format(np.mean(aps)))
|
||||
print('~~~~~~~~')
|
||||
print('')
|
||||
print('--------------------------------------------------------------')
|
||||
print('Results computed with the **unofficial** PASCAL VOC Python eval code.')
|
||||
print('--------------------------------------------------------------')
|
||||
|
||||
# pdb.set_trace()
|
||||
return {"ap": aps, "map": np.mean(aps), "weighted map": np.average(aps, weights=weights)}
|
||||
|
||||
|
||||
def calc_detection_voc_prec_rec(pred_boxlists, gt_boxlists, classindex, iou_thresh=0.5, eval_attributes=False,
|
||||
use_07_metric=False):
|
||||
"""Calculate precision and recall based on evaluation code of PASCAL VOC.
|
||||
This function calculates precision and recall of
|
||||
predicted bounding boxes obtained from a dataset which has :math:`N`
|
||||
images.
|
||||
The code is based on the evaluation code used in PASCAL VOC Challenge.
|
||||
"""
|
||||
class_recs = {}
|
||||
npos = 0
|
||||
image_ids = []
|
||||
confidence = []
|
||||
BB = []
|
||||
for image_index, (gt_boxlist, pred_boxlist) in enumerate(zip(gt_boxlists, pred_boxlists)):
|
||||
pred_bbox = pred_boxlist.bbox.numpy()
|
||||
gt_bbox = gt_boxlist.bbox.numpy()
|
||||
if eval_attributes:
|
||||
gt_label = gt_boxlist.get_field("attributes").numpy()
|
||||
pred_label = pred_boxlist.get_field("attr_labels").numpy()
|
||||
pred_score = pred_boxlist.get_field("attr_scores").numpy()
|
||||
else:
|
||||
gt_label = gt_boxlist.get_field("labels").numpy()
|
||||
pred_label = pred_boxlist.get_field("labels").numpy()
|
||||
pred_score = pred_boxlist.get_field("scores").numpy()
|
||||
|
||||
# get the ground truth information for this class
|
||||
if eval_attributes:
|
||||
gt_mask_l = np.array([classindex in i for i in gt_label])
|
||||
else:
|
||||
gt_mask_l = gt_label == classindex
|
||||
gt_bbox_l = gt_bbox[gt_mask_l]
|
||||
gt_difficult_l = np.zeros(gt_bbox_l.shape[0], dtype=bool)
|
||||
det = [False] * gt_bbox_l.shape[0]
|
||||
npos = npos + sum(~gt_difficult_l)
|
||||
class_recs[image_index] = {'bbox': gt_bbox_l,
|
||||
'difficult': gt_difficult_l,
|
||||
'det': det}
|
||||
|
||||
# prediction output for each class
|
||||
# pdb.set_trace()
|
||||
if eval_attributes:
|
||||
pred_mask_l = np.logical_and(pred_label == classindex, np.not_equal(pred_score, 0.0)).nonzero()
|
||||
pred_bbox_l = pred_bbox[pred_mask_l[0]]
|
||||
pred_score_l = pred_score[pred_mask_l]
|
||||
else:
|
||||
pred_mask_l = pred_label == classindex
|
||||
pred_bbox_l = pred_bbox[pred_mask_l]
|
||||
pred_score_l = pred_score[pred_mask_l]
|
||||
|
||||
for bbox_tmp, score_tmp in zip(pred_bbox_l, pred_score_l):
|
||||
image_ids.append(image_index)
|
||||
confidence.append(float(score_tmp))
|
||||
BB.append([float(z) for z in bbox_tmp])
|
||||
|
||||
if npos == 0:
|
||||
# No ground truth examples
|
||||
return 0, 0, 0, 0, npos
|
||||
|
||||
if len(confidence) == 0:
|
||||
# No detection examples
|
||||
return 0, 0, 0, 0, npos
|
||||
|
||||
confidence = np.array(confidence)
|
||||
BB = np.array(BB)
|
||||
|
||||
# sort by confidence
|
||||
sorted_ind = np.argsort(-confidence)
|
||||
sorted_scores = -np.sort(-confidence)
|
||||
BB = BB[sorted_ind, :]
|
||||
image_ids = [image_ids[x] for x in sorted_ind]
|
||||
|
||||
# go down dets and mark TPs and FPs
|
||||
nd = len(image_ids)
|
||||
tp = np.zeros(nd)
|
||||
fp = np.zeros(nd)
|
||||
|
||||
for d in range(nd):
|
||||
R = class_recs[image_ids[d]]
|
||||
bb = BB[d, :].astype(float)
|
||||
ovmax = -np.inf
|
||||
BBGT = R['bbox'].astype(float)
|
||||
|
||||
if BBGT.size > 0:
|
||||
# compute overlaps
|
||||
# intersection
|
||||
ixmin = np.maximum(BBGT[:, 0], bb[0])
|
||||
iymin = np.maximum(BBGT[:, 1], bb[1])
|
||||
ixmax = np.minimum(BBGT[:, 2], bb[2])
|
||||
iymax = np.minimum(BBGT[:, 3], bb[3])
|
||||
iw = np.maximum(ixmax - ixmin + 1., 0.)
|
||||
ih = np.maximum(iymax - iymin + 1., 0.)
|
||||
inters = iw * ih
|
||||
|
||||
# union
|
||||
uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
|
||||
(BBGT[:, 2] - BBGT[:, 0] + 1.) *
|
||||
(BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
|
||||
|
||||
overlaps = inters / uni
|
||||
ovmax = np.max(overlaps)
|
||||
jmax = np.argmax(overlaps)
|
||||
|
||||
if ovmax > iou_thresh:
|
||||
if not R['difficult'][jmax]:
|
||||
if not R['det'][jmax]:
|
||||
tp[d] = 1.
|
||||
R['det'][jmax] = 1
|
||||
else:
|
||||
fp[d] = 1.
|
||||
else:
|
||||
fp[d] = 1.
|
||||
|
||||
# compute precision recall
|
||||
fp = np.cumsum(fp)
|
||||
tp = np.cumsum(tp)
|
||||
rec = tp / float(npos)
|
||||
# avoid divide by zero in case the first detection matches a difficult
|
||||
# ground truth
|
||||
prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
|
||||
ap = voc_ap(rec, prec, use_07_metric)
|
||||
|
||||
return rec, prec, ap, sorted_scores, npos
|
||||
|
||||
|
||||
def voc_ap(rec, prec, use_07_metric=False):
|
||||
""" ap = voc_ap(rec, prec, [use_07_metric])
|
||||
Compute VOC AP given precision and recall.
|
||||
If use_07_metric is true, uses the
|
||||
VOC 07 11 point method (default:False).
|
||||
"""
|
||||
if use_07_metric:
|
||||
# 11 point metric
|
||||
ap = 0.
|
||||
for t in np.arange(0., 1.1, 0.1):
|
||||
if np.sum(rec >= t) == 0:
|
||||
p = 0
|
||||
else:
|
||||
p = np.max(prec[rec >= t])
|
||||
ap = ap + p / 11.
|
||||
else:
|
||||
# correct AP calculation
|
||||
# first append sentinel values at the end
|
||||
mrec = np.concatenate(([0.], rec, [1.]))
|
||||
mpre = np.concatenate(([0.], prec, [0.]))
|
||||
|
||||
# compute the precision envelope
|
||||
for i in range(mpre.size - 1, 0, -1):
|
||||
mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
|
||||
|
||||
# to calculate area under PR curve, look for points
|
||||
# where X axis (recall) changes value
|
||||
i = np.where(mrec[1:] != mrec[:-1])[0]
|
||||
|
||||
# and sum (\Delta recall) * prec
|
||||
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
|
||||
return ap
|
||||
|
||||
|
||||
def calc_detection_voc_ap(prec, rec, use_07_metric=False):
|
||||
"""Calculate average precisions based on evaluation code of PASCAL VOC.
|
||||
This function calculates average precisions
|
||||
from given precisions and recalls.
|
||||
The code is based on the evaluation code used in PASCAL VOC Challenge.
|
||||
Args:
|
||||
prec (list of numpy.array): A list of arrays.
|
||||
:obj:`prec[l]` indicates precision for class :math:`l`.
|
||||
If :obj:`prec[l]` is :obj:`None`, this function returns
|
||||
:obj:`numpy.nan` for class :math:`l`.
|
||||
rec (list of numpy.array): A list of arrays.
|
||||
:obj:`rec[l]` indicates recall for class :math:`l`.
|
||||
If :obj:`rec[l]` is :obj:`None`, this function returns
|
||||
:obj:`numpy.nan` for class :math:`l`.
|
||||
use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
|
||||
for calculating average precision. The default value is
|
||||
:obj:`False`.
|
||||
Returns:
|
||||
~numpy.ndarray:
|
||||
This function returns an array of average precisions.
|
||||
The :math:`l`-th value corresponds to the average precision
|
||||
for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is
|
||||
:obj:`None`, the corresponding value is set to :obj:`numpy.nan`.
|
||||
"""
|
||||
|
||||
n_fg_class = len(prec)
|
||||
ap = np.empty(n_fg_class)
|
||||
for l in range(n_fg_class):
|
||||
if prec[l] is None or rec[l] is None:
|
||||
ap[l] = np.nan
|
||||
continue
|
||||
|
||||
if use_07_metric:
|
||||
# 11 point metric
|
||||
ap[l] = 0
|
||||
for t in np.arange(0.0, 1.1, 0.1):
|
||||
if np.sum(rec[l] >= t) == 0:
|
||||
p = 0
|
||||
else:
|
||||
p = np.max(np.nan_to_num(prec[l])[rec[l] >= t])
|
||||
ap[l] += p / 11
|
||||
else:
|
||||
# correct AP calculation
|
||||
# first append sentinel values at the end
|
||||
mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0]))
|
||||
mrec = np.concatenate(([0], rec[l], [1]))
|
||||
|
||||
mpre = np.maximum.accumulate(mpre[::-1])[::-1]
|
||||
|
||||
# to calculate area under PR curve, look for points
|
||||
# where X axis (recall) changes value
|
||||
i = np.where(mrec[1:] != mrec[:-1])[0]
|
||||
|
||||
# and sum (\Delta recall) * prec
|
||||
ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
|
||||
|
||||
return ap
|
|
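As a small sanity check of the interpolated AP computation above (the non-07 branch of `voc_ap`), here is a hand-checkable toy case; the function body is a condensed copy of the code above, and the precision/recall arrays are made up:

```python
import numpy as np

def voc_ap_interp(rec, prec):
    # same logic as voc_ap above with use_07_metric=False:
    # append sentinels, take the precision envelope, then sum (delta recall) * precision
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    i = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])

rec = np.array([0.5, 1.0])
prec = np.array([1.0, 0.5])
print(voc_ap_interp(rec, prec))  # 0.5*1.0 + 0.5*0.5 = 0.75
```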
@ -11,6 +11,11 @@ from .utils.label_loader import LabelLoader
|
|||
from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist
|
||||
|
||||
|
||||
def sort_key_by_val(dic):
|
||||
sorted_dic = sorted(dic.items(), key=lambda kv: kv[1])
|
||||
return [kv[0] for kv in sorted_dic]
|
||||
|
||||
|
||||
class RelationTSVDataset(TSVYamlDataset):
|
||||
"""
|
||||
Generic TSV dataset format for Object Detection.
|
||||
|
@ -28,7 +33,7 @@ class RelationTSVDataset(TSVYamlDataset):
|
|||
self.contrastive_loss_on = kwargs['args'].MODEL.ROI_RELATION_HEAD.CONTRASTIVE_LOSS.USE_FLAG if kwargs['args'] is not None else False
|
||||
|
||||
# construct maps
|
||||
jsondict_file = find_file_path_in_yaml(self.cfg.get("labelmap", None), self.root)
|
||||
jsondict_file = find_file_path_in_yaml(self.cfg.get("labelmap", self.cfg.get("jsondict", None)), self.root) # previous version use jsondict
|
||||
jsondict = json.load(open(jsondict_file, 'r'))
|
||||
|
||||
self.labelmap = {}
|
||||
|
@ -37,18 +42,21 @@ class RelationTSVDataset(TSVYamlDataset):
|
|||
self.class_to_ind['__background__'] = 0
|
||||
self.ind_to_class = {v:k for k, v in self.class_to_ind.items()}
|
||||
self.labelmap['class_to_ind'] = self.class_to_ind
|
||||
self.classes = sort_key_by_val(self.class_to_ind)
|
||||
|
||||
if self.attribute_on:
|
||||
self.attribute_to_ind = jsondict['attribute_to_idx']
|
||||
self.attribute_to_ind['__no_attribute__'] = 0
|
||||
self.ind_to_attribute = {v:k for k, v in self.attribute_to_ind.items()}
|
||||
self.labelmap['attribute_to_ind'] = self.attribute_to_ind
|
||||
self.attributes = sort_key_by_val(self.attribute_to_ind)
|
||||
|
||||
if self.relation_on:
|
||||
self.relation_to_ind = jsondict['predicate_to_idx']
|
||||
self.relation_to_ind['__no_relation__'] = 0
|
||||
self.ind_to_relation = {v:k for k, v in self.relation_to_ind.items()}
|
||||
self.labelmap['relation_to_ind'] = self.relation_to_ind
|
||||
self.relations = sort_key_by_val(self.relation_to_ind)
|
||||
|
||||
if self.is_load_label or self.detector_pre_calculated:
|
||||
self.label_loader = LabelLoader(
|
||||
|
|
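A tiny illustration of how the `classes` / `attributes` / `relations` lists above are derived from the `*_to_ind` maps with `sort_key_by_val`; the function is copied from the hunk above and the labelmap dictionary is made up:

```python
def sort_key_by_val(dic):
    sorted_dic = sorted(dic.items(), key=lambda kv: kv[1])
    return [kv[0] for kv in sorted_dic]

# toy labelmap in the same shape as jsondict['label_to_idx'] (illustrative values)
class_to_ind = {'person': 1, 'dog': 2, '__background__': 0}
print(sort_key_by_val(class_to_ind))
# ['__background__', 'person', 'dog']  -> classes[i] recovers the name for class index i
```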
|
@ -237,7 +237,29 @@ def inference(
|
|||
total_timer = Timer()
|
||||
inference_timer = Timer()
|
||||
total_timer.tic()
|
||||
predictions = compute_on_dataset(model, data_loader, device, bbox_aug, inference_timer)
|
||||
|
||||
output_pth_name = 'predictions_forcebox.pth' if eval_attributes else 'predictions.pth'
|
||||
if output_folder and os.path.isfile(os.path.join(output_folder, output_pth_name)):
|
||||
logger.info("Predictions.pth file exist in {}, skip computation".format(
|
||||
os.path.join(output_folder, output_pth_name)))
|
||||
if not is_main_process():
|
||||
return
|
||||
if cfg.TEST.SAVE_RESULTS_TO_TSV or not cfg.TEST.SKIP_PERFORMANCE_EVAL:
|
||||
predictions = torch.load(os.path.join(output_folder, output_pth_name))
|
||||
else:
|
||||
if eval_attributes:
|
||||
# change to force_boxes=True mode
|
||||
force_boxes_model = model.force_boxes
|
||||
force_boxes_box = model.roi_heads.box.post_processor.force_boxes
|
||||
model.force_boxes = True
|
||||
model.roi_heads.box.post_processor.force_boxes = True
|
||||
predictions = compute_on_dataset(model, data_loader, device, bbox_aug,
|
||||
inference_timer)
|
||||
# return to the original state
|
||||
model.force_boxes = force_boxes_model
|
||||
model.roi_heads.box.post_processor.force_boxes = force_boxes_box
|
||||
else:
|
||||
predictions = compute_on_dataset(model, data_loader, device, bbox_aug, inference_timer)
|
||||
# wait for all processes to complete before measuring the time
|
||||
synchronize()
|
||||
total_time = total_timer.toc()
|
||||
|
@ -262,7 +284,7 @@ def inference(
|
|||
return
|
||||
|
||||
if output_folder and save_predictions:
|
||||
torch.save(predictions, os.path.join(output_folder, "predictions.pth"))
|
||||
torch.save(predictions, os.path.join(output_folder, output_pth_name))
|
||||
|
||||
if output_folder and cfg.TEST.SAVE_RESULTS_TO_TSV:
|
||||
logger.info("Convert prediction results to tsv format and save.")
|
||||
|
@ -281,11 +303,16 @@ def inference(
|
|||
|
||||
extra_args = dict(
|
||||
box_only=box_only,
|
||||
eval_attributes=eval_attributes,
|
||||
iou_types=iou_types,
|
||||
expected_results=expected_results,
|
||||
expected_results_sigma_tol=expected_results_sigma_tol,
|
||||
save_predictions=save_predictions
|
||||
)
|
||||
if hasattr(cfg.MODEL, 'RELATION_ON'):
|
||||
extra_args['sg_eval'] = cfg.MODEL.RELATION_ON
|
||||
else:
|
||||
extra_args['sg_eval'] = False
|
||||
|
||||
return evaluate(dataset=dataset,
|
||||
predictions=predictions,
|
||||
|
|
|
@@ -13,8 +13,8 @@ from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.utils.comm import get_world_size, synchronize
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
from maskrcnn_benchmark.engine.inference import inference
from maskrcnn_benchmark.utils.amp import autocast, GradScaler

from apex import amp

def reduce_loss_dict(loss_dict):
    """

@@ -63,6 +63,9 @@ def do_train(
    start_training_time = time.time()
    end = time.time()

    if cfg.SOLVER.USE_AMP:
        scaler = GradScaler()

    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm",)

@@ -84,7 +87,11 @@ def do_train(
        images = images.to(device)
        # targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)
        if cfg.SOLVER.USE_AMP:
            with autocast():
                loss_dict = model(images, targets)
        else:
            loss_dict = model(images, targets)

        # take care of additional metric besides loss returned from model
        if type(loss_dict) == tuple:

@@ -101,12 +108,13 @@ def do_train(
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        # # Note: If mixed precision is not used, this ends up doing nothing
        # # Otherwise apply loss scaling for mixed-precision recipe
        # with amp.scale_loss(losses, optimizer) as scaled_losses:
        #     scaled_losses.backward()
        losses.backward()
        optimizer.step()
        if cfg.SOLVER.USE_AMP:
            scaler.scale(losses).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            losses.backward()
            optimizer.step()
        scheduler.step()

        batch_time = time.time() - end

@@ -116,7 +124,7 @@ def do_train(
        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 1 == 0 or iteration == max_iter:
        if iteration % cfg.LOG_LOSS_PERIOD == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join(
                    [
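The AMP path above wraps the forward pass in `autocast()` and routes backward/step through a `GradScaler`. A minimal, self-contained sketch of that pattern using `torch.cuda.amp` directly (it is assumed here that the project's `maskrcnn_benchmark.utils.amp` wrapper re-exports these names; the model, optimizer, and data below are placeholders and a CUDA device is required):

```python
import torch
from torch.cuda.amp import autocast, GradScaler

model = torch.nn.Linear(8, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = GradScaler()

for _ in range(10):
    x = torch.randn(4, 8, device="cuda")
    y = torch.randint(0, 2, (4,), device="cuda")

    optimizer.zero_grad()
    with autocast():                  # run the forward pass in mixed precision
        loss = torch.nn.functional.cross_entropy(model(x), y)
    scaler.scale(loss).backward()     # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)            # unscales gradients, then calls optimizer.step()
    scaler.update()                   # adjust the scale factor for the next iteration
```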
@ -2,6 +2,9 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
|
||||
import torch.distributed as dist
|
||||
import maskrcnn_benchmark.utils.comm as comm
|
||||
from torch.autograd.function import Function
|
||||
|
||||
class FrozenBatchNorm2d(nn.Module):
|
||||
"""
|
||||
|
@ -17,15 +20,98 @@ class FrozenBatchNorm2d(nn.Module):
|
|||
self.register_buffer("running_var", torch.ones(n))
|
||||
|
||||
def forward(self, x):
|
||||
# Cast all fixed parameters to half() if necessary
|
||||
if x.dtype == torch.float16:
|
||||
self.weight = self.weight.half()
|
||||
self.bias = self.bias.half()
|
||||
self.running_mean = self.running_mean.half()
|
||||
self.running_var = self.running_var.half()
|
||||
|
||||
scale = self.weight * self.running_var.rsqrt()
|
||||
bias = self.bias - self.running_mean * scale
|
||||
scale = scale.reshape(1, -1, 1, 1)
|
||||
bias = bias.reshape(1, -1, 1, 1)
|
||||
return x * scale + bias
|
||||
|
||||
|
||||
class AllReduce(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input):
|
||||
input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())]
|
||||
# Use allgather instead of allreduce since I don't trust in-place operations ..
|
||||
dist.all_gather(input_list, input, async_op=False)
|
||||
inputs = torch.stack(input_list, dim=0)
|
||||
return torch.sum(inputs, dim=0)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
dist.all_reduce(grad_output, async_op=False)
|
||||
return grad_output
|
||||
|
||||
|
||||
class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
|
||||
"""
|
||||
In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient
|
||||
when the batch size on each worker is different.
|
||||
(e.g., when scale augmentation is used, or when it is applied to mask head).
|
||||
|
||||
This is a slower but correct alternative to `nn.SyncBatchNorm`.
|
||||
|
||||
Note:
|
||||
There isn't a single definition of Sync BatchNorm.
|
||||
|
||||
When ``stats_mode==""``, this module computes overall statistics by using
|
||||
statistics of each worker with equal weight. The result is true statistics
|
||||
of all samples (as if they are all on one worker) only when all workers
|
||||
have the same (N, H, W). This mode does not support inputs with zero batch size.
|
||||
|
||||
When ``stats_mode=="N"``, this module computes overall statistics by weighting
|
||||
the statistics of each worker by their ``N``. The result is true statistics
|
||||
of all samples (as if they are all on one worker) only when all workers
|
||||
have the same (H, W). It is slower than ``stats_mode==""``.
|
||||
|
||||
Even though the result of this module may not be the true statistics of all samples,
|
||||
it may still be reasonable because it might be preferable to assign equal weights
|
||||
to all workers, regardless of their (H, W) dimension, instead of putting larger weight
|
||||
on larger images. From preliminary experiments, little difference is found between such
|
||||
a simplified implementation and an accurate computation of overall mean & variance.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, stats_mode="", **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert stats_mode in ["", "N"]
|
||||
self._stats_mode = stats_mode
|
||||
|
||||
def forward(self, input):
|
||||
if comm.get_world_size() == 1 or not self.training:
|
||||
return super().forward(input)
|
||||
|
||||
B, C = input.shape[0], input.shape[1]
|
||||
|
||||
mean = torch.mean(input, dim=[0, 2, 3])
|
||||
meansqr = torch.mean(input * input, dim=[0, 2, 3])
|
||||
|
||||
if self._stats_mode == "":
|
||||
assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
|
||||
vec = torch.cat([mean, meansqr], dim=0)
|
||||
vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
|
||||
mean, meansqr = torch.split(vec, C)
|
||||
momentum = self.momentum
|
||||
else:
|
||||
if B == 0:
|
||||
vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
|
||||
vec = vec + input.sum() # make sure there is gradient w.r.t input
|
||||
else:
|
||||
vec = torch.cat(
|
||||
[mean, meansqr, torch.ones([1], device=mean.device, dtype=mean.dtype)], dim=0
|
||||
)
|
||||
vec = AllReduce.apply(vec * B)
|
||||
|
||||
total_batch = vec[-1].detach()
|
||||
momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0
|
||||
total_batch = torch.max(total_batch, torch.ones_like(total_batch)) # avoid div-by-zero
|
||||
mean, meansqr, _ = torch.split(vec / total_batch, C)
|
||||
|
||||
var = meansqr - mean * mean
|
||||
invstd = torch.rsqrt(var + self.eps)
|
||||
scale = self.weight * invstd
|
||||
bias = self.bias - mean * scale
|
||||
scale = scale.reshape(1, -1, 1, 1)
|
||||
bias = bias.reshape(1, -1, 1, 1)
|
||||
|
||||
self.running_mean += momentum * (mean.detach() - self.running_mean)
|
||||
self.running_var += momentum * (var.detach() - self.running_var)
|
||||
return input * scale + bias
|
|
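`FrozenBatchNorm2d` above folds its fixed batch-norm statistics into a per-channel affine transform. A quick equivalence check against the explicit normalization formula; note that the module, as written, applies no epsilon, so the manual computation below omits it too (all values are random placeholders):

```python
import torch

n, c = 2, 3
x = torch.randn(n, c, 4, 4)

weight = torch.rand(c) + 0.5
bias = torch.randn(c)
running_mean = torch.randn(c)
running_var = torch.rand(c) + 0.1

# folded form used by FrozenBatchNorm2d.forward
scale = (weight * running_var.rsqrt()).reshape(1, -1, 1, 1)
shift = (bias - running_mean * weight * running_var.rsqrt()).reshape(1, -1, 1, 1)
y_folded = x * scale + shift

# explicit normalization, eps = 0 to match the module
y_explicit = (x - running_mean.reshape(1, -1, 1, 1)) / running_var.sqrt().reshape(1, -1, 1, 1) \
    * weight.reshape(1, -1, 1, 1) + bias.reshape(1, -1, 1, 1)

print(torch.allclose(y_folded, y_explicit, atol=1e-6))  # True
```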
@@ -1,12 +1,12 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# from ._utils import _C
from maskrcnn_benchmark import _C

# from apex import amp
try:
    import torchvision
    from torchvision.ops import nms
except:
    nms = _C.nms

# Only valid with fp32 inputs - give AMP the hint
# nms = amp.float_function(_C.nms)
nms = _C.nms

# nms.__doc__ = """
# This function performs Non-maximum suppression"""
@ -7,8 +7,6 @@ from torch.nn.modules.utils import _pair
|
|||
|
||||
from maskrcnn_benchmark import _C
|
||||
|
||||
# from apex import amp
|
||||
|
||||
class _ROIAlign(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
|
||||
|
@ -44,8 +42,11 @@ class _ROIAlign(Function):
|
|||
)
|
||||
return grad_input, None, None, None, None
|
||||
|
||||
|
||||
roi_align = _ROIAlign.apply
|
||||
try:
|
||||
import torchvision
|
||||
from torchvision.ops import roi_align
|
||||
except:
|
||||
roi_align = _ROIAlign.apply
|
||||
|
||||
class ROIAlign(nn.Module):
|
||||
def __init__(self, output_size, spatial_scale, sampling_ratio):
|
||||
|
@ -54,7 +55,6 @@ class ROIAlign(nn.Module):
|
|||
self.spatial_scale = spatial_scale
|
||||
self.sampling_ratio = sampling_ratio
|
||||
|
||||
# @amp.float_function
|
||||
def forward(self, input, rois):
|
||||
return roi_align(
|
||||
input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
|
||||
|
|
|
@ -7,7 +7,6 @@ from torch.nn.modules.utils import _pair
|
|||
|
||||
from maskrcnn_benchmark import _C
|
||||
|
||||
# from apex import amp
|
||||
|
||||
class _ROIPool(Function):
|
||||
@staticmethod
|
||||
|
@ -53,7 +52,6 @@ class ROIPool(nn.Module):
|
|||
self.output_size = output_size
|
||||
self.spatial_scale = spatial_scale
|
||||
|
||||
# @amp.float_function
|
||||
def forward(self, input, rois):
|
||||
return roi_pool(input, rois, self.output_size, self.spatial_scale)
|
||||
|
||||
|
|
|
@ -57,7 +57,6 @@ class SigmoidFocalLoss(nn.Module):
|
|||
self.alpha = alpha
|
||||
|
||||
def forward(self, logits, targets):
|
||||
device = logits.device
|
||||
if logits.is_cuda:
|
||||
loss_func = sigmoid_focal_loss_cuda
|
||||
else:
|
||||
|
|
|
@@ -8,6 +8,7 @@ from maskrcnn_benchmark.modeling import registry
from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform
from . import fpn as fpn_module
from . import resnet
from .msvit import build_msvit_backbone


@registry.BACKBONES.register("R-50-C4")

@@ -73,6 +74,15 @@ def build_resnet_fpn_p3p7_backbone(cfg):
    return model


@registry.BACKBONES.register("ViL-C4")
def build_vilc4_backbone(cfg):
    assert len(cfg.MODEL.TRANSFORMER.OUT_FEATURES) == 1, "The number of OUT_FEATURES in ViL-C4 is not 1!"
    body = build_msvit_backbone(cfg)
    model = nn.Sequential(OrderedDict([("body", body)]))
    model.out_channels = body.out_planes
    return model


def build_backbone(cfg):
    assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \
        "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format(
@ -0,0 +1,286 @@
|
|||
# Copyright (c) 2021 Microsoft Corporation. Licensed under the MIT license.
|
||||
# Written by Pengchuan Zhang, penzhan@microsoft.com
|
||||
import random
|
||||
import torch
|
||||
from torch import nn, einsum
|
||||
import torch.nn.functional as F
|
||||
from einops import rearrange
|
||||
from timm.models.layers import trunc_normal_
|
||||
from .slidingchunk_2d import slidingchunk_2d, mask_invalid_locations, slidingchunk_2dautograd
|
||||
|
||||
|
||||
class Long2DSCSelfAttention(nn.Module):
|
||||
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., w=7, d=1,
|
||||
autoregressive=False, sharew=False, nglo=1, only_glo=False, exact=0, autograd=False, rpe=False,
|
||||
mode=0):
|
||||
super().__init__()
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = dim // num_heads
|
||||
self.scale = qk_scale or self.head_dim ** -0.5
|
||||
self.Nglo = nglo
|
||||
self.only_glo = only_glo
|
||||
if self.only_glo:
|
||||
assert self.Nglo >= 1, "Nglo == 0 in the only global mode!"
|
||||
|
||||
self.query = nn.Linear(dim, dim, bias=qkv_bias)
|
||||
self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
|
||||
self.proj = nn.Linear(dim, dim)
|
||||
|
||||
if nglo >= 1:
|
||||
if sharew:
|
||||
self.query_global = self.query
|
||||
self.kv_global = self.kv
|
||||
self.proj_global = self.proj
|
||||
else:
|
||||
self.query_global = nn.Linear(dim, dim, bias=qkv_bias)
|
||||
self.kv_global = nn.Linear(dim, dim * 2, bias=qkv_bias)
|
||||
self.proj_global = nn.Linear(dim, dim)
|
||||
|
||||
self.attn_drop = nn.Dropout(attn_drop)
|
||||
self.proj_drop = nn.Dropout(proj_drop)
|
||||
|
||||
self.attention_window = w
|
||||
self.attention_dilation = d
|
||||
self.autoregressive = autoregressive
|
||||
|
||||
assert self.attention_dilation == 1, "Dilation is not supported!"
|
||||
assert not self.autoregressive, "Autoregressive is not supported yet!"
|
||||
self.exact = exact
|
||||
# use autograd or handgrad
|
||||
self.longform2d_mm = slidingchunk_2dautograd if autograd else slidingchunk_2d
|
||||
|
||||
# Inspired by swin transformer:
|
||||
# https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py#L88-L103
|
||||
# define parameter tables for local and global relative position bias
|
||||
self.rpe = rpe
|
||||
if rpe:
|
||||
self.local_relative_position_bias_table = nn.Parameter(
|
||||
torch.zeros((2 * 2 * w - 1) * (2 * 2 * w - 1), num_heads)) # (4*w-1, 4*w-1, nH)
|
||||
trunc_normal_(self.local_relative_position_bias_table, std=.02)
|
||||
if nglo >= 1:
|
||||
self.g2l_relative_position_bias = nn.Parameter(
|
||||
torch.zeros(2, num_heads, nglo)) # (2, nH, nglo)
|
||||
self.g2g_relative_position_bias = nn.Parameter(
|
||||
torch.zeros(num_heads, nglo, nglo)) # (nH, nglo, nglo)
|
||||
trunc_normal_(self.g2l_relative_position_bias, std=.02)
|
||||
trunc_normal_(self.g2g_relative_position_bias, std=.02)
|
||||
|
||||
# get pair-wise relative position index
|
||||
coords_h = torch.arange(-w, 2*w)
|
||||
coords_w = torch.arange(-w, 2*w)
|
||||
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, 3w, 3w
|
||||
coords_unfold = rearrange(
|
||||
coords, 'c (m x) (n y) -> c m n (x y)', x=w, y=w
|
||||
) # 2, 3, 3, 9w^2
|
||||
q_coords = coords_unfold[:, 1, 1, :] # 2, w^2
|
||||
relative_coords = torch.cat([
|
||||
# -1, -1
|
||||
q_coords[:, :, None] - coords_unfold[:, 0, 0, :][:, None, :],
|
||||
# -1, 0
|
||||
q_coords[:, :, None] - coords_unfold[:, 0, 1, :][:, None, :],
|
||||
# -1, 1
|
||||
q_coords[:, :, None] - coords_unfold[:, 0, 2, :][:, None, :],
|
||||
# 0,-1
|
||||
q_coords[:, :, None] - coords_unfold[:, 1, 0, :][:, None, :],
|
||||
# 0,0
|
||||
q_coords[:, :, None] - q_coords[:, None, :],
|
||||
# 0,1
|
||||
q_coords[:, :, None] - coords_unfold[:, 1, 2, :][:, None, :],
|
||||
# 1, -1
|
||||
q_coords[:, :, None] - coords_unfold[:, 2, 0, :][:, None, :],
|
||||
# 1, 0
|
||||
q_coords[:, :, None] - coords_unfold[:, 2, 1, :][:, None, :],
|
||||
# 1, 1
|
||||
q_coords[:, :, None] - coords_unfold[:, 2, 2, :][:, None, :],
|
||||
], dim=-1) # 2, w^2, 9w^2
|
||||
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # w^2, 9w^2, 2
|
||||
relative_coords[:, :, 0] += 2 * w - 1 # shift to start from 0
|
||||
relative_coords[:, :, 1] += 2 * w - 1
|
||||
relative_coords[:, :, 0] *= 2 * 2 * w - 1
|
||||
relative_position_index = relative_coords.sum(-1) # w^2, 9w^2
|
||||
self.register_buffer("relative_position_index", relative_position_index)
|
||||
|
||||
# mode to control the sampling strategy of neighbor blocks
|
||||
# 0: all 8 blocks; -1: no neighbor block; >0: random sample one block
|
||||
self.mode = mode
|
||||
|
||||
def forward(self, x, nx, ny):
|
||||
B, N, C = x.shape
|
||||
Nloc = nx * ny
|
||||
Nglo, H, M, W = self.Nglo, self.num_heads, self.head_dim, self.attention_window
|
||||
W2 = W ** 2
|
||||
assert Nglo + Nloc == N, "Global dimension does not match!"
|
||||
|
||||
# get the mode of the longformer attention
|
||||
mode = self.mode
|
||||
kv_nums = 9 * W2
|
||||
if self.mode > 0:
|
||||
if self.training:
|
||||
mode = random.randrange(1, 9) # 1 <= mode <= 8
|
||||
kv_nums = 2 * W2
|
||||
else:
|
||||
mode = 0 # full during evaluation
|
||||
elif mode == -1:
|
||||
kv_nums = W2
|
||||
|
||||
# compute the local attention
|
||||
q = self.scale * self.query(x[:, Nglo:]).reshape(B, Nloc, H, M).transpose(1, 2).contiguous()
|
||||
kv = self.kv(x).reshape(B, N, 2, H, M).permute(2, 0, 3, 1, 4)
|
||||
k, v = kv[0], kv[1] # make torchscript happy (cannot use tensor as tuple)
|
||||
|
||||
if self.only_glo:
|
||||
# local to global attn10: (B, self.num_heads, Nloc, Nglo)
|
||||
attn1 = torch.bmm(q.view(B*H, Nloc, M), k[:, :, :Nglo].reshape(B*H, Nglo, M).transpose(-2, -1)).view(B, H, Nloc, Nglo)
|
||||
else:
|
||||
(q_img, k_img, v_img) = map(
|
||||
lambda t: rearrange(t, 'b h (x y) c -> (b h) c x y', x=nx),
|
||||
(q, k[:, :, Nglo:], v[:, :, Nglo:]))
|
||||
# pad 0's to make sure that nx % W == 0, ny % W == 0
|
||||
(padx, pady) = map(lambda t: (W - t % W) % W, (nx, ny))
|
||||
(mx, my) = map(lambda t: (t[0] + t[1]) // W,
|
||||
((nx, padx), (ny, pady)))
|
||||
if padx > 0 or pady > 0:
|
||||
(q_img, k_img, v_img) = map(
|
||||
lambda t: F.pad(t, (0, pady, 0, padx)), (q_img, k_img, v_img)
|
||||
)
|
||||
# unfold the padded tensor
|
||||
(q_img, k_img, v_img) = map(
|
||||
lambda t: rearrange(t, 'b c (m x) (n y) -> b c m n (x y)', x=W, y=W),
|
||||
(q_img, k_img, v_img)
|
||||
)
|
||||
|
||||
# local to global attn10: (B*H, mx, my, w^2, Nglo)
|
||||
attn10 = einsum('b c m n l, b t c -> b m n l t', q_img,
|
||||
k[:, :, :Nglo].reshape(B*H, Nglo, M))
|
||||
# local to local attn11: (B*H, mx, my, W**2, 9*W**2), mode = 0
|
||||
# attn11: (B*H, mx, my, W**2, W**2), mode = -1
|
||||
# attn11: (B*H, mx, my, W**2, 2*W**2), mode > 0
|
||||
attn11 = self.longform2d_mm(q_img, k_img, False, mode)
|
||||
|
||||
if self.rpe:
|
||||
if Nglo >= 1:
|
||||
# local to global bias
|
||||
attn10 = attn10 + self.g2l_relative_position_bias[1].unsqueeze(0).expand(B, -1, -1).reshape(B*H, Nglo)[:, None, None, None, :]
|
||||
# local to local bias
|
||||
if mode == -1:
|
||||
relative_position_index = self.relative_position_index[:, 4 * W2:5 * W2].contiguous()
|
||||
elif mode == 0:
|
||||
relative_position_index = self.relative_position_index
|
||||
else: # mode > 0
|
||||
chunk_id = mode if mode > 4 else mode - 1
|
||||
relative_position_index = torch.cat([
|
||||
self.relative_position_index[:, 4 * W2:5 * W2],
|
||||
self.relative_position_index[:, chunk_id * W2:(chunk_id+1) * W2],
|
||||
], dim=-1)
|
||||
local_relative_position_bias = self.local_relative_position_bias_table[
|
||||
relative_position_index.view(-1)].view(1, W2, kv_nums, -1) # w^2, kv_nums,H
|
||||
local_relative_position_bias = local_relative_position_bias.permute(
|
||||
0, 3, 1, 2).expand(B, -1, -1, -1).contiguous().view(B*H, W2, kv_nums) # B*H, w^2, kv_nums
|
||||
attn11 = attn11 + local_relative_position_bias[:, None, None, :, :]
|
||||
|
||||
num_invalid = mask_invalid_locations(
|
||||
attn11, mx, my, padx, pady, W, exact=self.exact, mode=mode
|
||||
)
|
||||
attn1 = torch.cat((attn10, attn11), dim=-1)
|
||||
|
||||
attn1 = (attn1 - torch.max(attn1, dim=-1, keepdim=True)[0]).softmax(dim=-1)
|
||||
attn1 = self.attn_drop(attn1)
|
||||
|
||||
# update x1: (B, self.num_heads, Nloc, self.head_dim)
|
||||
if self.only_glo:
|
||||
x1 = torch.bmm(
|
||||
attn1.view(B * H, Nloc, Nglo), v[:, :, :Nglo].reshape(B * H, Nglo, M)
|
||||
).view(B, H, Nloc, M)
|
||||
else:
|
||||
attnl2g = attn1[:, :, :, :, :Nglo]
|
||||
x1 = self.longform2d_mm(attn1[:, :, :, :, Nglo:Nglo+kv_nums], v_img, True, mode)
|
||||
if Nglo >= 1:
|
||||
x1 = x1 + einsum(
|
||||
'b m n l t, b t c -> b c m n l', attnl2g,
|
||||
v[:, :, :Nglo].reshape(B * H, Nglo, M)
|
||||
)
|
||||
x1 = rearrange(x1, 'b c m n (x y) -> b (m x) (n y) c', x=W)
|
||||
x1 = x1[:, :nx, :ny].reshape(B, H, Nloc, M)
|
||||
x1 = x1.transpose(1, 2).reshape(B, Nloc, C)
|
||||
|
||||
try:
|
||||
x1 = self.proj(x1)
|
||||
except RuntimeError as e:
|
||||
# guard against possible half vs float error
|
||||
x1 = self.proj(x1.float())
|
||||
|
||||
if Nglo == 0:
|
||||
return self.proj_drop(x1)
|
||||
|
||||
# compute the global attention; same as vanilla multi-head attention
|
||||
q_global = self.scale * self.query_global(x[:, :Nglo]).reshape(B, Nglo, H, M).transpose(1, 2)
|
||||
kv_global = self.kv_global(x).reshape(B, N, 2, H, M).permute(2, 0, 3, 1, 4)
|
||||
k_global, v_global = kv_global[0], kv_global[1] # make torchscript happy (cannot use tensor as tuple)
|
||||
# attention matrix
|
||||
attn0 = torch.bmm(q_global.reshape(B*H, Nglo, M), k_global.reshape(B*H, N, M).transpose(-2, -1))
|
||||
if self.rpe:
|
||||
# relative position embedding of global tokens
|
||||
global_relative_position_bias = torch.cat([
|
||||
self.g2g_relative_position_bias,
|
||||
self.g2l_relative_position_bias[0].unsqueeze(-1).expand(-1, -1, Nloc)
|
||||
], dim=-1) # nH, nglo, N
|
||||
attn0 = attn0 + global_relative_position_bias.unsqueeze(0).expand(B, -1, -1, -1).reshape(B*H, Nglo, N)
|
||||
|
||||
attn0 = (attn0 - torch.max(attn0, dim=-1, keepdim=True)[0]).softmax(dim=-1)
|
||||
attn0 = self.attn_drop(attn0)
|
||||
# context vector
|
||||
x0 = torch.bmm(attn0, v_global.reshape(B*H, N, M)).view(B, H, Nglo, M).transpose(1, 2).reshape(B, Nglo, C)
|
||||
x0 = self.proj_global(x0)
|
||||
|
||||
return self.proj_drop(torch.cat((x0, x1), dim=1))
|
||||
|
||||
@staticmethod
|
||||
def compute_macs(module, input, output):
|
||||
# T: num_token
|
||||
# S: num_token
|
||||
input = input[0]
|
||||
_, T, C = input.shape
|
||||
S = T
|
||||
Nglo, H, M, W = module.Nglo, module.num_heads, module.head_dim, module.attention_window
|
||||
macs = 0
|
||||
n_params = 0
|
||||
|
||||
# Sliding window scaled-dot-product macs
|
||||
if module.only_glo:
|
||||
# local to global
|
||||
# [B x T x (C-Nglo)] x [B x C x Nglo] --> [B x T x Nglo]
|
||||
num_macs_kq = (C - Nglo) * Nglo * C
|
||||
else:
|
||||
# local to local
|
||||
# [B x T x (C-Nglo)] x [B x C x (S-Nglo)] --> [B x (C-Nglo) x (9 * W**2)]
|
||||
num_macs_kq = (C-Nglo) * (9 * W**2) * C
|
||||
# local to global
|
||||
# [B x T x (C-Nglo)] x [B x C x Nglo] --> [B x T x Nglo]
|
||||
num_macs_kq += (C-Nglo) * Nglo * C
|
||||
# global to all
|
||||
# [B x T x Nglo] x [B x C x S] --> [B x Nglo x S]
|
||||
num_macs_kq += Nglo * S * C
|
||||
# same computational cost for attn * v -> context
|
||||
num_macs_v = num_macs_kq
|
||||
|
||||
macs += num_macs_kq + num_macs_v
|
||||
# print('macs att', macs / 1e8)
|
||||
|
||||
# self attention: T should be equal to S
|
||||
assert T == S
|
||||
# by default, we share weights for local and global tokens
|
||||
q_params = sum([p.numel() for p in module.query.parameters()])
|
||||
kv_params = sum([p.numel() for p in module.kv.parameters()])
|
||||
n_params += q_params + kv_params
|
||||
# multiply by Seq length
|
||||
macs += (q_params + kv_params) * T
|
||||
# print('macs qkv', qkv_params * T / 1e8)
|
||||
|
||||
# by default, we share weights for local and global tokens
|
||||
proj_params = sum([p.numel() for p in module.proj.parameters()])
|
||||
n_params += proj_params
|
||||
macs += (proj_params * T)
|
||||
# print('macs proj', proj_params * T / 1e8)
|
||||
|
||||
module.__flops__ += macs
|
||||
# return n_params, macs
|
|
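The 2-D sliding-chunk attention above first pads the feature map so both spatial sides are multiples of the window `w`, then regroups it into non-overlapping w×w chunks with `einops.rearrange`. A small standalone illustration of that reshaping step (the tensor sizes are arbitrary):

```python
import torch
import torch.nn.functional as F
from einops import rearrange

w = 4                            # attention window
bh, c, nx, ny = 2, 8, 10, 13     # (batch*heads, head_dim, H, W)
x = torch.randn(bh, c, nx, ny)

# pad so nx % w == 0 and ny % w == 0, as in Long2DSCSelfAttention.forward
padx, pady = (w - nx % w) % w, (w - ny % w) % w
x = F.pad(x, (0, pady, 0, padx))
mx, my = (nx + padx) // w, (ny + pady) // w

# regroup into mx*my chunks of w*w tokens each
chunks = rearrange(x, 'b c (m x) (n y) -> b c m n (x y)', x=w, y=w)
print(chunks.shape)  # torch.Size([2, 8, 3, 4, 16])
```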
@ -0,0 +1,657 @@
|
|||
import math
|
||||
from functools import partial
|
||||
import logging
|
||||
import torch
|
||||
from torch import nn
|
||||
from timm.models.layers import DropPath, trunc_normal_, to_2tuple
|
||||
from .longformer2d import Long2DSCSelfAttention
|
||||
|
||||
|
||||
class Mlp(nn.Module):
|
||||
def __init__(self, in_features, hidden_features=None, out_features=None,
|
||||
act_layer=nn.GELU, drop=0.):
|
||||
super().__init__()
|
||||
out_features = out_features or in_features
|
||||
hidden_features = hidden_features or in_features
|
||||
self.fc1 = nn.Linear(in_features, hidden_features)
|
||||
self.act = act_layer()
|
||||
self.fc2 = nn.Linear(hidden_features, out_features)
|
||||
self.drop = nn.Dropout(drop)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.fc1(x)
|
||||
x = self.act(x)
|
||||
x = self.drop(x)
|
||||
x = self.fc2(x)
|
||||
x = self.drop(x)
|
||||
return x
|
||||
|
||||
|
||||
class Attention(nn.Module):
|
||||
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None,
|
||||
attn_drop=0., proj_drop=0.,
|
||||
rpe=False, wx=14, wy=14, nglo=1):
|
||||
super().__init__()
|
||||
self.num_heads = num_heads
|
||||
head_dim = dim // num_heads
|
||||
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
|
||||
self.scale = qk_scale or head_dim ** -0.5
|
||||
|
||||
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
||||
self.attn_drop = nn.Dropout(attn_drop)
|
||||
self.proj = nn.Linear(dim, dim)
|
||||
self.proj_drop = nn.Dropout(proj_drop)
|
||||
|
||||
# Inspired by swin transformer:
|
||||
# https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py#L88-L103
|
||||
# define parameter tables for local and global relative position bias
|
||||
self.rpe = rpe
|
||||
if rpe:
|
||||
self.wx = wx
|
||||
self.wy = wy
|
||||
self.nglo = nglo
|
||||
self.local_relative_position_bias_table = nn.Parameter(
|
||||
torch.zeros((2 * wx - 1) * (2 * wy - 1),
|
||||
num_heads)) # (2*wx-1, 2*wy-1, nH)
|
||||
trunc_normal_(self.local_relative_position_bias_table, std=.02)
|
||||
if nglo >= 1:
|
||||
self.g2l_relative_position_bias = nn.Parameter(
|
||||
torch.zeros(2, num_heads, nglo)) # (2, nH, nglo)
|
||||
self.g2g_relative_position_bias = nn.Parameter(
|
||||
torch.zeros(num_heads, nglo, nglo)) # (nH, nglo, nglo)
|
||||
trunc_normal_(self.g2l_relative_position_bias, std=.02)
|
||||
trunc_normal_(self.g2g_relative_position_bias, std=.02)
|
||||
|
||||
# get pair-wise relative position index
|
||||
coords_h = torch.arange(wx)
|
||||
coords_w = torch.arange(wy)
|
||||
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, wx, wy
|
||||
coords_flatten = torch.flatten(coords, 1) # 2, Wx*Wy
|
||||
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wx*Wy, Wx*Wy
|
||||
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wx*Wy, Wx*Wy, 2
|
||||
relative_coords[:, :, 0] += wx - 1 # shift to start from 0
|
||||
relative_coords[:, :, 1] += wy - 1
|
||||
relative_coords[:, :, 0] *= 2 * wy - 1
|
||||
relative_position_index = relative_coords.sum(-1) # Wx*Wy, Wx*Wy
|
||||
self.register_buffer("relative_position_index", relative_position_index)
|
||||
|
||||
def forward(self, x, nx=None, ny=None):
|
||||
B, N, C = x.shape
|
||||
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
|
||||
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
|
||||
|
||||
attn = (q @ k.transpose(-2, -1)) * self.scale
|
||||
if self.rpe:
|
||||
assert N == self.nglo + self.wx*self.wy, "For relative position, N != self.nglo + self.wx*self.wy!"
|
||||
local_relative_position_bias = self.local_relative_position_bias_table[
|
||||
self.relative_position_index.view(-1)].view(
|
||||
self.wx*self.wy, self.wx*self.wy, -1) # Wh*Ww, Wh*Ww,nH
|
||||
relative_position_bias = local_relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
|
||||
if self.nglo > 0:
|
||||
# relative position embedding of global tokens
|
||||
global_relative_position_bias = torch.cat([
|
||||
self.g2g_relative_position_bias,
|
||||
self.g2l_relative_position_bias[0].unsqueeze(-1).expand(-1, -1, self.wx*self.wy)
|
||||
], dim=-1) # nH, nglo, N
|
||||
# relative position embedding of local tokens
|
||||
local_relative_position_bias = torch.cat([
|
||||
self.g2l_relative_position_bias[1].unsqueeze(1).expand(-1, self.wx*self.wy, -1),
|
||||
relative_position_bias,
|
||||
], dim=-1) # nH, Wh*Ww, N
|
||||
relative_position_bias = torch.cat([
|
||||
global_relative_position_bias,
|
||||
local_relative_position_bias,
|
||||
], dim=1) # nH, N, N
|
||||
attn = attn + relative_position_bias.unsqueeze(0)
|
||||
|
||||
attn = (attn - torch.max(attn, dim=-1, keepdim=True)[0]).softmax(dim=-1)
|
||||
attn = self.attn_drop(attn)
|
||||
|
||||
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
||||
x = self.proj(x)
|
||||
x = self.proj_drop(x)
|
||||
return x
|
||||
|
||||
@staticmethod
|
||||
def compute_macs(module, input, output):
|
||||
# T: num_token
|
||||
# S: num_token
|
||||
input = input[0]
|
||||
_, T, C = input.shape
|
||||
S = T
|
||||
macs = 0
|
||||
n_params = 0
|
||||
|
||||
# Scaled-dot-product macs
|
||||
# [B x T x C] x [B x C x S] --> [B x T x S]
|
||||
# multiplication-addition is counted as 1 because operations can be fused
|
||||
num_macs_kq = T * S * C
|
||||
# [B x T x S] x [B x S x C] --> [B x T x C]
|
||||
num_macs_v = T * C * S
|
||||
|
||||
macs += num_macs_kq + num_macs_v
|
||||
# print('macs att', macs / 1e8)
|
||||
|
||||
# self attention: T should be equal to S
|
||||
assert T == S
|
||||
qkv_params = sum([p.numel() for p in module.qkv.parameters()])
|
||||
n_params += qkv_params
|
||||
# multiply by Seq length
|
||||
macs += qkv_params * T
|
||||
# print('macs qkv', qkv_params * T / 1e8)
|
||||
|
||||
proj_params = sum([p.numel() for p in module.proj.parameters()])
|
||||
n_params += proj_params
|
||||
macs += (proj_params * T)
|
||||
# print('macs proj', proj_params * T / 1e8)
|
||||
|
||||
module.__flops__ += macs
|
||||
# return n_params, macs
|
||||
|
||||
|
||||
class PatchEmbed(nn.Module):
|
||||
""" Image to Patch Embedding
|
||||
"""
|
||||
|
||||
def __init__(self, patch_size, nx, ny, in_chans=3, embed_dim=768, nglo=1,
|
||||
norm_layer=partial(nn.LayerNorm, eps=1e-6), norm_embed=True,
|
||||
drop_rate=0.0, ape=True):
|
||||
# maximal global/x-direction/y-direction tokens: nglo, nx, ny
|
||||
super().__init__()
|
||||
patch_size = to_2tuple(patch_size)
|
||||
self.patch_size = patch_size
|
||||
|
||||
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size,
|
||||
stride=patch_size)
|
||||
|
||||
self.norm_embed = norm_layer(embed_dim) if norm_embed else None
|
||||
|
||||
self.nx = nx
|
||||
self.ny = ny
|
||||
self.Nglo = nglo
|
||||
if nglo >= 1:
|
||||
self.cls_token = nn.Parameter(torch.zeros(1, nglo, embed_dim))
|
||||
trunc_normal_(self.cls_token, std=.02)
|
||||
else:
|
||||
self.cls_token = None
|
||||
self.ape = ape
|
||||
if ape:
|
||||
self.cls_pos_embed = nn.Parameter(torch.zeros(1, nglo, embed_dim))
|
||||
self.x_pos_embed = nn.Parameter(torch.zeros(1, nx, embed_dim // 2))
|
||||
self.y_pos_embed = nn.Parameter(torch.zeros(1, ny, embed_dim // 2))
|
||||
trunc_normal_(self.cls_pos_embed, std=.02)
|
||||
trunc_normal_(self.x_pos_embed, std=.02)
|
||||
trunc_normal_(self.y_pos_embed, std=.02)
|
||||
|
||||
self.pos_drop = nn.Dropout(p=drop_rate)
|
||||
|
||||
def forward(self, xtuple):
|
||||
x, nx, ny = xtuple
|
||||
B = x.shape[0]
|
||||
|
||||
x = self.proj(x)
|
||||
nx, ny = x.shape[-2:]
|
||||
x = x.flatten(2).transpose(1, 2)
|
||||
assert nx <= self.nx and ny <= self.ny, "Input size {} {} should <= {} {}!".format(nx, ny, self.nx, self.ny)
|
||||
|
||||
if self.norm_embed:
|
||||
x = self.norm_embed(x)
|
||||
|
||||
# concat cls_token
|
||||
if self.cls_token is not None:
|
||||
cls_tokens = self.cls_token.expand(
|
||||
B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
|
||||
x = torch.cat((cls_tokens, x), dim=1)
|
||||
|
||||
if self.ape:
|
||||
# add position embedding
|
||||
i = torch.arange(nx, device=x.device)
|
||||
j = torch.arange(ny, device=x.device)
|
||||
x_emb = self.x_pos_embed[:, i, :]
|
||||
y_emb = self.y_pos_embed[:, j, :]
|
||||
pos_embed_2d = torch.cat([
|
||||
x_emb.unsqueeze(2).expand(-1, -1, ny, -1),
|
||||
y_emb.unsqueeze(1).expand(-1, nx, -1, -1),
|
||||
], dim=-1).flatten(start_dim=1, end_dim=2)
|
||||
x = x + torch.cat([self.cls_pos_embed, pos_embed_2d], dim=1).expand(
|
||||
B, -1, -1)
|
||||
|
||||
x = self.pos_drop(x)
|
||||
|
||||
return x, nx, ny
|
||||
|
||||
|
||||
def init_(tensor):
|
||||
dim = tensor.shape[-1]
|
||||
std = 1 / math.sqrt(dim)
|
||||
tensor.uniform_(-std, std)
|
||||
return tensor
|
||||
|
||||
|
||||
# for Performer, start
|
||||
def get_module_device(module):
|
||||
return next(module.parameters()).device
|
||||
|
||||
|
||||
def find_modules(nn_module, type):
|
||||
return [module for module in nn_module.modules() if isinstance(module, type)]
|
||||
|
||||
# for Performer, end
|
||||
|
||||
|
||||
class AttnBlock(nn.Module):
|
||||
""" Meta Attn Block
|
||||
"""
|
||||
def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, drop=0.,
|
||||
attn_drop=0.,
|
||||
drop_path=0., norm_layer=nn.LayerNorm,
|
||||
attn_type='full', w=7, d=1, sharew=False, nglo=1,
|
||||
only_glo=False,
|
||||
seq_len=None, num_feats=256, share_kv=False, sw_exact=0,
|
||||
rratio=2, rpe=False, wx=14, wy=14,
|
||||
mode=0):
|
||||
super().__init__()
|
||||
self.norm = norm_layer(dim)
|
||||
if attn_type == 'full':
|
||||
self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias,
|
||||
qk_scale=qk_scale, attn_drop=attn_drop,
|
||||
proj_drop=drop,
|
||||
rpe=rpe, wx=wx, wy=wy, nglo=nglo)
|
||||
elif attn_type == 'longformerhand':
|
||||
self.attn = Long2DSCSelfAttention(
|
||||
dim, exact=sw_exact, num_heads=num_heads, qkv_bias=qkv_bias,
|
||||
qk_scale=qk_scale, attn_drop=attn_drop,
|
||||
proj_drop=drop, w=w, d=d, sharew=sharew,
|
||||
nglo=nglo, only_glo=only_glo, autograd=False,
|
||||
rpe=rpe, mode=mode
|
||||
)
|
||||
elif attn_type == 'longformerauto':
|
||||
self.attn = Long2DSCSelfAttention(
|
||||
dim, exact=sw_exact, num_heads=num_heads, qkv_bias=qkv_bias,
|
||||
qk_scale=qk_scale, attn_drop=attn_drop,
|
||||
proj_drop=drop, w=w, d=d, sharew=sharew,
|
||||
nglo=nglo, only_glo=only_glo, autograd=True,
|
||||
rpe=rpe, mode=mode
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Not supported attention type {}".format(attn_type))
|
||||
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
||||
self.drop_path = DropPath(
|
||||
drop_path) if drop_path > 0. else nn.Identity()
|
||||
|
||||
def forward(self, xtuple):
|
||||
x, nx, ny = xtuple
|
||||
x = x + self.drop_path(self.attn(self.norm(x), nx, ny))
|
||||
return x, nx, ny
|
||||
|
||||
|
||||
class MlpBlock(nn.Module):
|
||||
""" Meta MLP Block
|
||||
"""
|
||||
|
||||
def __init__(self, dim, out_dim=None, mlp_ratio=4., drop=0., drop_path=0.,
|
||||
act_layer=nn.GELU, norm_layer=nn.LayerNorm):
|
||||
super().__init__()
|
||||
self.drop_path = DropPath(
|
||||
drop_path) if drop_path > 0. else nn.Identity()
|
||||
self.norm = norm_layer(dim)
|
||||
mlp_hidden_dim = int(dim * mlp_ratio)
|
||||
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
|
||||
out_features=out_dim, act_layer=act_layer, drop=drop)
|
||||
self.shortcut = nn.Identity()
|
||||
if out_dim is not None and out_dim != dim:
|
||||
self.shortcut = nn.Sequential(nn.Linear(dim, out_dim),
|
||||
nn.Dropout(drop))
|
||||
|
||||
def forward(self, xtuple):
|
||||
x, nx, ny = xtuple
|
||||
x = self.shortcut(x) + self.drop_path(self.mlp(self.norm(x)))
|
||||
return x, nx, ny
|
||||
|
||||
|
||||
def parse_arch(layer_cfgstr):
|
||||
layer_cfg = {'l': 1, 'h': 3, 'd': 192, 'n': 1, 's': 1, 'g': 1,
|
||||
'p': 2, 'f': 7, 'a': 0} # defaults
|
||||
for attr in layer_cfgstr.split(','):
|
||||
layer_cfg[attr[0]] = int(attr[1:])
|
||||
return layer_cfg
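# Worked example (illustrative only): parse_arch overrides the defaults above with
# whatever keys appear in the comma-separated string, so a hypothetical stage spec
#   parse_arch('l1,h3,d96,n1,p4')
# yields {'l': 1, 'h': 3, 'd': 96, 'n': 1, 's': 1, 'g': 1, 'p': 4, 'f': 7, 'a': 0};
# full architectures join several such stage specs with '_' (see MsViT below).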
|
||||
|
||||
|
||||
class MsViT(nn.Module):
|
||||
""" Multiscale Vision Transformer with support for patch or hybrid CNN input stage
|
||||
"""
|
||||
def __init__(self, arch, img_size=512, in_chans=3,
|
||||
num_classes=1000,
|
||||
qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
|
||||
drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
||||
norm_embed=False, w=7, d=1, sharew=False, only_glo=False,
|
||||
share_kv=False,
|
||||
attn_type='longformerhand', sw_exact=0, mode=0,
|
||||
out_features=None,
|
||||
freeze_at=0, #detectron2
|
||||
**args):
|
||||
super().__init__()
|
||||
self.num_classes = num_classes
|
||||
|
||||
if 'ln_eps' in args:
|
||||
ln_eps = args['ln_eps']
|
||||
self.norm_layer = partial(nn.LayerNorm, eps=ln_eps)
|
||||
logging.info("Customized LayerNorm EPS: {}".format(ln_eps))
|
||||
else:
|
||||
self.norm_layer = norm_layer
|
||||
self.drop_path_rate = drop_path_rate
|
||||
self.attn_type = attn_type
|
||||
|
||||
self.attn_args = dict({
|
||||
'attn_type': attn_type,
|
||||
'qkv_bias': qkv_bias,
|
||||
'qk_scale': qk_scale,
|
||||
'drop': drop_rate,
|
||||
'attn_drop': attn_drop_rate,
|
||||
'w': w,
|
||||
'd': d,
|
||||
'sharew': sharew,
|
||||
'only_glo': only_glo,
|
||||
'share_kv': share_kv,
|
||||
'sw_exact': sw_exact,
|
||||
'norm_layer': norm_layer,
|
||||
'mode': mode,
|
||||
})
|
||||
self.patch_embed_args = dict({
|
||||
'norm_layer': norm_layer,
|
||||
'norm_embed': norm_embed,
|
||||
'drop_rate': drop_rate,
|
||||
})
|
||||
self.mlp_args = dict({
|
||||
'mlp_ratio': 4.0,
|
||||
'norm_layer': norm_layer,
|
||||
'act_layer': nn.GELU,
|
||||
'drop': drop_rate,
|
||||
})
|
||||
|
||||
# Attributes for maskrcnn
|
||||
assert out_features, "out_features is empty!"
|
||||
self._out_feature_strides = []
|
||||
self._out_feature_channels = []
|
||||
self._out_features = out_features
|
||||
self.frozen_stages = freeze_at
|
||||
|
||||
self.layer_cfgs = [parse_arch(layer) for layer in arch.split('_')]
|
||||
self.num_layers = len(self.layer_cfgs)
|
||||
self.depth = sum([cfg['n'] for cfg in self.layer_cfgs])
|
||||
self.out_planes = self.layer_cfgs[-1]['d']
|
||||
self.Nglos = [cfg['g'] for cfg in self.layer_cfgs]
|
||||
self.avg_pool = args['avg_pool'] if 'avg_pool' in args else False
|
||||
|
||||
# ensure divisibility
|
||||
stride = 1
|
||||
down_strides = []
|
||||
for cfg in self.layer_cfgs:
|
||||
stride *= cfg['p']
|
||||
down_strides.append(stride)
|
||||
self._size_divisibility = stride
|
||||
self.Nx = (img_size + (stride - 1)) // stride * stride
|
||||
self.Ny = (img_size + (stride - 1)) // stride * stride
|
||||
|
||||
dprs = torch.linspace(0, drop_path_rate, self.depth).split(
|
||||
[cfg['n'] for cfg in self.layer_cfgs]
|
||||
) # stochastic depth decay rule
|
||||
self.layer1 = self._make_layer(in_chans, self.layer_cfgs[0],
|
||||
dprs=dprs[0], layerid=1)
|
||||
if "layer1" in self._out_features:
|
||||
self._out_feature_strides.append(down_strides[0])
|
||||
self._out_feature_channels.append(self.layer_cfgs[0]['d'])
|
||||
|
||||
self.layer2 = self._make_layer(self.layer_cfgs[0]['d'],
|
||||
self.layer_cfgs[1], dprs=dprs[1],
|
||||
layerid=2)
|
||||
if "layer2" in self._out_features:
|
||||
self._out_feature_strides.append(down_strides[1])
|
||||
self._out_feature_channels.append(self.layer_cfgs[1]['d'])
|
||||
|
||||
self.layer3 = self._make_layer(self.layer_cfgs[1]['d'],
|
||||
self.layer_cfgs[2], dprs=dprs[2],
|
||||
layerid=3)
|
||||
if "layer3" in self._out_features:
|
||||
self._out_feature_strides.append(down_strides[2])
|
||||
self._out_feature_channels.append(self.layer_cfgs[2]['d'])
|
||||
|
||||
if self.num_layers == 3:
|
||||
self.layer4 = None
|
||||
elif self.num_layers == 4:
|
||||
self.layer4 = self._make_layer(self.layer_cfgs[2]['d'],
|
||||
self.layer_cfgs[3], dprs=dprs[3],
|
||||
layerid=4)
|
||||
if "layer4" in self._out_features:
|
||||
self._out_feature_strides.append(down_strides[3])
|
||||
self._out_feature_channels.append(self.layer_cfgs[3]['d'])
|
||||
else:
|
||||
raise ValueError("Numer of layers {} not implemented yet!".format(self.num_layers))
|
||||
|
||||
assert self._size_divisibility==stride, "Some stride down layer has been ignored!"
|
||||
|
||||
self.apply(self._init_weights)
|
||||
|
||||
def _freeze_stages(self):
|
||||
if self.frozen_stages <= 0:
|
||||
return
|
||||
|
||||
if self.frozen_stages >= 1:
|
||||
# freeze the first patch embedding layer
|
||||
self.layer1[0].eval()
|
||||
for param in self.layer1[0].parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
if self.frozen_stages >= 2:
|
||||
# freeze layer1 to layer{frozen_stages-1}
|
||||
for i in range(1, self.frozen_stages):
|
||||
m = getattr(self, "layer" + str(i))
|
||||
m.eval()
|
||||
for param in m.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
def train(self, mode=True):
|
||||
"""Convert the model into training mode while keep layers freezed."""
|
||||
super(MsViT, self).train(mode)
|
||||
self._freeze_stages()
|
||||
|
||||
def reset_vil_mode(self, mode):
|
||||
longformer_attentions = find_modules(self, Long2DSCSelfAttention)
|
||||
for longformer_attention in longformer_attentions:
|
||||
mode_old = longformer_attention.mode
|
||||
if mode_old != mode:
|
||||
longformer_attention.mode = mode
|
||||
logging.info(
|
||||
"Change vil attention mode from {} to {} in " "layer {}"
|
||||
.format(mode_old, mode, longformer_attention))
|
||||
return
|
||||
|
||||
@property
|
||||
def size_divisibility(self):
|
||||
return self._size_divisibility
|
||||
|
||||
def _make_layer(self, in_dim, layer_cfg, dprs, layerid=0):
|
||||
layer_id, num_heads, dim, num_block, is_sparse_attn, nglo, patch_size, num_feats, ape \
|
||||
= layer_cfg['l'], layer_cfg['h'], layer_cfg['d'], layer_cfg['n'], \
|
||||
layer_cfg['s'], layer_cfg['g'], layer_cfg['p'], layer_cfg['f'], \
|
||||
layer_cfg['a']
|
||||
assert layerid == layer_id, "Error in _make_layer: layerid {} does not equal layer_id {}".format(layerid, layer_id)
|
||||
self.Nx = nx = self.Nx // patch_size
|
||||
self.Ny = ny = self.Ny // patch_size
|
||||
seq_len = nx * ny + nglo
|
||||
|
||||
self.attn_args['nglo'] = nglo
|
||||
self.patch_embed_args['nglo'] = nglo
|
||||
self.attn_args['num_feats'] = num_feats # shared for linformer and performer
|
||||
self.attn_args['rratio'] = num_feats # srformer reuses this parameter
|
||||
self.attn_args['w'] = num_feats # longformer reuses this parameter
|
||||
if is_sparse_attn == 0:
|
||||
self.attn_args['attn_type'] = 'full'
|
||||
|
||||
# patch embedding
|
||||
layers = [
|
||||
PatchEmbed(patch_size, nx, ny, in_chans=in_dim, embed_dim=dim, ape=ape,
|
||||
**self.patch_embed_args)
|
||||
]
|
||||
for dpr in dprs:
|
||||
layers.append(AttnBlock(
|
||||
dim, num_heads, drop_path=dpr, seq_len=seq_len, rpe=not ape,
|
||||
wx=nx, wy=ny,
|
||||
**self.attn_args
|
||||
))
|
||||
layers.append(MlpBlock(dim, drop_path=dpr, **self.mlp_args))
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def _init_weights(self, m):
|
||||
if isinstance(m, nn.Linear):
|
||||
trunc_normal_(m.weight, std=.02)
|
||||
if isinstance(m, nn.Linear) and m.bias is not None:
|
||||
nn.init.constant_(m.bias, 0)
|
||||
elif isinstance(m, nn.LayerNorm):
|
||||
nn.init.constant_(m.bias, 0)
|
||||
nn.init.constant_(m.weight, 1.0)
|
||||
|
||||
@torch.jit.ignore
|
||||
def no_weight_decay(self):
|
||||
no_decay = {'pos_embed', 'cls_token',
|
||||
'norm.weight', 'norm.bias',
|
||||
'norm_embed', 'head.bias',
|
||||
'relative_position'}
|
||||
return no_decay
|
||||
|
||||
def get_classifier(self):
|
||||
return self.head
|
||||
|
||||
def forward(self, x):
|
||||
B = x.shape[0]
|
||||
outputs = []
|
||||
x, nx, ny = self.layer1((x, None, None))
|
||||
if "layer1" in self._out_features:
|
||||
outputs.append(
|
||||
x[:, self.Nglos[0]:].transpose(-2, -1).reshape(B, -1, nx, ny)
|
||||
)
|
||||
|
||||
x = x[:, self.Nglos[0]:].transpose(-2, -1).reshape(B, -1, nx, ny)
|
||||
x, nx, ny = self.layer2((x, nx, ny))
|
||||
if "layer2" in self._out_features:
|
||||
outputs.append(
|
||||
x[:, self.Nglos[1]:].transpose(-2, -1).reshape(B, -1, nx, ny)
|
||||
)
|
||||
|
||||
x = x[:, self.Nglos[1]:].transpose(-2, -1).reshape(B, -1, nx, ny)
|
||||
x, nx, ny = self.layer3((x, nx, ny))
|
||||
if "layer3" in self._out_features:
|
||||
outputs.append(
|
||||
x[:, self.Nglos[2]:].transpose(-2, -1).reshape(B, -1, nx, ny)
|
||||
)
|
||||
|
||||
if self.layer4 is not None:
|
||||
x = x[:, self.Nglos[2]:].transpose(-2, -1).reshape(B, -1, nx, ny)
|
||||
x, nx, ny = self.layer4((x, nx, ny))
|
||||
if "layer4" in self._out_features:
|
||||
outputs.append(
|
||||
x[:, self.Nglos[3]:].transpose(-2, -1).reshape(B, -1, nx, ny)
|
||||
)
|
||||
|
||||
return outputs
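# Note on the returned features (a sketch inferred from the reshapes above): `outputs`
# holds one 4D map per name in self._out_features, each of shape (B, layer_cfg['d'], nx, ny),
# where nx and ny shrink by that stage's patch size relative to the previous stage;
# global tokens (Nglos) are stripped before reshaping.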
|
||||
|
||||
|
||||
def build_msvit_backbone(cfg):
|
||||
args = dict(
|
||||
img_size=cfg.INPUT.MAX_SIZE_TRAIN,
|
||||
drop_rate=cfg.MODEL.TRANSFORMER.DROP,
|
||||
drop_path_rate=cfg.MODEL.TRANSFORMER.DROP_PATH,
|
||||
norm_embed=cfg.MODEL.TRANSFORMER.NORM_EMBED,
|
||||
avg_pool=cfg.MODEL.TRANSFORMER.AVG_POOL,
|
||||
freeze_at=cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT,
|
||||
out_features=cfg.MODEL.TRANSFORMER.OUT_FEATURES
|
||||
)
|
||||
args['arch'] = cfg.MODEL.TRANSFORMER.MSVIT.ARCH
|
||||
args['sharew'] = cfg.MODEL.TRANSFORMER.MSVIT.SHARE_W
|
||||
args['attn_type'] = cfg.MODEL.TRANSFORMER.MSVIT.ATTN_TYPE
|
||||
args['share_kv'] = cfg.MODEL.TRANSFORMER.MSVIT.SHARE_KV
|
||||
args['only_glo'] = cfg.MODEL.TRANSFORMER.MSVIT.ONLY_GLOBAL
|
||||
args['sw_exact'] = cfg.MODEL.TRANSFORMER.MSVIT.SW_EXACT
|
||||
args['ln_eps'] = cfg.MODEL.TRANSFORMER.MSVIT.LN_EPS
|
||||
args['mode'] = cfg.MODEL.TRANSFORMER.MSVIT.MODE
|
||||
|
||||
return MsViT(**args)
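# Minimal usage sketch (the ARCH string below is a made-up placeholder, not a released
# configuration; the cfg keys are exactly the ones read above):
#   cfg.MODEL.TRANSFORMER.MSVIT.ARCH = 'l1,h3,d96,n1,s1,g1,p4,f7,a0_l2,h3,d192,n2,s1,g1,p2,f7,a0_l3,h6,d384,n8,s0,g1,p2,f7,a0_l4,h12,d768,n1,s0,g0,p2,f7,a0'
#   cfg.MODEL.TRANSFORMER.OUT_FEATURES = ["layer2", "layer3", "layer4"]
#   body = build_msvit_backbone(cfg)   # returns an MsViT emitting the maps listed in OUT_FEATURES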
|
||||
|
||||
|
||||
class ViTHead(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
in_dim, layer_cfgstr, input_size=14,
|
||||
qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
|
||||
drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
|
||||
norm_embed=False, **args
|
||||
):
|
||||
super(ViTHead, self).__init__()
|
||||
if 'ln_eps' in args:
|
||||
ln_eps = args['ln_eps']
|
||||
self.norm_layer = partial(nn.LayerNorm, eps=ln_eps)
|
||||
logging.info("Customized LayerNorm EPS: {}".format(ln_eps))
|
||||
else:
|
||||
self.norm_layer = norm_layer
|
||||
self.drop_path_rate = drop_path_rate
|
||||
|
||||
self.attn_args = dict({
|
||||
'attn_type': 'full', # full attention for head
|
||||
'qkv_bias': qkv_bias,
|
||||
'qk_scale': qk_scale,
|
||||
'drop': drop_rate,
|
||||
'attn_drop': attn_drop_rate,
|
||||
'norm_layer': norm_layer,
|
||||
'drop_path': drop_path_rate,
|
||||
})
|
||||
self.patch_embed_args = dict({
|
||||
'norm_layer': norm_layer,
|
||||
'norm_embed': norm_embed,
|
||||
'drop_rate': drop_rate,
|
||||
})
|
||||
self.mlp_args = dict({
|
||||
'mlp_ratio': 4.0,
|
||||
'norm_layer': norm_layer,
|
||||
'act_layer': nn.GELU,
|
||||
'drop': drop_rate,
|
||||
'drop_path': drop_path_rate,
|
||||
})
|
||||
|
||||
layer_cfg = parse_arch(layer_cfgstr)
|
||||
layer_id, num_heads, dim, num_block, is_sparse_attn, nglo, patch_size, num_feats, ape \
|
||||
= layer_cfg['l'], layer_cfg['h'], layer_cfg['d'], layer_cfg['n'], \
|
||||
layer_cfg['s'], layer_cfg['g'], layer_cfg['p'], layer_cfg['f'], \
|
||||
layer_cfg['a']
|
||||
self.input_size = input_size
|
||||
self.nglo = nglo
|
||||
assert input_size % patch_size == 0, "Input size is not divisible by patch size in ViTHead!"
|
||||
assert nglo == 0, "Number of global tokens in ViTHead is not 0!"
|
||||
nx = self.input_size // patch_size
|
||||
ny = self.input_size // patch_size
|
||||
seq_len = nx * ny + nglo
|
||||
|
||||
# patch embedding
|
||||
layers = [
|
||||
PatchEmbed(patch_size, nx, ny, in_chans=in_dim, embed_dim=dim,
|
||||
ape=ape, nglo=nglo, **self.patch_embed_args)
|
||||
]
|
||||
for block_id in range(num_block):
|
||||
layers.append(AttnBlock(
|
||||
dim, num_heads, seq_len=seq_len, rpe=not ape,
|
||||
wx=nx, wy=ny, nglo=nglo,
|
||||
**self.attn_args
|
||||
))
|
||||
layers.append(MlpBlock(dim, **self.mlp_args))
|
||||
self.layer4 = nn.Sequential(*layers)
|
||||
self.norm = norm_layer(dim)
|
||||
self.out_channels = dim
|
||||
|
||||
def forward(self, x):
|
||||
B, C, nx, ny = x.shape
|
||||
assert nx == ny == self.input_size, "Input size does not match the initialized size in ViTHead!"
|
||||
nglo = self.nglo
|
||||
x, nx, ny = self.layer4((x, None, None))
|
||||
x = self.norm(x)
|
||||
x = x[:, nglo:].transpose(-2, -1).reshape(B, -1, nx, ny)
|
||||
return x
|
|
@ -0,0 +1,366 @@
|
|||
# Copyright (c) 2021 Microsoft Corporation. Licensed under the MIT license.
|
||||
# Written by Pengchuan Zhang, penzhan@microsoft.com
|
||||
from functools import lru_cache
|
||||
import torch
|
||||
from torch import einsum
|
||||
from torch.cuda.amp import autocast
|
||||
|
||||
|
||||
class SlidingChunk2D(torch.autograd.Function):
|
||||
"""
|
||||
Class to encapsulate for sliding chunk implementation of vision longformer
|
||||
"""
|
||||
mode_dict = {
|
||||
1: (1, 1), # -1, -1
|
||||
2: (1, 0), # -1, 0
|
||||
3: (1, -1), # -1, 1
|
||||
4: (0, 1), # 0, -1
|
||||
5: (0, -1), # 0, 1
|
||||
6: (-1, 1), # 1, -1
|
||||
7: (-1, 0), # 1, 0
|
||||
8: (-1, -1), # 1, 1
|
||||
}
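# mode i (1..8) keeps the query's own chunk plus the single neighboring chunk obtained by
# rolling with the (row, col) offsets in mode_dict[i]; mode 0 uses the self chunk plus all
# 8 neighbors, and mode -1 uses the self chunk only (see slidingchunk_qk / slidingchunk_av).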
|
||||
|
||||
@staticmethod
|
||||
def slidingchunk_qk(q_img: torch.Tensor, k_img: torch.Tensor, mode: int):
|
||||
'''
|
||||
q_img x k_img = attn11 ==> Useful for query x key = attention_scores
|
||||
The cyclic padding strategy
|
||||
q_img, k_img: (B * H, M, mx, my, W**2)
|
||||
attn11: (B*H, mx, my, W**2, 9*W**2), mode=0
|
||||
(B*H, mx, my, W**2, W**2), mode=-1
|
||||
(B*H, mx, my, W**2, 2*W**2), mode=i>0
|
||||
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
|
||||
'''
|
||||
if mode == 0:
|
||||
return torch.cat([
|
||||
# -1, -1
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img,
|
||||
torch.roll(k_img, shifts=(1, 1), dims=(2, 3))),
|
||||
# -1, 0
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img,
|
||||
torch.roll(k_img, shifts=1, dims=2)),
|
||||
# -1, 1
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img,
|
||||
torch.roll(k_img, shifts=(1, -1), dims=(2, 3))),
|
||||
# 0, -1
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img,
|
||||
torch.roll(k_img, shifts=1, dims=3)),
|
||||
# 0, 0
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img,
|
||||
k_img),
|
||||
# 0, 1
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img,
|
||||
torch.roll(k_img, shifts=-1, dims=3)),
|
||||
# 1, -1
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img,
|
||||
torch.roll(k_img, shifts=(-1, 1), dims=(2, 3))),
|
||||
# 1, 0
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img,
|
||||
torch.roll(k_img, shifts=-1, dims=2)),
|
||||
# 1, 1
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img,
|
||||
torch.roll(k_img, shifts=(-1, -1), dims=(2, 3))),
|
||||
], dim=-1)
|
||||
elif mode == -1:
|
||||
return einsum(
|
||||
'b c m n l, b c m n t -> b m n l t', q_img, k_img
|
||||
) * 1.0
|
||||
else:
|
||||
shift = SlidingChunk2D.mode_dict[mode]
|
||||
return torch.cat([
|
||||
# 0, 0
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img, k_img),
|
||||
# x, x
|
||||
einsum('b c m n l, b c m n t -> b m n l t', q_img,
|
||||
torch.roll(k_img, shifts=shift, dims=(2, 3))),
|
||||
], dim=-1)
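# Shape check (illustrative only; sizes are arbitrary): with B*H=2, M=8 channels per head,
# an mx-by-my = 4x4 grid of chunks and window w=3 (so w**2=9 tokens per chunk):
#   q = torch.randn(2, 8, 4, 4, 9); k = torch.randn(2, 8, 4, 4, 9)
#   SlidingChunk2D.slidingchunk_qk(q, k, mode=0).shape   # (2, 4, 4, 9, 81) = 9*w**2 keys
#   SlidingChunk2D.slidingchunk_qk(q, k, mode=-1).shape  # (2, 4, 4, 9, 9)  = self chunk only
#   SlidingChunk2D.slidingchunk_qk(q, k, mode=3).shape   # (2, 4, 4, 9, 18) = self + one neighbor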
|
||||
|
||||
|
||||
@staticmethod
|
||||
def slidingchunk_av(attn: torch.Tensor, v_img: torch.Tensor, mode: int):
|
||||
'''
|
||||
attn x v_img = x ==> Useful for attn x value = context
|
||||
The cyclic padding strategy
|
||||
v_img, context: (B * H, M, mx, my, W**2)
|
||||
attn: (B*H, mx, my, W**2, 9*W**2), mode=0
|
||||
(B*H, mx, my, W**2, W**2), mode=-1
|
||||
(B*H, mx, my, W**2, 2*W**2), mode=i>0
|
||||
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
|
||||
'''
|
||||
w2 = v_img.shape[-1]
|
||||
if mode == 0:
|
||||
attnn1n1, attnn10, attnn11, attn0n1, attn00, attn01, attn1n1, attn10, attn11 = torch.split(
|
||||
attn, w2, dim=-1
|
||||
)
|
||||
elif mode == -1:
|
||||
attn00 = attn
|
||||
else:
|
||||
attn00, attnxx = torch.split(
|
||||
attn, w2, dim=-1
|
||||
)
|
||||
output = einsum('b m n l t, b c m n t -> b c m n l', attn00, v_img) # 0,0
|
||||
|
||||
if mode == 0:
|
||||
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnn1n1,
|
||||
torch.roll(v_img, shifts=(1, 1), dims=(2, 3))) # -1,-1
|
||||
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnn10,
|
||||
torch.roll(v_img, shifts=1, dims=2)) # -1,0
|
||||
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnn11,
|
||||
torch.roll(v_img, shifts=(1, -1), dims=(2, 3))) # -1,1
|
||||
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn0n1,
|
||||
torch.roll(v_img, shifts=1, dims=3)) # 0,-1
|
||||
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn01,
|
||||
torch.roll(v_img, shifts=-1, dims=3)) # 0,1
|
||||
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn1n1,
|
||||
torch.roll(v_img, shifts=(-1, 1), dims=(2, 3))) # 1,-1
|
||||
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn10,
|
||||
torch.roll(v_img, shifts=-1, dims=2)) # 1,0
|
||||
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn11,
|
||||
torch.roll(v_img, shifts=(-1, -1), dims=(2, 3))) # 1,1
|
||||
elif mode > 0:
|
||||
shift = SlidingChunk2D.mode_dict[mode]
|
||||
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnxx,
|
||||
torch.roll(v_img, shifts=shift, dims=(2, 3))) # 1,1
|
||||
else:
|
||||
output = output * 1.0
|
||||
|
||||
return output
|
||||
|
||||
@staticmethod
|
||||
def slidingchunk_agrad(attn: torch.Tensor, grad_x: torch.Tensor, mode: int):
|
||||
'''
|
||||
attn.t() x grad_x = grad_v ==> Useful for attn.t() x grad_x = grad_v
|
||||
The cyclic padding strategy
|
||||
grad_x, grad_v: (B * H, M, mx, my, W**2)
|
||||
attn: (B*H, mx, my, W**2, 9*W**2), mode=0
|
||||
(B*H, mx, my, W**2, W**2), mode=-1
|
||||
(B*H, mx, my, W**2, 2*W**2), mode=i>0
|
||||
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
|
||||
'''
|
||||
w2 = grad_x.shape[-1]
|
||||
if mode == 0:
|
||||
attnn1n1, attnn10, attnn11, attn0n1, attn00, attn01, attn1n1, attn10, attn11 = torch.split(
|
||||
attn, w2, dim=-1
|
||||
)
|
||||
elif mode == -1:
|
||||
attn00 = attn
|
||||
else:
|
||||
attn00, attnxx = torch.split(
|
||||
attn, w2, dim=-1
|
||||
)
|
||||
|
||||
# 0,0
|
||||
output = einsum('b m n l t, b c m n l -> b c m n t', attn00, grad_x)
|
||||
|
||||
if mode == 0:
|
||||
# -1,-1
|
||||
output = output + torch.roll(
|
||||
einsum('b m n l t, b c m n l -> b c m n t', attnn1n1, grad_x),
|
||||
shifts=(-1, -1), dims=(2, 3))
|
||||
# -1,0
|
||||
output = output + torch.roll(
|
||||
einsum('b m n l t, b c m n l -> b c m n t', attnn10, grad_x),
|
||||
shifts=-1, dims=2)
|
||||
# -1,1
|
||||
output = output + torch.roll(
|
||||
einsum('b m n l t, b c m n l -> b c m n t', attnn11, grad_x),
|
||||
shifts=(-1, 1), dims=(2, 3))
|
||||
# 0,-1
|
||||
output = output + torch.roll(
|
||||
einsum('b m n l t, b c m n l -> b c m n t', attn0n1, grad_x),
|
||||
shifts=-1, dims=3)
|
||||
# 0,1
|
||||
output = output + torch.roll(
|
||||
einsum('b m n l t, b c m n l -> b c m n t', attn01, grad_x),
|
||||
shifts=1, dims=3)
|
||||
# 1,-1
|
||||
output = output + torch.roll(
|
||||
einsum('b m n l t, b c m n l -> b c m n t', attn1n1, grad_x),
|
||||
shifts=(1, -1), dims=(2, 3))
|
||||
# 1,0
|
||||
output = output + torch.roll(
|
||||
einsum('b m n l t, b c m n l -> b c m n t', attn10, grad_x),
|
||||
shifts=1, dims=2)
|
||||
# 1,1
|
||||
output = output + torch.roll(
|
||||
einsum('b m n l t, b c m n l -> b c m n t', attn11, grad_x),
|
||||
shifts=(1, 1), dims=(2, 3))
|
||||
elif mode > 0:
|
||||
shift = SlidingChunk2D.mode_dict[mode]
|
||||
shift = (-shift[0], -shift[1])
|
||||
output = output + torch.roll(
|
||||
einsum('b m n l t, b c m n l -> b c m n t', attnxx, grad_x),
|
||||
shifts=shift, dims=(2, 3))
|
||||
else:
|
||||
output = output * 1.0
|
||||
|
||||
return output
|
||||
|
||||
@staticmethod
|
||||
@autocast() # comment this out if AMP is not used
|
||||
def forward(ctx, t1: torch.Tensor, t2: torch.Tensor,
|
||||
is_t1_diagonaled: bool = False, mode: int = 0) -> torch.Tensor:
|
||||
"""Compuates sliding chunk mm of t1 and t2.
|
||||
args:
|
||||
t1: torch.Tensor = (B * H, M, mx, my, W**2) if is_t1_diagonaled = false,
|
||||
= (B*H, mx, my, W**2, 9*W**2) if is_t1_diagonaled = true, mode=0.
|
||||
= (B*H, mx, my, W**2, W**2) if is_t1_diagonaled = true, mode=-1.
|
||||
= (B*H, mx, my, W**2, 2*W**2) if is_t1_diagonaled = true, mode=i>0.
|
||||
t2: torch.Tensor = (B * H, M, mx, my, W**2). This is always a
|
||||
non-diagonaled tensor, e.g. `key_layer` or `value_layer`
|
||||
is_t1_diagonaled: is t1 a diagonaled or a regular tensor
|
||||
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
|
||||
returns:
|
||||
is_t1_diagonaled = true:
|
||||
torch.Tensor = (B * H, M, mx, my, W**2)
|
||||
mode=0, is_t1_diagonaled = false:
|
||||
torch.Tensor = (B*H, mx, my, W**2, 9*W**2)
|
||||
mode=-1, is_t1_diagonaled = false:
|
||||
torch.Tensor = (B*H, mx, my, W**2, W**2)
|
||||
mode=i>0, is_t1_diagonaled = false:
|
||||
torch.Tensor = (B*H, mx, my, W**2, 2*W**2)
|
||||
"""
|
||||
ctx.save_for_backward(t1, t2)
|
||||
ctx.is_t1_diagonaled = is_t1_diagonaled
|
||||
ctx.mode = mode
|
||||
if is_t1_diagonaled:
|
||||
return SlidingChunk2D.slidingchunk_av(t1, t2, mode)
|
||||
else:
|
||||
return SlidingChunk2D.slidingchunk_qk(t1, t2, mode)
|
||||
|
||||
@staticmethod
|
||||
@autocast() # comment this out if AMP is not used
|
||||
def backward(ctx, grad_output):
|
||||
t1, t2 = ctx.saved_tensors
|
||||
is_t1_diagonaled = ctx.is_t1_diagonaled
|
||||
mode = ctx.mode
|
||||
if is_t1_diagonaled:
|
||||
grad_t1 = SlidingChunk2D.slidingchunk_qk(grad_output, t2, mode)
|
||||
grad_t2 = SlidingChunk2D.slidingchunk_agrad(t1, grad_output, mode)
|
||||
else:
|
||||
grad_t1 = SlidingChunk2D.slidingchunk_av(grad_output, t2, mode)
|
||||
grad_t2 = SlidingChunk2D.slidingchunk_agrad(grad_output, t1, mode)
|
||||
return grad_t1, grad_t2, None, None
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def _get_invalid_locations_mask_cyclic(nx: int, ny: int, padx: int, pady: int,
|
||||
w: int, device: str):
|
||||
w2 = w ** 2
|
||||
mask = torch.BoolTensor([
|
||||
[
|
||||
(i // ny + (j // w2) // 3 == nx and
|
||||
(nx - 1) * w + (j % w2) // w >= nx * w - padx) or
|
||||
(i % ny + (j // w2) % 3 == ny and
|
||||
(ny - 1) * w + (j % w2) % w >= ny * w - pady)
|
||||
for j in range(9 * w2)
|
||||
]
|
||||
for i in range(nx * ny)
|
||||
], device='cpu')
|
||||
|
||||
# We should count the w2 in the query here
|
||||
num_invalid = w2 * mask.sum()
|
||||
|
||||
return mask.to(device), num_invalid.to(device)
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def _get_invalid_locations_mask_zero(nx: int, ny: int, padx: int, pady: int,
|
||||
w: int, device: str):
|
||||
w2 = w ** 2
|
||||
mask = torch.BoolTensor([
|
||||
[
|
||||
i // ny + (j // w2) // 3 - 1 < 0 or
|
||||
i // ny + (j // w2) // 3 - 1 >= nx or
|
||||
(i // ny + (j // w2) // 3 - 1) * w + (j % w2) // w >= nx * w - padx or
|
||||
i % ny + (j // w2) % 3 - 1 < 0 or
|
||||
i % ny + (j // w2) % 3 - 1 >= ny or
|
||||
(i % ny + (j // w2) % 3 - 1) * w + (j % w2) % w >= ny * w - pady
|
||||
for j in range(9 * w2)
|
||||
]
|
||||
for i in range(nx * ny)
|
||||
], device='cpu')
|
||||
|
||||
# We should count the w2 in the query here
|
||||
num_invalid = w2 * mask.sum()
|
||||
|
||||
return mask.to(device), num_invalid.to(device)
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def _get_invalid_locations_mask_exact(nx: int, ny: int, padx: int, pady: int,
|
||||
w: int, device: str):
|
||||
w2 = w ** 2
|
||||
nx_max = nx * w - 1 - padx
|
||||
ny_max = ny * w - 1 - pady
|
||||
mask = torch.BoolTensor([
|
||||
[
|
||||
[
|
||||
(i // ny + (j // w2) // 3 - 1) * w + (j % w2) // w < max(0, (
|
||||
i // ny - 1) * w + l // w) or
|
||||
(i // ny + (j // w2) // 3 - 1) * w + (j % w2) // w > min(
|
||||
nx_max, (i // ny + 1) * w + l // w) or
|
||||
(i % ny + (j // w2) % 3 - 1) * w + (j % w2) % w < max(0, (
|
||||
i % ny - 1) * w + l % w) or
|
||||
(i % ny + (j // w2) % 3 - 1) * w + (j % w2) % w > min(
|
||||
ny_max, (i % ny + 1) * w + l % w)
|
||||
for j in range(9 * w2)
|
||||
]
|
||||
for l in range(w2)
|
||||
]
|
||||
for i in range(nx * ny)
|
||||
], device='cpu')
|
||||
num_invalid = mask.sum()
|
||||
|
||||
return mask.to(device), num_invalid.to(device)
|
||||
|
||||
|
||||
def mask_invalid_locations(input_tensor: torch.Tensor, nx: int, ny: int,
|
||||
padx: int, pady: int, w: int,
|
||||
exact: int, mode: int = 0) -> torch.Tensor:
|
||||
"""exact
|
||||
1: exact sliding window
|
||||
0: blockwise sliding chunk with zero padding
|
||||
-1: blockwise sliding chunk with cyclic padding
|
||||
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
|
||||
"""
|
||||
w2 = w ** 2
|
||||
if exact == 1 and mode == 0:
|
||||
mask, num_invalid = _get_invalid_locations_mask_exact(
|
||||
nx, ny, padx, pady, w, input_tensor.device)
|
||||
mask = mask.view(1, nx, ny, w2, -1).expand(input_tensor.size())
|
||||
else:
|
||||
if exact == 0:
|
||||
mask, num_invalid = _get_invalid_locations_mask_zero(
|
||||
nx, ny, padx, pady, w, input_tensor.device)
|
||||
elif exact == -1:
|
||||
mask, num_invalid = _get_invalid_locations_mask_cyclic(
|
||||
nx, ny, padx, pady, w, input_tensor.device)
|
||||
else:
|
||||
raise ValueError("longsc exact should be in [0,1,-1]!")
|
||||
if mode == -1:
|
||||
mask = mask[:, 4 * w2:5 * w2]
|
||||
num_invalid = w2 * mask.sum()
|
||||
elif mode > 0:
|
||||
chunk_id = mode if mode > 4 else mode - 1
|
||||
mask = torch.cat([
|
||||
mask[:, 4 * w2:5 * w2],
|
||||
mask[:, chunk_id * w2:(chunk_id+1) * w2],
|
||||
], dim=-1)
|
||||
num_invalid = w2 * mask.sum()
|
||||
mask = mask.view(1, nx, ny, 1, -1).expand(input_tensor.size())
|
||||
input_tensor.masked_fill_(mask, -float('inf'))
|
||||
|
||||
return num_invalid
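# Usage sketch (assumes `attn` was produced by SlidingChunk2D.slidingchunk_qk with the same
# nx, ny, w and mode; padx/pady are the zero-padding amounts used to reach full chunks):
#   num_invalid = mask_invalid_locations(attn, nx, ny, padx, pady, w, exact=0, mode=0)
#   attn = attn.softmax(dim=-1)  # masked positions were set to -inf, so they get ~0 weight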
|
||||
|
||||
|
||||
def slidingchunk_2dautograd(t1: torch.Tensor, t2: torch.Tensor,
|
||||
is_t1_diagonaled: bool = False, mode: int = 0) -> torch.Tensor:
|
||||
if is_t1_diagonaled:
|
||||
return SlidingChunk2D.slidingchunk_av(t1, t2, mode)
|
||||
else:
|
||||
return SlidingChunk2D.slidingchunk_qk(t1, t2, mode)
|
||||
|
||||
|
||||
slidingchunk_2d = SlidingChunk2D.apply
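# slidingchunk_2d is the autograd-aware entry point; a hedged usage sketch
# (tensor names and sizes are placeholders matching the docstring above):
#   scores  = slidingchunk_2d(q, k, False, 0)       # query x key  -> (B*H, mx, my, W**2, 9*W**2)
#   context = slidingchunk_2d(scores, v, True, 0)   # attn x value -> (B*H, M, mx, my, W**2)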
|
|
@ -35,8 +35,8 @@ class BalancedPositiveNegativeSampler(object):
|
|||
pos_idx = []
|
||||
neg_idx = []
|
||||
for matched_idxs_per_image in matched_idxs:
|
||||
positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1)
|
||||
negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1)
|
||||
positive = torch.nonzero(matched_idxs_per_image >= 1, as_tuple=False).squeeze(1)
|
||||
negative = torch.nonzero(matched_idxs_per_image == 0, as_tuple=False).squeeze(1)
|
||||
|
||||
num_pos = int(self.batch_size_per_image * self.positive_fraction)
|
||||
# protect against not enough positive examples
|
||||
|
|
|
@ -6,6 +6,7 @@ Implements the Generalized R-CNN framework
|
|||
import torch
|
||||
from torch import nn
|
||||
|
||||
from maskrcnn_benchmark.structures.bounding_box import BoxList
|
||||
from maskrcnn_benchmark.structures.image_list import to_image_list
|
||||
|
||||
from ..backbone import build_backbone
|
||||
|
@ -29,6 +30,7 @@ class GeneralizedRCNN(nn.Module):
|
|||
self.backbone = build_backbone(cfg)
|
||||
self.rpn = build_rpn(cfg, self.backbone.out_channels)
|
||||
self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels)
|
||||
self.force_boxes = cfg.MODEL.RPN.FORCE_BOXES
|
||||
|
||||
def forward(self, images, targets=None):
|
||||
"""
|
||||
|
@ -45,9 +47,30 @@ class GeneralizedRCNN(nn.Module):
|
|||
"""
|
||||
if self.training and targets is None:
|
||||
raise ValueError("In training mode, targets should be passed")
|
||||
if self.force_boxes and targets is None:
|
||||
# note targets cannot be None but could have 0 box.
|
||||
raise ValueError("In force_boxes setting, targets should be passed")
|
||||
images = to_image_list(images)
|
||||
features = self.backbone(images.tensors)
|
||||
proposals, proposal_losses = self.rpn(images, features, targets)
|
||||
|
||||
if targets:
|
||||
targets = [target.to(self.device)
|
||||
for target in targets if target is not None]
|
||||
|
||||
if self.force_boxes:
|
||||
proposals = [BoxList(target.bbox, target.size, target.mode)
|
||||
for target in targets]
|
||||
if self.training:
|
||||
# note we still need to compute a loss using all rpn
|
||||
# named parameters, otherwise it will
|
||||
# give unused_parameters error in distributed training.
|
||||
null_loss = 0
|
||||
for key, param in self.rpn.named_parameters():
|
||||
null_loss += 0.0 * param.sum()
|
||||
proposal_losses = {'rpn_null_loss': null_loss}
|
||||
else:
|
||||
proposals, proposal_losses = self.rpn(images, features, targets)
|
||||
|
||||
if self.roi_heads:
|
||||
x, result, detector_losses = self.roi_heads(features, proposals, targets)
|
||||
else:
|
||||
|
|
|
@ -101,7 +101,7 @@ class Matcher(object):
|
|||
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
|
||||
# Find highest quality match available, even if it is low, including ties
|
||||
gt_pred_pairs_of_highest_quality = torch.nonzero(
|
||||
match_quality_matrix == highest_quality_foreach_gt[:, None]
|
||||
match_quality_matrix == highest_quality_foreach_gt[:, None], as_tuple=False
|
||||
)
|
||||
# Example gt_pred_pairs_of_highest_quality:
|
||||
# tensor([[ 0, 39796],
|
||||
|
|
|
@ -114,7 +114,7 @@ class Pooler(nn.Module):
|
|||
device=device,
|
||||
)
|
||||
for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)):
|
||||
idx_in_level = torch.nonzero(levels == level).squeeze(1)
|
||||
idx_in_level = torch.nonzero(levels == level, as_tuple=False).squeeze(1)
|
||||
rois_per_level = rois[idx_in_level]
|
||||
result[idx_in_level] = pooler(per_level_feature, rois_per_level).to(dtype)
|
||||
|
||||
|
|
|
@ -201,7 +201,7 @@ class PostProcessor(nn.Module):
|
|||
inds_all = scores > self.score_thresh
|
||||
boxlist_empty = self.prepare_empty_boxlist(boxlist)
|
||||
for j in range(1, num_classes):
|
||||
inds = inds_all[:, j].nonzero().squeeze(1)
|
||||
inds = inds_all[:, j].nonzero(as_tuple=False).squeeze(1)
|
||||
|
||||
if len(inds)>0:
|
||||
scores_j = scores[inds, j]
|
||||
|
@ -239,7 +239,7 @@ class PostProcessor(nn.Module):
|
|||
cls_scores.cpu(), number_of_detections - self.detections_per_img + 1
|
||||
)
|
||||
keep = cls_scores >= image_thresh.item()
|
||||
keep = torch.nonzero(keep).squeeze(1)
|
||||
keep = torch.nonzero(keep, as_tuple=False).squeeze(1)
|
||||
result = result[keep]
|
||||
return result
|
||||
|
||||
|
@ -273,7 +273,7 @@ class PostProcessor(nn.Module):
|
|||
|
||||
# filter duplicate boxes
|
||||
scores_pre, labels_pre = dists_all.max(1)
|
||||
inds_pre = scores_pre.nonzero()
|
||||
inds_pre = scores_pre.nonzero(as_tuple=False)
|
||||
assert inds_pre.dim() != 0
|
||||
inds_pre = inds_pre.squeeze(1)
|
||||
|
||||
|
@ -331,7 +331,7 @@ class PostProcessor(nn.Module):
|
|||
hs = (y2 - y1).squeeze(1)
|
||||
keep = (
|
||||
(ws >= 0) & (hs >= 0) & (scores > self.score_thresh * 0.01)
|
||||
).nonzero().squeeze(1)
|
||||
).nonzero(as_tuple=False).squeeze(1)
|
||||
del ws, hs
|
||||
|
||||
# apply nms to the previous low-thresholded results
|
||||
|
|
|
@ -118,7 +118,7 @@ class FastRCNNLossComputation(object):
|
|||
for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
|
||||
zip(sampled_pos_inds, sampled_neg_inds)
|
||||
):
|
||||
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
|
||||
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img, as_tuple=False).squeeze(1)
|
||||
proposals_per_image = proposals[img_idx][img_sampled_inds]
|
||||
proposals[img_idx] = proposals_per_image
|
||||
|
||||
|
@ -182,7 +182,7 @@ class FastRCNNLossComputation(object):
|
|||
# get indices that correspond to the regression targets for
|
||||
# the corresponding ground truth labels, to be used with
|
||||
# advanced indexing
|
||||
sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1)
|
||||
sampled_pos_inds_subset = torch.nonzero(labels > 0, as_tuple=False).squeeze(1)
|
||||
labels_pos = labels[sampled_pos_inds_subset]
|
||||
if self.cls_agnostic_bbox_reg:
|
||||
map_inds = torch.tensor([4, 5, 6, 7], device=device)
|
||||
|
|
|
@ -9,6 +9,7 @@ from maskrcnn_benchmark.modeling.backbone import resnet
|
|||
from maskrcnn_benchmark.modeling.poolers import Pooler
|
||||
from maskrcnn_benchmark.modeling.make_layers import group_norm
|
||||
from maskrcnn_benchmark.modeling.make_layers import make_fc
|
||||
from maskrcnn_benchmark.modeling.backbone.msvit import ViTHead
|
||||
|
||||
|
||||
@registry.ROI_BOX_FEATURE_EXTRACTORS.register("ResNet50Conv5ROIFeatureExtractor")
|
||||
|
@ -158,6 +159,41 @@ class FPNXconv1fcFeatureExtractor(nn.Module):
|
|||
return x
|
||||
|
||||
|
||||
@registry.ROI_BOX_FEATURE_EXTRACTORS.register("ViTHeadFeatureExtractor")
|
||||
class ViTHeadFeatureExtractor(nn.Module):
|
||||
def __init__(self, config, in_channels):
|
||||
super(ViTHeadFeatureExtractor, self).__init__()
|
||||
|
||||
resolution = config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
|
||||
scales = config.MODEL.ROI_BOX_HEAD.POOLER_SCALES
|
||||
sampling_ratio = config.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
|
||||
pooler = Pooler(
|
||||
output_size=(resolution, resolution),
|
||||
scales=scales,
|
||||
sampling_ratio=sampling_ratio,
|
||||
)
|
||||
|
||||
# VIT head
|
||||
args = dict(
|
||||
input_size=config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION,
|
||||
drop_rate=config.MODEL.TRANSFORMER.DROP,
|
||||
drop_path_rate=config.MODEL.TRANSFORMER.DROP_PATH,
|
||||
norm_embed=config.MODEL.TRANSFORMER.NORM_EMBED,
|
||||
layer_cfgstr=config.MODEL.TRANSFORMER.VITHEADARCH,
|
||||
ln_eps=config.MODEL.TRANSFORMER.MSVIT.LN_EPS,
|
||||
)
|
||||
head = ViTHead(in_dim=in_channels, **args)
|
||||
|
||||
self.pooler = pooler
|
||||
self.head = head
|
||||
self.out_channels = head.out_channels
|
||||
|
||||
def forward(self, x, proposals):
|
||||
x = self.pooler(x, proposals)
|
||||
x = self.head(x)
|
||||
return x
|
||||
|
||||
|
||||
def make_roi_box_feature_extractor(cfg, in_channels):
|
||||
func = registry.ROI_BOX_FEATURE_EXTRACTORS[
|
||||
cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR
|
||||
|
|
|
@ -135,7 +135,7 @@ class KeypointRCNNLossComputation(object):
|
|||
for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
|
||||
zip(sampled_pos_inds, sampled_neg_inds)
|
||||
):
|
||||
img_sampled_inds = torch.nonzero(pos_inds_img).squeeze(1)
|
||||
img_sampled_inds = torch.nonzero(pos_inds_img, as_tuple=False).squeeze(1)
|
||||
proposals_per_image = proposals[img_idx][img_sampled_inds]
|
||||
proposals[img_idx] = proposals_per_image
|
||||
|
||||
|
@ -155,7 +155,7 @@ class KeypointRCNNLossComputation(object):
|
|||
|
||||
keypoint_targets = cat(heatmaps, dim=0)
|
||||
valid = cat(valid, dim=0).to(dtype=torch.bool)
|
||||
valid = torch.nonzero(valid).squeeze(1)
|
||||
valid = torch.nonzero(valid, as_tuple=False).squeeze(1)
|
||||
|
||||
# torch.mean (in binary_cross_entropy_with_logits) doesn't
# accept empty tensors, so handle it separately
|
||||
|
|
|
@ -83,7 +83,7 @@ class MaskRCNNLossComputation(object):
|
|||
labels_per_image[neg_inds] = 0
|
||||
|
||||
# mask scores are only computed on positive samples
|
||||
positive_inds = torch.nonzero(labels_per_image > 0).squeeze(1)
|
||||
positive_inds = torch.nonzero(labels_per_image > 0, as_tuple=False).squeeze(1)
|
||||
|
||||
segmentation_masks = matched_targets.get_field("masks")
|
||||
segmentation_masks = segmentation_masks[positive_inds]
|
||||
|
@ -114,7 +114,7 @@ class MaskRCNNLossComputation(object):
|
|||
labels = cat(labels, dim=0)
|
||||
mask_targets = cat(mask_targets, dim=0)
|
||||
|
||||
positive_inds = torch.nonzero(labels > 0).squeeze(1)
|
||||
positive_inds = torch.nonzero(labels > 0, as_tuple=False).squeeze(1)
|
||||
labels_pos = labels[positive_inds]
|
||||
|
||||
# torch.mean (in binary_cross_entropy_with_logits) doesn't
|
||||
|
|
|
@ -27,7 +27,7 @@ def keep_only_positive_boxes(boxes):
|
|||
for boxes_per_image in boxes:
|
||||
labels = boxes_per_image.get_field("labels")
|
||||
inds_mask = labels > 0
|
||||
inds = inds_mask.nonzero().squeeze(1)
|
||||
inds = inds_mask.nonzero(as_tuple=False).squeeze(1)
|
||||
positive_boxes.append(boxes_per_image[inds])
|
||||
positive_inds.append(inds_mask)
|
||||
return positive_boxes, positive_inds
|
||||
|
|
|
@ -104,8 +104,8 @@ class RPNLossComputation(object):
|
|||
anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors]
|
||||
labels, regression_targets = self.prepare_targets(anchors, targets)
|
||||
sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
|
||||
sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
|
||||
sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)
|
||||
sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0), as_tuple=False).squeeze(1)
|
||||
sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0), as_tuple=False).squeeze(1)
|
||||
|
||||
sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)
|
||||
|
||||
|
|
|
@ -103,7 +103,7 @@ class RetinaNetPostProcessor(RPNPostProcessor):
|
|||
per_box_cls.topk(per_pre_nms_top_n, sorted=False)
|
||||
|
||||
per_candidate_nonzeros = \
|
||||
per_candidate_inds.nonzero()[top_k_indices, :]
|
||||
per_candidate_inds.nonzero(as_tuple=False)[top_k_indices, :]
|
||||
|
||||
per_box_loc = per_candidate_nonzeros[:, 0]
|
||||
per_class = per_candidate_nonzeros[:, 1]
|
||||
|
@ -138,7 +138,7 @@ class RetinaNetPostProcessor(RPNPostProcessor):
|
|||
result = []
|
||||
# skip the background
|
||||
for j in range(1, self.num_classes):
|
||||
inds = (labels == j).nonzero().view(-1)
|
||||
inds = (labels == j).nonzero(as_tuple=False).view(-1)
|
||||
|
||||
scores_j = scores[inds]
|
||||
boxes_j = boxes[inds, :].view(-1, 4)
|
||||
|
@ -167,7 +167,7 @@ class RetinaNetPostProcessor(RPNPostProcessor):
|
|||
number_of_detections - self.fpn_post_nms_top_n + 1
|
||||
)
|
||||
keep = cls_scores >= image_thresh.item()
|
||||
keep = torch.nonzero(keep).squeeze(1)
|
||||
keep = torch.nonzero(keep, as_tuple=False).squeeze(1)
|
||||
result = result[keep]
|
||||
results.append(result)
|
||||
return results
|
||||
|
|
|
@ -61,7 +61,7 @@ class RetinaNetLossComputation(RPNLossComputation):
|
|||
|
||||
labels = torch.cat(labels, dim=0)
|
||||
regression_targets = torch.cat(regression_targets, dim=0)
|
||||
pos_inds = torch.nonzero(labels > 0).squeeze(1)
|
||||
pos_inds = torch.nonzero(labels > 0, as_tuple=False).squeeze(1)
|
||||
|
||||
retinanet_regression_loss = smooth_l1_loss(
|
||||
box_regression[pos_inds],
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
||||
from .build import make_optimizer
|
||||
from .build import make_optimizer, make_optimizer_d2
|
||||
from .build import make_lr_scheduler
|
||||
from .lr_scheduler import WarmupMultiStepLR
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
||||
import itertools
|
||||
import torch
|
||||
|
||||
from .lr_scheduler import WarmupMultiStepLR
|
||||
|
@ -20,6 +21,65 @@ def make_optimizer(cfg, model):
|
|||
return optimizer
|
||||
|
||||
|
||||
def make_optimizer_d2(cfg, model):
|
||||
# default no decay parameters for resnets
|
||||
no_decay = ['bn.bias', 'bn.weight', 'bn1.bias', 'bn1.weight',
|
||||
'bn2.bias', 'bn2.weight', 'bn3.bias', 'bn3.weight']
|
||||
if hasattr(model.backbone.body, 'no_weight_decay'):
|
||||
no_decay = list(model.backbone.body.no_weight_decay())
|
||||
|
||||
params = []
|
||||
memo = set()
|
||||
for key, value in model.named_parameters(recurse=True):
|
||||
if not value.requires_grad:
|
||||
continue
|
||||
# Avoid duplicating parameters
|
||||
if value in memo:
|
||||
continue
|
||||
memo.add(value)
|
||||
lr = cfg.SOLVER.BASE_LR
|
||||
weight_decay = cfg.SOLVER.WEIGHT_DECAY
|
||||
if "bias" in key:
|
||||
lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
|
||||
weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
|
||||
|
||||
if any(nd in key for nd in no_decay):
|
||||
weight_decay = 0.0
|
||||
|
||||
params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]
|
||||
|
||||
def maybe_add_full_model_gradient_clipping(optim): # optim: the optimizer class
|
||||
# detectron2 doesn't have full model gradient clipping now
|
||||
clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
|
||||
enable = (
|
||||
cfg.SOLVER.CLIP_GRADIENTS.ENABLED
|
||||
and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
|
||||
and clip_norm_val > 0.0
|
||||
)
|
||||
|
||||
class FullModelGradientClippingOptimizer(optim):
|
||||
def step(self, closure=None):
|
||||
all_params = itertools.chain(*[x["params"] for x in self.param_groups])
|
||||
torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
|
||||
super().step(closure=closure)
|
||||
|
||||
return FullModelGradientClippingOptimizer if enable else optim
|
||||
|
||||
optimizer_type = cfg.SOLVER.OPTIMIZER
|
||||
if optimizer_type == "SGD":
|
||||
optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
|
||||
params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
|
||||
)
|
||||
elif optimizer_type == "ADAMW":
|
||||
optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
|
||||
params, cfg.SOLVER.BASE_LR
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError(f"no optimizer type {optimizer_type}")
|
||||
|
||||
return optimizer
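# Usage sketch (assuming the SOLVER.* fields referenced above are present in cfg):
#   optimizer = make_optimizer_d2(cfg, model)    # SGD or AdamW with per-parameter weight decay
#   scheduler = make_lr_scheduler(cfg, optimizer)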
|
||||
|
||||
|
||||
def make_lr_scheduler(cfg, optimizer):
|
||||
return WarmupMultiStepLR(
|
||||
optimizer,
|
||||
|
|
|
@ -45,7 +45,7 @@ def remove_small_boxes(boxlist, min_size):
|
|||
_, _, ws, hs = xywh_boxes.unbind(dim=1)
|
||||
keep = (
|
||||
(ws >= min_size) & (hs >= min_size)
|
||||
).nonzero().squeeze(1)
|
||||
).nonzero(as_tuple=False).squeeze(1)
|
||||
return boxlist[keep]
|
||||
|
||||
|
||||
|
|
|
@ -457,7 +457,7 @@ class PolygonList(object):
|
|||
# advanced indexing on a single dimension
|
||||
selected_polygons = []
|
||||
if isinstance(item, torch.Tensor) and item.dtype == torch.bool:
|
||||
item = item.nonzero()
|
||||
item = item.nonzero(as_tuple=False)
|
||||
item = item.squeeze(1) if item.numel() > 0 else item
|
||||
item = item.tolist()
|
||||
for i in item:
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
from contextlib import contextmanager
|
||||
|
||||
@contextmanager
|
||||
def nullcontext(enter_result=None, **kwargs):
|
||||
yield enter_result
|
||||
|
||||
try:
|
||||
from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd
|
||||
except ImportError:
print('[Warning] Library for automatic mixed precision was not found; AMP is disabled!')
|
||||
GradScaler = nullcontext
|
||||
autocast = nullcontext
|
||||
custom_fwd = nullcontext
|
||||
custom_bwd = nullcontext
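# With this fallback, call sites can keep the same syntax on PyTorch builds without AMP
# (a sketch; only the context-manager uses are true no-ops here):
#   with autocast():
#       loss = model(images)   # runs in full precision when torch.cuda.amp is unavailable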
|
|
@ -1,13 +1,38 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
||||
from collections import OrderedDict
|
||||
import logging
|
||||
import math
|
||||
|
||||
import torch
|
||||
|
||||
from maskrcnn_benchmark.utils.imports import import_file
|
||||
|
||||
|
||||
def align_and_update_state_dicts(model_state_dict, loaded_state_dict):
|
||||
def resize_pos_embed_1d(posemb, shape_new):
|
||||
# Rescale the grid of position embeddings when loading from state_dict.
|
||||
ntok_old = posemb.shape[1]
|
||||
if ntok_old > 1:
|
||||
ntok_new = shape_new[1]
|
||||
posemb_grid = posemb.permute(0, 2, 1).unsqueeze(dim=-1)
|
||||
posemb_grid = torch.nn.functional.interpolate(posemb_grid, size=[ntok_new, 1], mode='bilinear')
|
||||
posemb_grid = posemb_grid.squeeze(dim=-1).permute(0, 2, 1)
|
||||
posemb = posemb_grid
|
||||
return posemb
|
||||
|
||||
|
||||
def resize_pos_embed_2d(posemb, shape_new):
|
||||
# Rescale the grid of position embeddings when loading from state_dict. Adapted from
|
||||
# https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
|
||||
ntok_new = shape_new[0]
|
||||
gs_old = int(math.sqrt(len(posemb))) # 2 * w - 1
|
||||
gs_new = int(math.sqrt(ntok_new)) # 2 * w - 1
|
||||
posemb_grid = posemb.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
|
||||
posemb_grid = torch.nn.functional.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bilinear')
|
||||
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(gs_new * gs_new, -1)
|
||||
return posemb_grid
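# Example (shapes only, values hypothetical): a relative-position table built for window
# size w=7 has (2*7-1)**2 = 169 rows; resizing it for w=9 gives (2*9-1)**2 = 289 rows:
#   old_table = torch.randn(169, 12)
#   new_table = resize_pos_embed_2d(old_table, (289, 12))   # -> shape (289, 12)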
|
||||
|
||||
|
||||
def align_and_update_state_dicts(model_state_dict, loaded_state_dict, skip_unmatched_layers=True):
|
||||
"""
|
||||
Strategy: suppose that the models that we will create will have prefixes appended
|
||||
to each of its keys, for example due to an extra level of nesting that the original
|
||||
|
@ -41,11 +66,47 @@ def align_and_update_state_dicts(model_state_dict, loaded_state_dict):
|
|||
max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1
|
||||
log_str_template = "{: <{}} loaded from {: <{}} of shape {}"
|
||||
logger = logging.getLogger(__name__)
|
||||
# print out no match
|
||||
uninitialized_keys = [current_keys[idx_new] for idx_new, idx_old in enumerate(idxs.tolist()) if idx_old == -1]
|
||||
logger.info("Parameters not initialized from checkpoint: {}\n".format(
|
||||
','.join(uninitialized_keys)
|
||||
))
|
||||
for idx_new, idx_old in enumerate(idxs.tolist()):
|
||||
if idx_old == -1:
|
||||
continue
|
||||
key = current_keys[idx_new]
|
||||
key_old = loaded_keys[idx_old]
|
||||
if model_state_dict[key].shape != loaded_state_dict[
|
||||
key_old].shape and skip_unmatched_layers:
|
||||
if 'x_pos_embed' in key or 'y_pos_embed' in key:
|
||||
shape_old = loaded_state_dict[key_old].shape
|
||||
shape_new = model_state_dict[key].shape
|
||||
new_val = resize_pos_embed_1d(loaded_state_dict[key_old],
|
||||
shape_new)
|
||||
if shape_new == new_val.shape:
|
||||
model_state_dict[key] = new_val
|
||||
logger.info("[RESIZE] {} {} -> {} {}".format(
|
||||
key_old, shape_old, key, shape_new))
|
||||
else:
|
||||
logger.info("[WARNING]", "{} {} != {} {}, skip".format(
|
||||
key_old, new_val.shape, key, shape_new))
|
||||
elif 'local_relative_position_bias_table' in key:
|
||||
shape_old = loaded_state_dict[key_old].shape
|
||||
shape_new = model_state_dict[key].shape
|
||||
new_val = resize_pos_embed_2d(loaded_state_dict[key_old],
|
||||
shape_new)
|
||||
if shape_new == new_val.shape:
|
||||
model_state_dict[key] = new_val
|
||||
logger.info("[RESIZE] {} {} -> {} {}".format(
|
||||
key_old, shape_old, key, shape_new))
|
||||
else:
|
||||
logger.info("[WARNING]", "{} {} != {} {}, skip".format(
|
||||
key_old, new_val.shape, key, shape_new))
|
||||
else:
|
||||
# if layer weights do not match in size, skip this layer
|
||||
logger.info(
|
||||
"SKIPPING LAYER {} because of size mis-match".format(key))
|
||||
continue
|
||||
model_state_dict[key] = loaded_state_dict[key_old]
|
||||
logger.info(
|
||||
log_str_template.format(
|
||||
|
|
|
@ -5,6 +5,7 @@ Implements the FRCNN with Attribute Head
|
|||
import numpy as np
|
||||
import torch
|
||||
|
||||
from maskrcnn_benchmark.structures.bounding_box import BoxList
|
||||
from maskrcnn_benchmark.structures.image_list import to_image_list
|
||||
from maskrcnn_benchmark.modeling.detector.generalized_rcnn import \
|
||||
GeneralizedRCNN
|
||||
|
@ -56,6 +57,9 @@ class AttrRCNN(GeneralizedRCNN):
|
|||
"""
|
||||
if self.training and targets is None:
|
||||
raise ValueError("In training mode, targets should be passed")
|
||||
if self.force_boxes and targets is None:
|
||||
# note targets cannot be None but could have 0 box.
|
||||
raise ValueError("In force_boxes setting, targets should be passed")
|
||||
|
||||
images = to_image_list(images)
|
||||
images = images.to(self.device)
|
||||
|
@ -65,7 +69,20 @@ class AttrRCNN(GeneralizedRCNN):
|
|||
targets = [target.to(self.device)
|
||||
for target in targets if target is not None]
|
||||
|
||||
proposals, proposal_losses = self.rpn(images, features, targets)
|
||||
if self.force_boxes:
|
||||
proposals = [BoxList(target.bbox, target.size, target.mode)
|
||||
for target in targets]
|
||||
if self.training:
|
||||
# note we still need to compute a loss using all rpn
|
||||
# named parameters, otherwise it will
|
||||
# give unused_parameters error in distributed training.
|
||||
null_loss = 0
|
||||
for key, param in self.rpn.named_parameters():
|
||||
null_loss += 0.0 * param.sum()
|
||||
proposal_losses = {'rpn_null_loss': null_loss}
|
||||
else:
|
||||
proposals, proposal_losses = self.rpn(images, features, targets)
|
||||
|
||||
x, predictions, detector_losses = self.roi_heads(features,
|
||||
proposals, targets)
|
||||
|
||||
|
|
|
@ -48,9 +48,9 @@ class AttributeRCNNLossComputation(object):
|
|||
# prepare attribute targets
|
||||
sim_attributes = attribute_logits.new(attribute_logits.size()).zero_()
|
||||
for i in range(len(attributes)):
|
||||
if len(torch.nonzero(attributes[i])) > 0:
|
||||
sim_attributes[i][attributes[i][torch.nonzero(attributes[i])].long()] = 1.0 / len(
|
||||
torch.nonzero(attributes[i]))
|
||||
if len(torch.nonzero(attributes[i], as_tuple=False)) > 0:
|
||||
sim_attributes[i][attributes[i][torch.nonzero(attributes[i], as_tuple=False)].long()] = 1.0 / len(
|
||||
torch.nonzero(attributes[i], as_tuple=False))
|
||||
# TODO: do we need to ignore the all zero vector?
|
||||
attribute_loss = self.cross_entropy(attribute_logits, sim_attributes, loss_type="softmax")
|
||||
|
||||
|
|
|
@ -15,6 +15,10 @@ registry.ROI_ATTRIBUTE_FEATURE_EXTRACTORS.register(
|
|||
"FPNXconv1fcFeatureExtractor", FPNXconv1fcFeatureExtractor
|
||||
)
|
||||
|
||||
registry.ROI_ATTRIBUTE_FEATURE_EXTRACTORS.register(
|
||||
"ViTHeadFeatureExtractor", ViTHeadFeatureExtractor
|
||||
)
|
||||
|
||||
|
||||
def make_roi_attribute_feature_extractor(cfg, in_channels):
|
||||
func = registry.ROI_ATTRIBUTE_FEATURE_EXTRACTORS[
|
||||
|
|
|
@ -34,8 +34,8 @@ class BalancedPositiveNegativePairSampler(object):
|
|||
pos_idx = []
|
||||
neg_idx = []
|
||||
for matched_idxs_per_image in matched_idxs:
|
||||
positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1)
|
||||
negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1)
|
||||
positive = torch.nonzero(matched_idxs_per_image >= 1, as_tuple=False).squeeze(1)
|
||||
negative = torch.nonzero(matched_idxs_per_image == 0, as_tuple=False).squeeze(1)
|
||||
|
||||
num_pos = int(self.batch_size_per_image * self.positive_fraction)
|
||||
# protect against not enough positive examples
|
||||
|
|
|
@ -55,7 +55,7 @@ class FastRCNNLossComputation(object):
|
|||
match_j = match_quality_matrix[j].view(1, -1)
|
||||
match_ij = ((match_i + match_j) / 2)
|
||||
# remove duplicate index
|
||||
non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero().view(-1).to(match_ij.device)
|
||||
non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero(as_tuple=False).view(-1).to(match_ij.device)
|
||||
match_ij = match_ij.view(-1) # [::match_quality_matrix.shape[1]] = 0
|
||||
match_ij = match_ij[non_duplicate_idx]
|
||||
temp.append(match_ij)
|
||||
|
@ -79,7 +79,7 @@ class FastRCNNLossComputation(object):
|
|||
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposal.bbox.device)
|
||||
proposal_idx_pairs = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
|
||||
|
||||
non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero()
|
||||
non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False)
|
||||
proposal_box_pairs = proposal_box_pairs[non_duplicate_idx.view(-1)]
|
||||
proposal_idx_pairs = proposal_idx_pairs[non_duplicate_idx.view(-1)]
|
||||
proposal_pairs = BoxPairList(proposal_box_pairs, proposal.size, proposal.mode)
|
||||
|
@ -167,7 +167,7 @@ class FastRCNNLossComputation(object):
|
|||
for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
|
||||
zip(sampled_pos_inds, sampled_neg_inds)
|
||||
):
|
||||
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
|
||||
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img, as_tuple=False).squeeze(1)
|
||||
proposal_pairs_per_image = proposal_pairs[img_idx][img_sampled_inds]
|
||||
proposal_pairs[img_idx] = proposal_pairs_per_image
|
||||
|
||||
|
@ -245,13 +245,13 @@ class FastRCNNLossComputation(object):
|
|||
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposals[0].bbox.device)
|
||||
proposal_idx_pairs_per_image = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
|
||||
|
||||
keep_idx = (proposal_idx_pairs_per_image[:, 0] != proposal_idx_pairs_per_image[:, 1]).nonzero().view(-1)
|
||||
keep_idx = (proposal_idx_pairs_per_image[:, 0] != proposal_idx_pairs_per_image[:, 1]).nonzero(as_tuple=False).view(-1)
|
||||
|
||||
# if we filter non overlap bounding boxes
|
||||
if cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
|
||||
ious = boxlist_iou(proposals[0], proposals[0]).view(-1)
|
||||
ious = ious[keep_idx]
|
||||
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
|
||||
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
|
||||
# proposal_idx_pairs_per_image = proposal_idx_pairs_per_image[keep_idx]
|
||||
proposal_box_pairs_per_image = proposal_box_pairs_per_image[keep_idx]
|
||||
proposal_box_pairs.append(proposal_box_pairs_per_image)
|
||||
|
@ -361,7 +361,7 @@ class FastRCNNLossComputation(object):
|
|||
labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0)
|
||||
|
||||
# import pdb; pdb.set_trace()
|
||||
rel_fg_cnt = len(labels.nonzero())
|
||||
rel_fg_cnt = len(labels.nonzero(as_tuple=False))
|
||||
rel_bg_cnt = labels.shape[0] - rel_fg_cnt
|
||||
ce_weights = labels.new(class_logits.size(1)).fill_(1).float()
|
||||
ce_weights[0] = float(rel_fg_cnt) / (rel_bg_cnt + 1e-5)
|
||||
|
|
|
@ -113,7 +113,7 @@ class MSDN_BASE(nn.Module):
|
|||
requires_grad=True).type_as(target_features)
|
||||
feature_data.append(temp)
|
||||
else:
|
||||
transfer_list = (select_mat.data > 0).nonzero()
|
||||
transfer_list = (select_mat.data > 0).nonzero(as_tuple=False)
|
||||
source_indices = Variable(transfer_list[:, 1])
|
||||
target_indices = Variable(transfer_list[:, 0])
|
||||
source_f = torch.index_select(source_features, 0, source_indices)
|
||||
|
@ -122,7 +122,7 @@ class MSDN_BASE(nn.Module):
|
|||
|
||||
for f_id in range(target_features.size()[0]):
|
||||
if select_mat[f_id, :].data.sum() > 0:
|
||||
feature_indices = (transfer_list[:, 0] == f_id).nonzero()[0]
|
||||
feature_indices = (transfer_list[:, 0] == f_id).nonzero(as_tuple=False)[0]
|
||||
indices = Variable(feature_indices)
|
||||
features = torch.index_select(transferred_features, 0,
|
||||
indices).mean(0).view(-1)
|
||||
|
|
|
@ -93,7 +93,7 @@ class PairMatcher(object):
|
|||
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
|
||||
# Find highest quality match available, even if it is low, including ties
|
||||
gt_pred_pairs_of_highest_quality = torch.nonzero(
|
||||
match_quality_matrix == highest_quality_foreach_gt[:, None]
|
||||
match_quality_matrix == highest_quality_foreach_gt[:, None], as_tuple=False
|
||||
)
|
||||
# Example gt_pred_pairs_of_highest_quality:
|
||||
# tensor([[ 0, 39796],
|
||||
|
|
|
@ -98,13 +98,13 @@ class ROIRelationHead(torch.nn.Module):
|
|||
proposal_label_pairs = torch.cat(
|
||||
(label_subj.view(-1, 1), label_obj.view(-1, 1)), 1)
|
||||
|
||||
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero().view(-1)
|
||||
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False).view(-1)
|
||||
|
||||
# if we filter non overlap bounding boxes
|
||||
if self.cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
|
||||
ious = boxlist_iou(proposals_per_image, proposals_per_image).view(-1)
|
||||
ious = ious[keep_idx]
|
||||
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
|
||||
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
|
||||
proposal_idx_pairs = proposal_idx_pairs[keep_idx]
|
||||
proposal_box_pairs = proposal_box_pairs[keep_idx]
|
||||
proposal_label_pairs = proposal_label_pairs[keep_idx]
|
||||
|
|
|
@ -46,7 +46,7 @@ class RelPN(nn.Module):
|
|||
match_ij = ((match_i + match_j) / 2)
|
||||
# remove duplicate index
|
||||
match_ij = match_ij.view(-1) # [::match_quality_matrix.shape[1]] = 0
|
||||
# non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero().view(-1).to(match_ij.device)
|
||||
# non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero(as_tuple=False).view(-1).to(match_ij.device)
|
||||
# match_ij = match_ij[non_duplicate_idx]
|
||||
temp.append(match_ij)
|
||||
boxi = target.bbox[i]; boxj = target.bbox[j]
|
||||
|
@ -68,7 +68,7 @@ class RelPN(nn.Module):
|
|||
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposal.bbox.device)
|
||||
proposal_idx_pairs = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
|
||||
|
||||
# non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero()
|
||||
# non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False)
|
||||
# proposal_box_pairs = proposal_box_pairs[non_duplicate_idx.view(-1)]
|
||||
# proposal_idx_pairs = proposal_idx_pairs[non_duplicate_idx.view(-1)]
|
||||
|
||||
|
@@ -184,13 +184,13 @@ class RelPN(nn.Module):
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposals_per_image.bbox.device)
proposal_idx_pairs = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)

keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero().view(-1)
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False).view(-1)

# if we filter non overlap bounding boxes
if self.cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
ious = boxlist_iou(proposals_per_image, proposals_per_image).view(-1)
ious = ious[keep_idx]
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
proposal_idx_pairs = proposal_idx_pairs[keep_idx]
proposal_box_pairs = proposal_box_pairs[keep_idx]
proposal_pairs_per_image = BoxPairList(proposal_box_pairs, proposals_per_image.size, proposals_per_image.mode)
@@ -212,11 +212,11 @@ class RelPN(nn.Module):
obj_logits = proposals_per_image.get_field('scores_all')
obj_bboxes = proposals_per_image.bbox
relness = self.relationshipness(obj_logits, obj_bboxes, proposals_per_image.size)
keep_idx = (1 - torch.eye(obj_logits.shape[0]).to(relness.device)).view(-1).nonzero().view(-1)
keep_idx = (1 - torch.eye(obj_logits.shape[0]).to(relness.device)).view(-1).nonzero(as_tuple=False).view(-1)
if self.cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
ious = boxlist_iou(proposals_per_image, proposals_per_image).view(-1)
ious = ious[keep_idx]
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
relness = relness.view(-1)[keep_idx]
relness_sorted, order = torch.sort(relness.view(-1), descending=True)
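The (1 - eye) expression above is the relatedness-specific variant of the same filtering: it selects every off-diagonal entry of the flattened N x N score matrix, i.e. all subject != object pairs, before the IoU filter and the sort by relatedness.

    import torch

    n = 3
    keep_idx = (1 - torch.eye(n)).view(-1).nonzero(as_tuple=False).view(-1)
    # tensor([1, 2, 3, 5, 6, 7]) - every flat index except the diagonal positions 0, 4, 8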
@@ -266,7 +266,7 @@ class RelPN(nn.Module):
proposals = self._proposal_pairs
labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0)

rel_fg_cnt = len(labels.nonzero())
rel_fg_cnt = len(labels.nonzero(as_tuple=False))
rel_bg_cnt = labels.shape[0] - rel_fg_cnt
ce_weights = labels.new(class_logits.size(1)).fill_(1).float()
ce_weights[0] = float(rel_fg_cnt) / (rel_bg_cnt + 1e-5)
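ce_weights compensates for the heavy background/foreground imbalance among sampled relation pairs: class 0 (no relation) is down-weighted by the foreground-to-background ratio while all predicate classes keep weight 1. A minimal sketch, assuming the weights feed a standard weighted cross-entropy (the diff only shows the weight construction):

    import torch
    import torch.nn.functional as F

    labels = torch.tensor([0, 0, 0, 0, 0, 0, 12, 7])     # assumed sampled pair labels, mostly background
    class_logits = torch.randn(8, 51)                     # assumed (num_pairs, num_predicate_classes)

    rel_fg_cnt = len(labels.nonzero(as_tuple=False))               # 2
    rel_bg_cnt = labels.shape[0] - rel_fg_cnt                      # 6
    ce_weights = labels.new(class_logits.size(1)).fill_(1).float()
    ce_weights[0] = float(rel_fg_cnt) / (rel_bg_cnt + 1e-5)        # ~0.33, so background pairs count less
    loss = F.cross_entropy(class_logits, labels, weight=ce_weights)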
@@ -1,6 +1,6 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
WEIGHT: "/home/xiaothan/c/Users/xiaothan/Downloads/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
WEIGHT: "pretrained_models/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
BACKBONE:
CONV_BODY: "R-152-FPN"
RESNETS:
@@ -0,0 +1,70 @@
MODEL:
META_ARCHITECTURE: "AttrRCNN"
WEIGHT: "/mnt/model_storage/msvit/IN22kpretrained/deepbase_relative/model_best.pth"
BACKBONE:
CONV_BODY: "ViL-C4"
TRANSFORMER:
DROP: 0.0
DROP_PATH: 0.3
NORM_EMBED: True
OUT_FEATURES: ["layer3"]
VITHEADARCH: "l4,h12,d768,n1,s0,g0,p2,f7,a0"
MSVIT:
ARCH: "l1,h3,d96,n1,s1,g1,p4,f7,a0_l2,h3,d192,n8,s1,g1,p2,f7,a0_l3,h6,d384,n24,s1,g1,p2,f7,a0"
ATTN_TYPE: longformerhand
ONLY_GLOBAL: False
SHARE_KV: True
SHARE_W: True
SW_EXACT: 0
RPN:
PRE_NMS_TOP_N_TEST: 6000
POST_NMS_TOP_N_TEST: 300
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 384 # 512
POSITIVE_FRACTION: 0.5 # 0.25
SCORE_THRESH: 0.05 # 0.0001
DETECTIONS_PER_IMG: 100 # 600
MIN_DETECTIONS_PER_IMG: 10
ROI_BOX_HEAD:
NUM_CLASSES: 1595
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ROI_ATTRIBUTE_HEAD:
NUM_ATTRIBUTES: 525
POSTPROCESS_ATTRIBUTES_THRESHOLD: 0.0
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ATTRIBUTE_ON: False
INPUT:
MIN_SIZE_TEST: 600
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:
BASE_LR: 0.00008
WEIGHT_DECAY: 0.05
STEPS: (75000, 100000)
MAX_ITER: 170000
IMS_PER_BATCH: 1
CHECKPOINT_PERIOD: 5000
OPTIMIZER: "ADAMW"
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
TEST:
IMS_PER_BATCH: 1
SKIP_PERFORMANCE_EVAL: False
SAVE_PREDICTIONS: True
SAVE_RESULTS_TO_TSV: True
TSV_SAVE_SUBSET: ['rect', 'class', 'conf']
GATHER_ON_CPU: False
OUTPUT_DIR: "./output/vilc4_test"
DATA_DIR: "./datasets"
DISTRIBUTED_BACKEND: 'nccl'
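These new ViL-C4 configs are consumed like any other YAML in the repo: the extra scene-graph keys are allowed via set_new_allowed before the merges, exactly as the test/train scripts further down do. A minimal loading sketch (the config filename and weight path are illustrative, not the committed names):

    from maskrcnn_benchmark.config import cfg
    from scene_graph_benchmark.config import sg_cfg

    cfg.set_new_allowed(True)
    cfg.merge_from_other_cfg(sg_cfg)
    cfg.merge_from_file("sgg_configs/vilc4/vil_c4_deepbase.yaml")          # illustrative path
    cfg.merge_from_list(["MODEL.WEIGHT", "pretrained_model/vil_c4.pth"])   # illustrative override
    cfg.freeze()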
@@ -0,0 +1,70 @@
MODEL:
META_ARCHITECTURE: "AttrRCNN"
WEIGHT: "/mnt/model_storage/msvit/IN22kpretrained/villarge_relative/model_best.pth"
BACKBONE:
CONV_BODY: "ViL-C4"
TRANSFORMER:
DROP: 0.0
DROP_PATH: 0.5
NORM_EMBED: True
OUT_FEATURES: ["layer3"]
VITHEADARCH: "l4,h24,d1536,n1,s0,g0,p2,f7,a0"
MSVIT:
ARCH: "l1,h3,d192,n1,s1,g1,p4,f7,a0_l2,h6,d384,n8,s1,g1,p2,f7,a0_l3,h12,d768,n24,s1,g1,p2,f7,a0"
ATTN_TYPE: longformerhand
ONLY_GLOBAL: False
SHARE_KV: True
SHARE_W: True
SW_EXACT: 0
RPN:
PRE_NMS_TOP_N_TEST: 6000
POST_NMS_TOP_N_TEST: 300
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 384 # 512
POSITIVE_FRACTION: 0.5 # 0.25
SCORE_THRESH: 0.05 # 0.0001
DETECTIONS_PER_IMG: 100 # 600
MIN_DETECTIONS_PER_IMG: 10
ROI_BOX_HEAD:
NUM_CLASSES: 1595
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ROI_ATTRIBUTE_HEAD:
NUM_ATTRIBUTES: 525
POSTPROCESS_ATTRIBUTES_THRESHOLD: 0.0
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ATTRIBUTE_ON: False
INPUT:
MIN_SIZE_TEST: 600
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:
BASE_LR: 0.00008
WEIGHT_DECAY: 0.05
STEPS: (75000, 100000)
MAX_ITER: 170000
IMS_PER_BATCH: 1
CHECKPOINT_PERIOD: 5000
OPTIMIZER: "ADAMW"
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
TEST:
IMS_PER_BATCH: 1
SKIP_PERFORMANCE_EVAL: False
SAVE_PREDICTIONS: True
SAVE_RESULTS_TO_TSV: True
TSV_SAVE_SUBSET: ['rect', 'class', 'conf']
GATHER_ON_CPU: False
OUTPUT_DIR: "./output/vilc4_test"
DATA_DIR: "./datasets"
DISTRIBUTED_BACKEND: 'nccl'
@@ -0,0 +1,70 @@
MODEL:
META_ARCHITECTURE: "AttrRCNN"
WEIGHT: "/mnt/model_storage/msvit/visionlongformer/longtiny1191_ape0_exact0_nglo1_mode1_swith075/model_best.pth"
BACKBONE:
CONV_BODY: "ViL-C4"
TRANSFORMER:
DROP: 0.0
DROP_PATH: 0.1
NORM_EMBED: True
OUT_FEATURES: ["layer3"]
VITHEADARCH: "l4,h12,d768,n1,s0,g0,p2,f7,a0"
MSVIT:
ARCH: "l1,h3,d96,n1,s1,g1,p4,f7,a0_l2,h3,d192,n2,s1,g1,p2,f7,a0_l3,h6,d384,n8,s1,g1,p2,f7,a0"
ATTN_TYPE: longformerhand
ONLY_GLOBAL: False
SHARE_KV: True
SHARE_W: True
SW_EXACT: 0
RPN:
PRE_NMS_TOP_N_TEST: 6000
POST_NMS_TOP_N_TEST: 300
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 384 # 512
POSITIVE_FRACTION: 0.5 # 0.25
SCORE_THRESH: 0.05 # 0.0001
DETECTIONS_PER_IMG: 100 # 600
MIN_DETECTIONS_PER_IMG: 10
ROI_BOX_HEAD:
NUM_CLASSES: 1595
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ROI_ATTRIBUTE_HEAD:
NUM_ATTRIBUTES: 525
POSTPROCESS_ATTRIBUTES_THRESHOLD: 0.0
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ATTRIBUTE_ON: False
INPUT:
MIN_SIZE_TEST: 600
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:
BASE_LR: 0.0001
WEIGHT_DECAY: 0.05
STEPS: (75000, 100000)
MAX_ITER: 170000
IMS_PER_BATCH: 1
CHECKPOINT_PERIOD: 5000
OPTIMIZER: "ADAMW"
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
TEST:
IMS_PER_BATCH: 1
SKIP_PERFORMANCE_EVAL: False
SAVE_PREDICTIONS: True
SAVE_RESULTS_TO_TSV: True
TSV_SAVE_SUBSET: ['rect', 'class', 'conf']
GATHER_ON_CPU: False
OUTPUT_DIR: "./output/vilc4_test"
DATA_DIR: "./datasets"
DISTRIBUTED_BACKEND: 'nccl'
@@ -28,9 +28,13 @@ INPUT:
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [103.530, 116.280, 123.675]
DATASETS:
FACTORY_TEST: ("ODTSVDataset",)
TEST: ("flickr30k/tsv/flickr30k.yaml",)
# FACTORY_TEST: ("ODTSVDataset",)
# TEST: ("flickr30k/tsv/flickr30k.yaml",)
LABELMAP_FILE: "visualgenome/VG-SGG-dicts-vgoi6-clipped.json"
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:
@@ -1,7 +1,6 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
WEIGHT: "pretrained_model/RX152FPN_reldn_oi_best.pth"
# WEIGHT: "/home/xiaothan/c/Users/xiaothan/Downloads/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
USE_FREQ_PRIOR: False
FREQ_PRIOR: "openimages_v5c/vrd/vrd_frequency_prior_include_background.npy"
BACKBONE:
@@ -1,7 +1,6 @@
MODEL:
META_ARCHITECTURE: "SceneParser"
WEIGHT: "pretrained_model/RX152FPN_reldn_oi_best.pth"
# WEIGHT: "/home/xiaothan/c/Users/xiaothan/Downloads/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
USE_FREQ_PRIOR: False
FREQ_PRIOR: "openimages_v5c/vrd/vrd_frequency_prior_include_background.npy"
BACKBONE:
@@ -18,12 +18,6 @@ from maskrcnn_benchmark.utils.comm import synchronize, get_rank
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.miscellaneous import mkdir

# Check if we can enable mixed-precision via apex.amp
try:
from apex import amp
except ImportError:
raise ImportError('Use APEX for mixed precision via apex.amp')


def main():
parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference")

@@ -73,10 +67,6 @@ def main():
model = build_detection_model(cfg)
model.to(cfg.MODEL.DEVICE)

# Initialize mixed-precision if necessary
use_mixed_precision = cfg.DTYPE == 'float16'
amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)

output_dir = cfg.OUTPUT_DIR
checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt
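Dropping the hard apex requirement fits the PyTorch 1.7 upgrade: mixed precision now ships natively as torch.cuda.amp, so the script no longer fails to import when apex is absent. The diff simply removes amp from inference; if half-precision inference were still wanted, a hedged sketch of the native replacement would look like this (model, cfg and images as prepared elsewhere in the script):

    import torch

    model.eval()
    use_mixed_precision = cfg.DTYPE == 'float16'
    with torch.no_grad(), torch.cuda.amp.autocast(enabled=use_mixed_precision):
        predictions = model(images)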
@@ -6,6 +6,7 @@ from maskrcnn_benchmark.utils.env import setup_environment  # noqa F401 isort:skip

import argparse
import os
import json

import torch
from maskrcnn_benchmark.config import cfg
@@ -21,11 +22,112 @@ from maskrcnn_benchmark.utils.comm import synchronize, get_rank
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.miscellaneous import mkdir

# Check if we can enable mixed-precision via apex.amp
# try:
# from apex import amp
# except ImportError:
# raise ImportError('Use APEX for mixed precision via apex.amp')

def run_test(cfg, model, distributed, model_name):
if distributed and hasattr(model, 'module'):
model = model.module
torch.cuda.empty_cache()  # TODO check if it helps
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
if len(dataset_names) == 1:
output_folder = os.path.join(
cfg.OUTPUT_DIR, "inference",
os.path.splitext(model_name)[0]
)
mkdir(output_folder)
output_folders = [output_folder]
else:
for idx, dataset_name in enumerate(dataset_names):
dataset_name1 = dataset_name.replace('/', '_')
output_folder = os.path.join(
cfg.OUTPUT_DIR, "inference",
dataset_name1,
os.path.splitext(model_name)[0]
)
mkdir(output_folder)
output_folders[idx] = output_folder
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
labelmap_file = config_dataset_file(cfg.DATA_DIR, cfg.DATASETS.LABELMAP_FILE)
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
results = inference(
model,
cfg,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
bbox_aug=cfg.TEST.BBOX_AUG.ENABLED,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
skip_performance_eval=cfg.TEST.SKIP_PERFORMANCE_EVAL,
labelmap_file=labelmap_file,
save_predictions=cfg.TEST.SAVE_PREDICTIONS,
)

# renaming box_proposals metric to rpn_proposals if RPN_ONLY is True
if results and 'box_proposal' in results and cfg.MODEL.RPN_ONLY:
results['rpn_proposal'] = results.pop('box_proposal')

if results and output_folder:
results_path = os.path.join(output_folder, "results.json")
# checking if this file already exists and only updating tasks
# that are already present. This is useful for including
# e.g. RPN_ONLY metrics
if os.path.isfile(results_path):
with open(results_path, 'rt') as fin:
old_results = json.load(fin)
old_results.update(results)
results = old_results
with open(results_path, 'wt') as fout:
json.dump(results, fout)

synchronize()

# evaluate attribute detection
if not cfg.MODEL.RPN_ONLY and cfg.MODEL.ATTRIBUTE_ON and (not cfg.TEST.SKIP_PERFORMANCE_EVAL):
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
for output_folder, dataset_name, data_loader_val in zip(
output_folders, dataset_names, data_loaders_val
):
results_attr = inference(
model,
cfg,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
skip_performance_eval=cfg.TEST.SKIP_PERFORMANCE_EVAL,
labelmap_file=labelmap_file,
save_predictions=cfg.TEST.SAVE_PREDICTIONS,
eval_attributes=True,
)

if results_attr and output_folder:
results_path = os.path.join(output_folder, "results.json")
# checking if this file already exists and only updating tasks
# that are already present. This is useful for including
# e.g. RPN_ONLY metrics
if os.path.isfile(results_path):
with open(results_path, 'rt') as fin:
old_results = json.load(fin)
old_results.update(results_attr)
results_attr = old_results
with open(results_path, 'wt') as fout:
json.dump(results_attr, fout)

synchronize()


def main():
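The new run_test owns all of the evaluation bookkeeping: per-dataset output folders under OUTPUT_DIR/inference/..., and a results.json that is merged rather than overwritten, so re-running with a different task (for example RPN-only proposals or attribute evaluation) only updates the matching keys. Reading it back is plain JSON; the path below is illustrative:

    import json
    import os

    results_path = os.path.join("output/vilc4_test", "inference", "model_final", "results.json")
    with open(results_path, "rt") as fin:
        results = json.load(fin)
    print(sorted(results.keys()))   # e.g. ['bbox'] or ['bbox', 'rpn_proposal'], depending on the run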
@@ -52,7 +154,7 @@ def main():
args = parser.parse_args()

num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
distributed = num_gpus > 1
args.distributed = num_gpus > 1

cfg.set_new_allowed(True)
cfg.merge_from_other_cfg(sg_cfg)

@@ -61,7 +163,7 @@ def main():
cfg.merge_from_list(args.opts)
cfg.freeze()

if distributed:
if args.distributed:
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(
backend=cfg.DISTRIBUTED_BACKEND, init_method="env://"
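Whether the run is distributed is derived purely from the WORLD_SIZE environment variable that python -m torch.distributed.launch sets for each worker, so the same entry point works for single- and multi-GPU runs. Condensed, the pattern is:

    import os
    import torch

    local_rank = 0   # in the real script this comes from the --local_rank argument injected by the launcher
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    if num_gpus > 1:
        torch.cuda.set_device(local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")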
@@ -82,47 +184,13 @@ def main():
model = AttrRCNN(cfg)
model.to(cfg.MODEL.DEVICE)

# Initialize mixed-precision if necessary
# use_mixed_precision = cfg.DTYPE == 'float16'
# amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)

output_dir = cfg.OUTPUT_DIR
checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt
_ = checkpointer.load(ckpt, use_latest=args.ckpt is None)
model_name = os.path.basename(ckpt)

iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
for idx, dataset_name in enumerate(dataset_names):
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
mkdir(output_folder)
output_folders[idx] = output_folder
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
labelmap_file = config_dataset_file(cfg.DATA_DIR, cfg.DATASETS.LABELMAP_FILE)
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
inference(
model,
cfg,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
bbox_aug=cfg.TEST.BBOX_AUG.ENABLED,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
skip_performance_eval=cfg.TEST.SKIP_PERFORMANCE_EVAL,
labelmap_file=labelmap_file,
save_predictions=cfg.TEST.SAVE_PREDICTIONS,
)
synchronize()
run_test(cfg, model, args.distributed, model_name)


if __name__ == "__main__":
@@ -15,12 +15,14 @@ import torch
from maskrcnn_benchmark.config import cfg
from scene_graph_benchmark.config import sg_cfg
from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.data.datasets.utils.load_files import config_dataset_file
from maskrcnn_benchmark.solver import make_lr_scheduler
from maskrcnn_benchmark.solver import make_optimizer
from maskrcnn_benchmark.solver import make_optimizer, make_optimizer_d2
from maskrcnn_benchmark.engine.inference import inference
from maskrcnn_benchmark.engine.trainer import do_train
from maskrcnn_benchmark.modeling.detector import build_detection_model
from scene_graph_benchmark.scene_parser import SceneParser
from scene_graph_benchmark.AttrRCNN import AttrRCNN
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.utils.collect_env import collect_env_info
from maskrcnn_benchmark.utils.comm import synchronize, get_rank
@@ -28,13 +30,7 @@ from maskrcnn_benchmark.utils.imports import import_file
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
from maskrcnn_benchmark.utils.miscellaneous import mkdir, save_config

# See if we can use apex.DistributedDataParallel instead of the torch default,
# and enable mixed-precision via apex.amp
try:
from apex import amp
except ImportError:
raise ImportError('Use APEX for multi-precision via apex.amp')
from tools.test_sg_net import run_test

import random
import numpy as np
@@ -50,23 +46,24 @@ torch.backends.cudnn.deterministic = True


def train(cfg, local_rank, distributed):
model = SceneParser(cfg)
if cfg.MODEL.META_ARCHITECTURE == "SceneParser":
model = SceneParser(cfg)
elif cfg.MODEL.META_ARCHITECTURE == "AttrRCNN":
model = AttrRCNN(cfg)
device = torch.device(cfg.MODEL.DEVICE)
model.to(device)

optimizer = make_optimizer(cfg, model)
if cfg.MODEL.BACKBONE.CONV_BODY.startswith("ViL"):
optimizer = make_optimizer_d2(cfg, model)
else:
optimizer = make_optimizer(cfg, model)
scheduler = make_lr_scheduler(cfg, optimizer)

# # Initialize mixed-precision training
# use_mixed_precision = cfg.DTYPE == "float16"
# amp_opt_level = 'O1' if use_mixed_precision else 'O0'
# model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

if distributed:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[local_rank], output_device=local_rank,
# this should be removed if we update BatchNorm stats
broadcast_buffers=False,
broadcast_buffers=False, find_unused_parameters=True
)

arguments = {}
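Two training-path details are worth noting: ViL backbones switch to make_optimizer_d2 (a Detectron2-style optimizer builder, presumably so the transformer gets its own weight-decay and learning-rate grouping) while other backbones keep make_optimizer, and DistributedDataParallel now passes find_unused_parameters=True, which tolerates branches such as the attribute head producing no gradients in a given step. Condensed from the diff (model and local_rank come from the surrounding train()):

    import torch

    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[local_rank], output_device=local_rank,
        broadcast_buffers=False,         # unchanged: do not broadcast BatchNorm buffers across workers
        find_unused_parameters=True,     # new: allow parameters that receive no gradient in a pass
    )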
@@ -116,39 +113,6 @@
return model


def run_test(cfg, model, distributed):
if distributed:
model = model.module
torch.cuda.empty_cache()  # TODO check if it helps
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
for idx, dataset_name in enumerate(dataset_names):
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
mkdir(output_folder)
output_folders[idx] = output_folder
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
inference(
model,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
bbox_aug=cfg.TEST.BBOX_AUG.ENABLED,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
)
synchronize()


def main():
parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
parser.add_argument(
@@ -216,7 +180,7 @@ def main():
model = train(cfg, args.local_rank, args.distributed)

if not args.skip_test:
run_test(cfg, model, args.distributed)
run_test(cfg, model, args.distributed, model_name="model_final")


if __name__ == "__main__":