Merge pull request #26 from microsoft/torch17

Torch17
Xiaotian Han 2021-06-28 17:55:14 -07:00 committed by GitHub
Parent 2df1912d1e 462a68f3e1
Commit a93180e85c
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
79 changed files: 2829 additions and 391 deletions

View file

@ -1,10 +1,11 @@
## Installation
### Requirements:
- PyTorch 1.4
- PyTorch 1.7
- torchvision
- cocoapi
- yacs
- yacs>=0.1.8
- numpy>=1.19.5
- matplotlib
- GCC >= 4.9
- OpenCV
@ -25,9 +26,10 @@ conda activate sg_benchmark
conda install ipython h5py nltk joblib jupyter pandas scipy
# maskrcnn_benchmark and coco api dependencies
pip install ninja yacs==0.1.8 cython matplotlib tqdm opencv-python numpy=1.19.5
pip install ninja yacs>=0.1.8 cython matplotlib tqdm opencv-python numpy>=1.19.5
conda install pytorch==1.4.0 torchvision==0.5.0 cudatoolkit=10.1 -c pytorch
conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.1 -c pytorch
conda install -c conda-forge timm einops
# install pycocotools
conda install -c conda-forge pycocotools
@ -35,9 +37,6 @@ conda install -c conda-forge pycocotools
# install cityscapesScripts
python -m pip install cityscapesscripts
# install apex
conda install -c conda-forge nvidia-apex
# install Scene Graph Detection
git clone https://github.com/microsoft/scene_graph_benchmark
cd scene_graph_benchmark
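Once the dependencies above are installed, a quick sanity check can confirm that the upgraded PyTorch 1.7 / CUDA 10.1 stack is the one actually in use. This is a minimal illustrative sketch, not part of the repository's scripts:

```python
import torch
import torchvision

# Illustrative environment check for the PyTorch 1.7 upgrade (not repository code).
print("torch:", torch.__version__)                 # expect 1.7.x
print("torchvision:", torchvision.__version__)     # expect 0.8.x
print("CUDA available:", torch.cuda.is_available())
print("CUDA toolkit (build):", torch.version.cuda) # expect "10.1" per the conda command above
```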

View file

@ -1,15 +1,12 @@
# Scene Graph Benchmark in PyTorch 1.4
# Scene Graph Benchmark in PyTorch 1.7
**This project is based on [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark)**
This project aims at providing the necessary building blocks for easily
creating detection and segmentation models using PyTorch 1.0.
![alt text](demo/R152FPN_demo.png "from https://storage.googleapis.com/openimages/web/index.html")
## Highlights
- **Upgrad to pytorch 1.4 (can also upgrade to 1.7)**
- **Upgrade to PyTorch 1.7**
- **Multi-GPU training and inference**
- **Batched inference:** can perform inference using multiple images per batch per GPU.
- **Fast and flexible tsv dataset format**
@ -35,15 +32,16 @@ Here is how we would do it. Run the following commands:
# visualize VinVL object detection
# pretrained models at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/vinvl_vg_x152c4.pth
# the associated labelmap at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/VG-SGG-dicts-vgoi6-clipped.json
python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file ../maskrcnn-benchmark-1/datasets1/imgs/woman_fish.jpg --save_file output/woman_fish_x152c4.obj.jpg MODEL.WEIGHT models/vinvl/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 DATA_DIR "../maskrcnn-benchmark-1/datasets1" TEST.IGNORE_BOX_REGRESSION False
python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file demo/woman_fish.jpg --save_file output/woman_fish_x152c4.obj.jpg MODEL.WEIGHT pretrained_model/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 TEST.IGNORE_BOX_REGRESSION False
# visualize VinVL object-attribute detection
# pretrained models at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/vinvl_vg_x152c4.pth
# the associated labelmap at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/VG-SGG-dicts-vgoi6-clipped.json
python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file ../maskrcnn-benchmark-1/datasets1/imgs/woman_fish.jpg --save_file output/woman_fish_x152c4.attr.jpg --visualize_attr MODEL.WEIGHT models/vinvl/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 DATA_DIR "../maskrcnn-benchmark-1/datasets1" TEST.IGNORE_BOX_REGRESSION False
python tools/demo/demo_image.py --config_file sgg_configs/vgattr/vinvl_x152c4.yaml --img_file demo/woman_fish.jpg --save_file output/woman_fish_x152c4.attr.jpg --visualize_attr MODEL.WEIGHT pretrained_model/vinvl_vg_x152c4.pth MODEL.ROI_HEADS.NMS_FILTER 1 MODEL.ROI_HEADS.SCORE_THRESH 0.2 TEST.IGNORE_BOX_REGRESSION False
# visualize OpenImage scene graph generation by RelDN
python tools/demo/demo_image.py --config_file sgg_configs/vrd/R152FPN_vrd_reldn.yaml --img_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.jpg --save_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa_output.jpg --visualize_relation MODEL.ROI_RELATION_HEAD.DETECTOR_PRE_CALCULATED False
# pretrained models at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/sgg_model_zoo/sgg_oi_vrd_model_zoo/RX152FPN_reldn_oi_best.pth
python tools/demo/demo_image.py --config_file sgg_configs/vrd/R152FPN_vrd_reldn.yaml --img_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.jpg --save_file output/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.reldn_relation.jpg --visualize_relation MODEL.ROI_RELATION_HEAD.DETECTOR_PRE_CALCULATED False
# visualize Visual Genome scene graph generation by neural motif
python tools/demo/demo_image.py --config_file sgg_configs/vg_vrd/rel_danfeiX_FPN50_nm.yaml --img_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa.jpg --save_file demo/1024px-Gen_Robert_E_Lee_on_Traveler_at_Gettysburg_Pa_vgnm.jpg --visualize_relation MODEL.ROI_RELATION_HEAD.DETECTOR_PRE_CALCULATED False DATASETS.LABELMAP_FILE "visualgenome/VG-SGG-dicts-danfeiX-clipped.json" DATA_DIR /home/penzhan/GitHub/maskrcnn-benchmark-1/datasets1 MODEL.ROI_RELATION_HEAD.USE_BIAS True MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP True MODEL.ROI_HEADS.DETECTIONS_PER_IMG 64 MODEL.ROI_RELATION_HEAD.SHARE_BOX_FEATURE_EXTRACTOR False MODEL.ROI_RELATION_HEAD.NEURAL_MOTIF.OBJ_LSTM_NUM_LAYERS 0 MODEL.ROI_RELATION_HEAD.NEURAL_MOTIF.EDGE_LSTM_NUM_LAYERS 2 TEST.IMS_PER_BATCH 2

View file

@ -273,7 +273,7 @@ class COCODemo(object):
the BoxList via `prediction.fields()`
"""
scores = predictions.get_field("scores")
keep = torch.nonzero(scores > self.confidence_threshold).squeeze(1)
keep = torch.nonzero(scores > self.confidence_threshold, as_tuple=False).squeeze(1)
predictions = predictions[keep]
scores = predictions.get_field("scores")
_, idx = scores.sort(0, descending=True)
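The only functional change in this hunk is the explicit `as_tuple=False` argument: PyTorch 1.5+ warns when `torch.nonzero` is called without `as_tuple`, and passing `False` keeps the familiar `(N, ndim)` index tensor, so the kept indices are unchanged. A standalone sketch of the equivalence (illustrative, not repository code):

```python
import torch

# Calling torch.nonzero without as_tuple triggers a deprecation-style warning on
# PyTorch >= 1.5; as_tuple=False preserves the old single-tensor (N, ndim) result.
scores = torch.tensor([0.9, 0.1, 0.8, 0.3])
confidence_threshold = 0.5

keep_old = torch.nonzero(scores > confidence_threshold).squeeze(1)                  # warns
keep_new = torch.nonzero(scores > confidence_threshold, as_tuple=False).squeeze(1)  # silent
assert torch.equal(keep_old, keep_new)  # both are tensor([0, 2])

# as_tuple=True instead returns a tuple of index tensors, one per dimension.
(keep_tuple,) = torch.nonzero(scores > confidence_threshold, as_tuple=True)
assert torch.equal(keep_tuple, keep_new)
```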

Binary data
demo/woman_fish.jpg (new file)

Binary file not shown. Size: 1.7 MiB

View file

@ -1,7 +1,7 @@
ARG CUDA="10.1"
ARG CUDNN="7"
FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu16.04
FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu18.04
RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
@ -36,20 +36,20 @@ RUN pip --no-cache-dir install --force-reinstall -I pyyaml
RUN python -m nltk.downloader punkt
# Install latest PyTorch 1.4
# Install latest PyTorch 1.7.1
ARG CUDA
RUN conda install pytorch~=1.4.0 torchvision cudatoolkit=${CUDA} -c pytorch \
RUN conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.1 -c pytorch \
&& conda clean -ya
RUN conda install -y -c conda-forge timm einops
# install pycocotools
RUN git clone https://github.com/cocodataset/cocoapi.git \
&& cd cocoapi/PythonAPI \
&& python setup.py build_ext install
# RUN git clone https://github.com/cocodataset/cocoapi.git \
# && cd cocoapi/PythonAPI \
# && python setup.py build_ext install
RUN conda install -y -c conda-forge pycocotools
# install apex
RUN git clone https://github.com/NVIDIA/apex.git \
&& cd apex \
&& python setup.py install --cuda_ext --cpp_ext
# install cityscapesScripts
RUN python -m pip install cityscapesscripts
# install PyTorch Detection
ARG FORCE_CUDA="1"
@ -61,7 +61,7 @@ RUN echo """syntax on\nfiletype indent on\nset autoindent\nset number\ncolorsche
CMD [ "zsh" ]
# RUN git clone https://github.com/hanxiaotian/scene_graph_benchmark.git \
# RUN git clone https://github.com/microsoft/scene_graph_benchmark.git \
# && cd scene_graph_benchmark \
# && python setup.py build develop

View file

@ -56,11 +56,6 @@ RUN git clone https://github.com/cocodataset/cocoapi.git \
&& cd cocoapi/PythonAPI \
&& python setup.py build_ext install
# install apex
RUN git clone https://github.com/NVIDIA/apex.git \
&& cd apex \
&& python setup.py install --cuda_ext --cpp_ext
# install PyTorch Detection
ARG FORCE_CUDA="1"
ENV FORCE_CUDA=${FORCE_CUDA}

View file

@ -178,6 +178,8 @@ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
_C.MODEL.RPN.FPN_POST_NMS_PER_BATCH = True
# Custom rpn head, empty to use default conv or separable conv
_C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead"
# use gt target box as proposals for roi_heads (shared in training and testing)
_C.MODEL.RPN.FORCE_BOXES = False
# ---------------------------------------------------------------------------- #
@ -302,6 +304,28 @@ _C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False)
_C.MODEL.RESNETS.WITH_MODULATED_DCN = False
_C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1
# ---------------------------------------------------------------------------- #
# Vision Transformer Options
# ---------------------------------------------------------------------------- #
_C.MODEL.TRANSFORMER = CN()
_C.MODEL.TRANSFORMER.DROP = 0.0
_C.MODEL.TRANSFORMER.DROP_PATH = 0.1
_C.MODEL.TRANSFORMER.NORM_EMBED = True
_C.MODEL.TRANSFORMER.AVG_POOL = False
_C.MODEL.TRANSFORMER.VITHEADARCH = 'l4,h12,d768,n1,s0,g0,p2,f7,a0'
_C.MODEL.TRANSFORMER.MSVIT = CN()
_C.MODEL.TRANSFORMER.MSVIT.ARCH = 'l1,h3,d96,n1,s1,g1,p4,f7,a0_l2,h3,d192,n2,s1,g1,p2,f7,a0_l3,h6,d384,n8,s1,g1,p2,f7,a0_l4,h12,d768,n1,s1,g0,p2,f7,a0'
_C.MODEL.TRANSFORMER.MSVIT.SHARE_W = True
_C.MODEL.TRANSFORMER.MSVIT.ATTN_TYPE = 'longformerhand'
_C.MODEL.TRANSFORMER.MSVIT.SHARE_KV = True
_C.MODEL.TRANSFORMER.MSVIT.ONLY_GLOBAL = False
_C.MODEL.TRANSFORMER.MSVIT.SW_EXACT = 0
_C.MODEL.TRANSFORMER.MSVIT.LN_EPS = 1e-6
_C.MODEL.TRANSFORMER.MSVIT.MODE = 0
_C.MODEL.TRANSFORMER.MSVIT.REDRAW_INTERVAL = 1000
_C.MODEL.TRANSFORMER.OUT_FEATURES = []
# ---------------------------------------------------------------------------- #
# RetinaNet Options (Follow the Detectron version)
@ -430,6 +454,15 @@ _C.SOLVER.TEST_PERIOD = 0
# see 2 images per batch
_C.SOLVER.IMS_PER_BATCH = 16
_C.SOLVER.USE_AMP = False
_C.SOLVER.OPTIMIZER = 'SGD' # also support ADAMW
_C.SOLVER.CLIP_GRADIENTS = CN()
_C.SOLVER.CLIP_GRADIENTS.ENABLED = False
_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0
_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
# ---------------------------------------------------------------------------- #
# Specific test options
# ---------------------------------------------------------------------------- #
@ -487,6 +520,7 @@ _C.TEST.IGNORE_BOX_REGRESSION = False
_C.OUTPUT_DIR = "."
_C.DATA_DIR = "./datasets"
_C.DISTRIBUTED_BACKEND = "nccl" # could be "nccl", "gloo" or "mpi"
_C.LOG_LOSS_PERIOD = 20
_C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
@ -496,6 +530,3 @@ _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
# Precision of input, allowable: (float32, float16)
_C.DTYPE = "float32"
# Enable verbosity in apex.amp
_C.AMP_VERBOSE = False
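The new SOLVER options above (`USE_AMP`, `OPTIMIZER`, and the `CLIP_GRADIENTS` node) line up with PyTorch 1.7's native `torch.cuda.amp` and `torch.nn.utils.clip_grad_norm_`, consistent with the apex-related removals elsewhere in this diff. Below is a minimal, hypothetical sketch of how a training step could consume these config values; the function names, the scaler wiring, and the use of `cfg.SOLVER.BASE_LR` are assumptions for illustration, not this repository's actual trainer code:

```python
import torch
from torch.nn.utils import clip_grad_norm_

def make_optimizer(cfg, model):
    # OPTIMIZER is 'SGD' by default; 'ADAMW' is the documented alternative.
    params = [p for p in model.parameters() if p.requires_grad]
    if cfg.SOLVER.OPTIMIZER == "ADAMW":
        return torch.optim.AdamW(params, lr=cfg.SOLVER.BASE_LR)
    return torch.optim.SGD(params, lr=cfg.SOLVER.BASE_LR, momentum=0.9)

def train_step(cfg, model, optimizer, scaler, images, targets):
    optimizer.zero_grad()
    # Run the forward pass under autocast only when AMP is enabled in the config.
    with torch.cuda.amp.autocast(enabled=cfg.SOLVER.USE_AMP):
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())
    scaler.scale(losses).backward()
    if cfg.SOLVER.CLIP_GRADIENTS.ENABLED:
        # Unscale first so the clip threshold applies to the true gradient norm;
        # CLIP_TYPE == "full_model" is read here as clipping the whole model at once.
        scaler.unscale_(optimizer)
        clip_grad_norm_(model.parameters(),
                        cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE,
                        norm_type=cfg.SOLVER.CLIP_GRADIENTS.NORM_TYPE)
    scaler.step(optimizer)
    scaler.update()
    return losses.detach()

# Usage (illustrative):
#   scaler = torch.cuda.amp.GradScaler(enabled=cfg.SOLVER.USE_AMP)
#   loss = train_step(cfg, model, optimizer, scaler, images, targets)
```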

View file

@ -14,7 +14,7 @@ at::Tensor ROIAlign_forward(const at::Tensor& input,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
#else
@ -34,7 +34,7 @@ at::Tensor ROIAlign_backward(const at::Tensor& grad,
const int height,
const int width,
const int sampling_ratio) {
if (grad.type().is_cuda()) {
if (grad.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
#else

View file

@ -13,7 +13,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input,
const float spatial_scale,
const int pooled_height,
const int pooled_width) {
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
#else
@ -34,7 +34,7 @@ at::Tensor ROIPool_backward(const at::Tensor& grad,
const int channels,
const int height,
const int width) {
if (grad.type().is_cuda()) {
if (grad.device().is_cuda()) {
#ifdef WITH_CUDA
return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
#else

View file

@ -13,7 +13,7 @@ at::Tensor SigmoidFocalLoss_forward(
const int num_classes,
const float gamma,
const float alpha) {
if (logits.type().is_cuda()) {
if (logits.device().is_cuda()) {
#ifdef WITH_CUDA
return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
#else
@ -30,7 +30,7 @@ at::Tensor SigmoidFocalLoss_backward(
const int num_classes,
const float gamma,
const float alpha) {
if (logits.type().is_cuda()) {
if (logits.device().is_cuda()) {
#ifdef WITH_CUDA
return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
#else

View file

@ -91,7 +91,7 @@ void pre_calc_for_bilinear_interpolate(
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
// save weights and indeces
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
@ -168,8 +168,8 @@ void ROIAlignForward_cpu_kernel(
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
// we want to precalculate indeces and weights shared by all chanels,
// this is the key point of optimiation
std::vector<PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
@ -224,8 +224,8 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor");
AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor");
AT_ASSERTM(!input.device().is_cuda(), "input must be a CPU tensor");
AT_ASSERTM(!rois.device().is_cuda(), "rois must be a CPU tensor");
auto num_rois = rois.size(0);
auto channels = input.size(1);
@ -239,10 +239,10 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
return output;
}
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
ROIAlignForward_cpu_kernel<scalar_t>(
output_size,
input.data<scalar_t>(),
input.data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
@ -250,8 +250,8 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
pooled_height,
pooled_width,
sampling_ratio,
rois.data<scalar_t>(),
output.data<scalar_t>());
rois.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
return output;
}

View file

@ -6,8 +6,8 @@ template <typename scalar_t>
at::Tensor nms_cpu_kernel(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold) {
AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
if (dets.numel() == 0) {
@ -26,13 +26,13 @@ at::Tensor nms_cpu_kernel(const at::Tensor& dets,
auto ndets = dets.size(0);
at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data<uint8_t>();
auto order = order_t.data<int64_t>();
auto x1 = x1_t.data<scalar_t>();
auto y1 = y1_t.data<scalar_t>();
auto x2 = x2_t.data<scalar_t>();
auto y2 = y2_t.data<scalar_t>();
auto areas = areas_t.data<scalar_t>();
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<scalar_t>();
auto y1 = y1_t.data_ptr<scalar_t>();
auto x2 = x2_t.data_ptr<scalar_t>();
auto y2 = y2_t.data_ptr<scalar_t>();
auto areas = areas_t.data_ptr<scalar_t>();
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
@ -68,7 +68,7 @@ at::Tensor nms_cpu(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold) {
at::Tensor result;
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
});
return result;

View file

@ -260,8 +260,8 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
const int pooled_height,
const int pooled_width,
const int sampling_ratio) {
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
auto num_rois = rois.size(0);
auto channels = input.size(1);
@ -272,7 +272,7 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
auto output_size = num_rois * pooled_height * pooled_width * channels;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
dim3 block(512);
if (output.numel() == 0) {
@ -280,10 +280,10 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
return output;
}
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input.contiguous().data<scalar_t>(),
input.contiguous().data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
@ -291,8 +291,8 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
pooled_height,
pooled_width,
sampling_ratio,
rois.contiguous().data<scalar_t>(),
output.data<scalar_t>());
rois.contiguous().data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
THCudaCheck(cudaGetLastError());
return output;
@ -309,15 +309,15 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
const int height,
const int width,
const int sampling_ratio) {
AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
auto num_rois = rois.size(0);
auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
dim3 block(512);
// handle possibly empty gradients
@ -326,10 +326,10 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
return grad_input;
}
AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] {
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad.contiguous().data<scalar_t>(),
grad.contiguous().data_ptr<scalar_t>(),
num_rois,
spatial_scale,
channels,
@ -338,8 +338,8 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
pooled_height,
pooled_width,
sampling_ratio,
grad_input.data<scalar_t>(),
rois.contiguous().data<scalar_t>());
grad_input.data_ptr<scalar_t>(),
rois.contiguous().data_ptr<scalar_t>());
});
THCudaCheck(cudaGetLastError());
return grad_input;

View file

@ -112,8 +112,8 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
const float spatial_scale,
const int pooled_height,
const int pooled_width) {
AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
auto num_rois = rois.size(0);
auto channels = input.size(1);
@ -126,7 +126,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
dim3 block(512);
if (output.numel() == 0) {
@ -134,19 +134,19 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
return std::make_tuple(output, argmax);
}
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] {
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] {
RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
output_size,
input.contiguous().data<scalar_t>(),
input.contiguous().data_ptr<scalar_t>(),
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
rois.contiguous().data<scalar_t>(),
output.data<scalar_t>(),
argmax.data<int>());
rois.contiguous().data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(),
argmax.data_ptr<int>());
});
THCudaCheck(cudaGetLastError());
return std::make_tuple(output, argmax);
@ -164,8 +164,8 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
const int channels,
const int height,
const int width) {
AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
// TODO add more checks
auto num_rois = rois.size(0);
@ -173,7 +173,7 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
dim3 block(512);
// handle possibly empty gradients
@ -182,11 +182,11 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
return grad_input;
}
AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] {
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] {
RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(),
grad.contiguous().data<scalar_t>(),
argmax.data<int>(),
grad.contiguous().data_ptr<scalar_t>(),
argmax.data_ptr<int>(),
num_rois,
spatial_scale,
channels,
@ -194,8 +194,8 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
width,
pooled_height,
pooled_width,
grad_input.data<scalar_t>(),
rois.contiguous().data<scalar_t>());
grad_input.data_ptr<scalar_t>(),
rois.contiguous().data_ptr<scalar_t>());
});
THCudaCheck(cudaGetLastError());
return grad_input;

View file

@ -107,8 +107,8 @@ at::Tensor SigmoidFocalLoss_forward_cuda(
const int num_classes,
const float gamma,
const float alpha) {
AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
const int num_samples = logits.size(0);
@ -117,8 +117,7 @@ at::Tensor SigmoidFocalLoss_forward_cuda(
auto losses_size = num_samples * logits.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)losses_size, 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
dim3 block(512);
if (losses.numel() == 0) {
@ -126,16 +125,16 @@ at::Tensor SigmoidFocalLoss_forward_cuda(
return losses;
}
AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] {
AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
losses_size,
logits.contiguous().data<scalar_t>(),
targets.contiguous().data<int>(),
logits.contiguous().data_ptr<scalar_t>(),
targets.contiguous().data_ptr<int>(),
num_classes,
gamma,
alpha,
num_samples,
losses.data<scalar_t>());
losses.data_ptr<scalar_t>());
});
THCudaCheck(cudaGetLastError());
return losses;
@ -149,9 +148,9 @@ at::Tensor SigmoidFocalLoss_backward_cuda(
const int num_classes,
const float gamma,
const float alpha) {
AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor");
AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor");
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
@ -162,7 +161,7 @@ at::Tensor SigmoidFocalLoss_backward_cuda(
auto d_logits_size = num_samples * logits.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(THCCeilDiv((long)d_logits_size, 512L), 4096L));
dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
dim3 block(512);
if (d_logits.numel() == 0) {
@ -170,17 +169,17 @@ at::Tensor SigmoidFocalLoss_backward_cuda(
return d_logits;
}
AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] {
AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
d_logits_size,
logits.contiguous().data<scalar_t>(),
targets.contiguous().data<int>(),
d_losses.contiguous().data<scalar_t>(),
logits.contiguous().data_ptr<scalar_t>(),
targets.contiguous().data_ptr<int>(),
d_losses.contiguous().data_ptr<scalar_t>(),
num_classes,
gamma,
alpha,
num_samples,
d_logits.data<scalar_t>());
d_logits.data_ptr<scalar_t>());
});
THCudaCheck(cudaGetLastError());

View file

@ -1,10 +1,6 @@
// modify from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
#ifndef AT_CHECK
#define AT_CHECK TORCH_CHECK
#endif
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
@ -73,26 +69,26 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
int padW, int dilationH, int dilationW, int group,
int deformable_group)
{
AT_CHECK(weight.ndimension() == 4,
TORCH_CHECK(weight.ndimension() == 4,
"4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
"but got: %s",
weight.ndimension());
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
AT_CHECK(kW > 0 && kH > 0,
TORCH_CHECK(kW > 0 && kH > 0,
"kernel size should be greater than zero, but got kH: %d kW: %d", kH,
kW);
AT_CHECK((weight.size(2) == kH && weight.size(3) == kW),
TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
"kernel size should be consistent with weight, ",
"but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
kW, weight.size(2), weight.size(3));
AT_CHECK(dW > 0 && dH > 0,
TORCH_CHECK(dW > 0 && dH > 0,
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
AT_CHECK(
TORCH_CHECK(
dilationW > 0 && dilationH > 0,
"dilation should be greater than 0, but got dilationH: %d dilationW: %d",
dilationH, dilationW);
@ -108,7 +104,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
dimw++;
}
AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
ndim);
long nInputPlane = weight.size(1) * group;
@ -120,7 +116,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
long outputWidth =
(inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
AT_CHECK(nInputPlane % deformable_group == 0,
TORCH_CHECK(nInputPlane % deformable_group == 0,
"input channels must divide deformable group size");
if (outputWidth < 1 || outputHeight < 1)
@ -130,27 +126,27 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
outputWidth);
AT_CHECK(input.size(1) == nInputPlane,
TORCH_CHECK(input.size(1) == nInputPlane,
"invalid number of input planes, expected: %d, but got: %d",
nInputPlane, input.size(1));
AT_CHECK((inputHeight >= kH && inputWidth >= kW),
TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
"input image is smaller than kernel");
AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
"invalid spatial size of offset, expected height: %d width: %d, but "
"got height: %d width: %d",
outputHeight, outputWidth, offset.size(2), offset.size(3));
AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
"invalid number of channels of offset");
if (gradOutput != NULL) {
AT_CHECK(gradOutput->size(dimf) == nOutputPlane,
TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
"invalid number of gradOutput planes, expected: %d, but got: %d",
nOutputPlane, gradOutput->size(dimf));
AT_CHECK((gradOutput->size(dimh) == outputHeight &&
TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
gradOutput->size(dimw) == outputWidth),
"invalid size of gradOutput, expected height: %d width: %d , but "
"got height: %d width: %d",
@ -201,7 +197,7 @@ int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
outputHeight, outputWidth});
@ -308,7 +304,7 @@ int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
@ -424,7 +420,7 @@ int deform_conv_backward_parameters_cuda(
long outputHeight =
(inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
columns = at::zeros(
{nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
@ -505,8 +501,8 @@ void modulated_deform_conv_cuda_forward(
const int dilation_w, const int group, const int deformable_group,
const bool with_bias)
{
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
@ -587,8 +583,8 @@ void modulated_deform_conv_cuda_backward(
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias)
{
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);

View file

@ -264,10 +264,10 @@ void deformable_im2col(
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.type(), "deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
scalar_t *data_col_ = data_col.data<scalar_t>();
data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w,
@ -358,10 +358,10 @@ void deformable_col2im(
int channel_per_deformable_group = channels / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
scalar_t *grad_im_ = grad_im.data<scalar_t>();
data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_col_, data_offset_, channels, height, width, ksize_h,
@ -456,11 +456,11 @@ void deformable_col2im_coord(
int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data<scalar_t>();
const scalar_t *data_im_ = data_im.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data<scalar_t>();
data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_col_, data_im_, data_offset_, channels, height, width,
@ -786,11 +786,11 @@ void modulated_deformable_im2col_cuda(
const int num_kernels = channels * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.type(), "modulated_deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
const scalar_t *data_mask_ = data_mask.data<scalar_t>();
scalar_t *data_col_ = data_col.data<scalar_t>();
data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
modulated_deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w,
@ -818,11 +818,11 @@ void modulated_deformable_col2im_cuda(
const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "modulated_deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
const scalar_t *data_mask_ = data_mask.data<scalar_t>();
scalar_t *grad_im_ = grad_im.data<scalar_t>();
data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
modulated_deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im,
@ -851,13 +851,13 @@ void modulated_deformable_col2im_coord_cuda(
const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data<scalar_t>();
const scalar_t *data_im_ = data_im.data<scalar_t>();
const scalar_t *data_offset_ = data_offset.data<scalar_t>();
const scalar_t *data_mask_ = data_mask.data<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data<scalar_t>();
scalar_t *grad_mask_ = grad_mask.data<scalar_t>();
data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
modulated_deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im,

View file

@ -5,10 +5,6 @@
// author: Charles Shang
// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
#ifndef AT_CHECK
#define AT_CHECK TORCH_CHECK
#endif
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
@ -43,7 +39,7 @@ void deform_psroi_pooling_cuda_forward(
const int output_dim, const int group_size, const int pooled_size,
const int part_size, const int sample_per_part, const float trans_std)
{
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);
@ -69,8 +65,8 @@ void deform_psroi_pooling_cuda_backward(
const int group_size, const int pooled_size, const int part_size,
const int sample_per_part, const float trans_std)
{
AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
const int batch = input.size(0);
const int channels = input.size(1);

View file

@ -290,12 +290,12 @@ void DeformablePSROIPoolForward(const at::Tensor data,
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data.type(), "deformable_psroi_pool_forward", ([&] {
const scalar_t *bottom_data = data.data<scalar_t>();
const scalar_t *bottom_rois = bbox.data<scalar_t>();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
scalar_t *top_data = out.data<scalar_t>();
scalar_t *top_count_data = top_count.data<scalar_t>();
data.scalar_type(), "deformable_psroi_pool_forward", ([&] {
const scalar_t *bottom_data = data.data_ptr<scalar_t>();
const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
scalar_t *top_data = out.data_ptr<scalar_t>();
scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width,
@ -341,14 +341,14 @@ void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] {
const scalar_t *top_diff = out_grad.data<scalar_t>();
const scalar_t *bottom_data = data.data<scalar_t>();
const scalar_t *bottom_rois = bbox.data<scalar_t>();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
scalar_t *bottom_data_diff = in_grad.data<scalar_t>();
scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data<scalar_t>();
const scalar_t *top_count_data = top_count.data<scalar_t>();
out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] {
const scalar_t *top_diff = out_grad.data_ptr<scalar_t>();
const scalar_t *bottom_data = data.data_ptr<scalar_t>();
const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
scalar_t *bottom_data_diff = in_grad.data_ptr<scalar_t>();
scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data_ptr<scalar_t>();
const scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,

View file

@ -69,7 +69,7 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
// boxes is a N x 5 tensor
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
using scalar_t = float;
AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
auto scores = boxes.select(1, 4);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto boxes_sorted = boxes.index_select(0, order_t);
@ -78,7 +78,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
@ -106,7 +106,7 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data<int64_t>();
int64_t* keep_out = keep.data_ptr<int64_t>();
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {

View file

@ -27,7 +27,7 @@ int deform_conv_forward(
int deformable_group,
int im2col_step)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return deform_conv_forward_cuda(
input, weight, offset, output, columns, ones,
@ -62,7 +62,7 @@ int deform_conv_backward_input(
int deformable_group,
int im2col_step)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return deform_conv_backward_input_cuda(
input, offset, gradOutput, gradInput, gradOffset, weight, columns,
@ -97,7 +97,7 @@ int deform_conv_backward_parameters(
float scale,
int im2col_step)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return deform_conv_backward_parameters_cuda(
input, offset, gradOutput, gradWeight, columns, ones,
@ -133,7 +133,7 @@ void modulated_deform_conv_forward(
const int deformable_group,
const bool with_bias)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return modulated_deform_conv_cuda_forward(
input, weight, bias, ones, offset, mask, output, columns,
@ -175,7 +175,7 @@ void modulated_deform_conv_backward(
int deformable_group,
const bool with_bias)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return modulated_deform_conv_cuda_backward(
input, weight, bias, ones, offset, mask, columns,

View file

@ -23,7 +23,7 @@ void deform_psroi_pooling_forward(
const int sample_per_part,
const float trans_std)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return deform_psroi_pooling_cuda_forward(
input, bbox, trans, out, top_count,
@ -55,7 +55,7 @@ void deform_psroi_pooling_backward(
const int sample_per_part,
const float trans_std)
{
if (input.type().is_cuda()) {
if (input.device().is_cuda()) {
#ifdef WITH_CUDA
return deform_psroi_pooling_cuda_backward(
out_grad, input, bbox, trans, top_count, input_grad, trans_grad,

View file

@ -11,7 +11,7 @@ at::Tensor nms(const at::Tensor& dets,
const at::Tensor& scores,
const float threshold) {
if (dets.type().is_cuda()) {
if (dets.device().is_cuda()) {
#ifdef WITH_CUDA
// TODO raise error if not compiled with CUDA
if (dets.numel() == 0)

View file

@ -14,7 +14,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward");
m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward");
m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward");
// dcn-v2
m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input");
m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters");
@ -22,4 +21,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward");
m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward");
m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward");
}
}

View file

@ -6,6 +6,8 @@ from .voc import voc_evaluation
from .cityscapes import abs_cityscapes_evaluation
from .sg import sg_evaluation
from .openimages_vrd import openimages_vrd_evaluation
from .vg import vg_evaluation
def evaluate(dataset, predictions, output_folder, **kwargs):
"""evaluate dataset using different methods based on dataset type.
@ -28,7 +30,10 @@ def evaluate(dataset, predictions, output_folder, **kwargs):
elif isinstance(dataset, datasets.OpenImagesVRDTSVDataset):
return openimages_vrd_evaluation(**args)
elif isinstance(dataset, datasets.VGTSVDataset):
return sg_evaluation(**args)
if 'sg_eval' in args and args['sg_eval']:
return sg_evaluation(**args)
else:
return vg_evaluation(**args)
elif isinstance(dataset, datasets.AbstractDataset):
return abs_cityscapes_evaluation(**args)
else:

View file

@ -120,18 +120,18 @@ def evaluate(gt_classes, gt_boxes, gt_rels,
return (None, None)
rel_sum = ((gt_rels.sum(1) > 0).int() + (gt_rels.sum(0) > 0).int())
ix_w_rel = rel_sum.nonzero().numpy().squeeze()
ix_w_rel = rel_sum.nonzero(as_tuple=False).numpy().squeeze()
# label = (((gt_rel_label.sum(1) == 0).int() + (gt_rel_label.sum(0) == 0).int()) == 2)
# change_ix = label.nonzero()
# change_ix = label.nonzero(as_tuple=False)
gt_boxes = gt_boxes.numpy()
num_gt_boxes = gt_boxes.shape[0]
gt_relations = gt_rels.nonzero().numpy()
gt_relations = gt_rels.nonzero(as_tuple=False).numpy()
gt_classes = gt_classes.view(-1, 1).numpy()
gt_rels_view = gt_rels.contiguous().view(-1)
gt_pred_labels = gt_rels_view[gt_rels_view.nonzero().squeeze()].contiguous().view(-1, 1).numpy()
gt_pred_labels = gt_rels_view[gt_rels_view.nonzero(as_tuple=False).squeeze()].contiguous().view(-1, 1).numpy()
num_gt_relations = gt_relations.shape[0]
if num_gt_relations == 0:

View file

@ -0,0 +1,119 @@
import torch
from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou
# inspired from Detectron
def evaluate_box_proposals(
predictions, dataset, thresholds=None, area="all", limit=None
):
"""Evaluate detection proposal recall metrics. This function is a much
faster alternative to the official COCO API recall evaluation code. However,
it produces slightly different results.
"""
# Record max overlap value for each gt box
# Return vector of overlap values
areas = {
"all": 0,
"small": 1,
"medium": 2,
"large": 3,
"96-128": 4,
"128-256": 5,
"256-512": 6,
"512-inf": 7,
}
area_ranges = [
[0 ** 2, 1e5 ** 2], # all
[0 ** 2, 32 ** 2], # small
[32 ** 2, 96 ** 2], # medium
[96 ** 2, 1e5 ** 2], # large
[96 ** 2, 128 ** 2], # 96-128
[128 ** 2, 256 ** 2], # 128-256
[256 ** 2, 512 ** 2], # 256-512
[512 ** 2, 1e5 ** 2],
] # 512-inf
assert area in areas, "Unknown area range: {}".format(area)
area_range = area_ranges[areas[area]]
gt_overlaps = []
num_pos = 0
for image_id, prediction in sorted(predictions.items()):
img_info = dataset.get_img_info(image_id)
image_width = img_info["width"]
image_height = img_info["height"]
prediction = prediction.resize((image_width, image_height))
# deal with ground truth
gt_boxes = dataset.get_groundtruth(image_id)
# filter out the field "relations"
gt_boxes = gt_boxes.copy_with_fields(['attributes', 'labels'])
gt_areas = gt_boxes.area()
if len(gt_boxes) == 0:
continue
valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
gt_boxes = gt_boxes[valid_gt_inds]
num_pos += len(gt_boxes)
if len(gt_boxes) == 0:
continue
# sort predictions in descending order
# TODO maybe remove this and make it explicit in the documentation
_gt_overlaps = torch.zeros(len(gt_boxes))
if len(prediction) == 0:
gt_overlaps.append(_gt_overlaps)
continue
if "objectness" in prediction.extra_fields:
inds = prediction.get_field("objectness").sort(descending=True)[1]
elif "scores" in prediction.extra_fields:
inds = prediction.get_field("scores").sort(descending=True)[1]
else:
raise ValueError("Neither objectness nor scores is in the extra_fields!")
prediction = prediction[inds]
if limit is not None and len(prediction) > limit:
prediction = prediction[:limit]
overlaps = boxlist_iou(prediction, gt_boxes)
for j in range(min(len(prediction), len(gt_boxes))):
# find which proposal box maximally covers each gt box
# and get the iou amount of coverage for each gt box
max_overlaps, argmax_overlaps = overlaps.max(dim=0)
# find which gt box is 'best' covered (i.e. 'best' = most iou)
gt_ovr, gt_ind = max_overlaps.max(dim=0)
assert gt_ovr >= 0
# find the proposal box that covers the best covered gt box
box_ind = argmax_overlaps[gt_ind]
# record the iou coverage of this gt box
_gt_overlaps[j] = overlaps[box_ind, gt_ind]
assert _gt_overlaps[j] == gt_ovr
# mark the proposal box and the gt box as used
overlaps[box_ind, :] = -1
overlaps[:, gt_ind] = -1
# append recorded iou coverage level
gt_overlaps.append(_gt_overlaps)
gt_overlaps = torch.cat(gt_overlaps, dim=0)
gt_overlaps, _ = torch.sort(gt_overlaps)
if thresholds is None:
step = 0.05
thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
recalls = torch.zeros_like(thresholds)
# compute recall for each iou threshold
for i, t in enumerate(thresholds):
recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
# ar = 2 * np.trapz(recalls, thresholds)
ar = recalls.mean()
return {
"ar": ar,
"recalls": recalls,
"thresholds": thresholds,
"gt_overlaps": gt_overlaps,
"num_pos": num_pos,
}

View file

@ -0,0 +1,16 @@
import logging
from .vg_eval import do_vg_evaluation
def vg_evaluation(dataset, predictions, output_folder, box_only, eval_attributes, **_):
logger = logging.getLogger("maskrcnn_benchmark.inference")
logger.info("performing vg evaluation, ignored iou_types.")
return do_vg_evaluation(
dataset=dataset,
predictions=predictions,
output_folder=output_folder,
box_only=box_only,
eval_attributes=eval_attributes,
logger=logger,
)

View file

@ -0,0 +1,391 @@
# A modification version from chainercv repository.
# (See https://github.com/chainer/chainercv/blob/master/chainercv/evaluations/eval_detection_voc.py)
from __future__ import division
import os
import numpy as np
import torch
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.data.datasets.evaluation.utils import evaluate_box_proposals
def do_vg_evaluation(dataset, predictions, output_folder, box_only, eval_attributes, logger, save_predictions=True):
# TODO need to make the use_07_metric format available
# for the user to choose
# we use int for box_only. 0: False, 1: box for RPN, 2: box for object detection,
if box_only:
if box_only == 1:
limits = [100, 1000]
elif box_only == 2:
limits = [36, 99]
else:
raise ValueError("box_only can be either 0/1/2, but get {0}".format(box_only))
areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
result = {}
for area, suffix in areas.items():
for limit in limits:
logger.info("Evaluating bbox proposals@{:d}".format(limit))
stats = evaluate_box_proposals(
predictions, dataset, area=area, limit=limit
)
key_ar = "AR{}@{:d}".format(suffix, limit)
key_num_pos = "num_pos{}@{:d}".format(suffix, limit)
result[key_num_pos] = stats["num_pos"]
result[key_ar] = stats["ar"].item()
key_recalls = "Recalls{}@{:d}".format(suffix, limit)
# result[key_recalls] = stats["recalls"]
print(key_recalls, stats["recalls"])
print(key_ar, "ar={:.4f}".format(result[key_ar]))
print(key_num_pos, "num_pos={:d}".format(result[key_num_pos]))
logger.info(result)
logger.info(result)
# check_expected_results(result, expected_results, expected_results_sigma_tol)
if output_folder and save_predictions:
if box_only == 1:
torch.save(result, os.path.join(output_folder, "rpn_proposals.pth"))
elif box_only == 2:
torch.save(result, os.path.join(output_folder, "box_proposals.pth"))
else:
raise ValueError("box_only can be either 0/1/2, but get {0}".format(box_only))
return {"box_proposal": result}
pred_boxlists = []
gt_boxlists = []
for image_id, prediction in sorted(predictions.items()):
img_info = dataset.get_img_info(image_id)
if len(prediction) == 0:
continue
image_width = img_info["width"]
image_height = img_info["height"]
prediction = prediction.resize((image_width, image_height))
pred_boxlists.append(prediction)
gt_boxlist = dataset.get_groundtruth(image_id)
gt_boxlists.append(gt_boxlist)
if eval_attributes:
classes = dataset.attributes
else:
classes = dataset.classes
result = eval_detection_voc(
pred_boxlists=pred_boxlists,
gt_boxlists=gt_boxlists,
classes=classes,
iou_thresh=0.5,
eval_attributes=eval_attributes,
use_07_metric=False,
)
result_str = "mAP: {:.4f}\n".format(result["map"])
for i, ap in enumerate(result["ap"]):
# if i == 0: # skip background
# continue
# we skipped background in result['ap'], so we need to use i+1
if eval_attributes:
result_str += "{:<16}: {:.4f}\n".format(
dataset.map_attribute_id_to_attribute_name(i+1), ap
)
else:
result_str += "{:<16}: {:.4f}\n".format(
dataset.map_class_id_to_class_name(i+1), ap
)
logger.info(result_str)
# return mAP and weighted mAP
if eval_attributes:
if output_folder and save_predictions:
with open(os.path.join(output_folder, "result_attr.txt"), "w") as fid:
fid.write(result_str)
return {"attr": {"map": result["map"], "weighted map": result["weighted map"]}}
else:
if output_folder and save_predictions:
with open(os.path.join(output_folder, "result_obj.txt"), "w") as fid:
fid.write(result_str)
return {"obj": {"map": result["map"], "weighted map": result["weighted map"]}}
def eval_detection_voc(pred_boxlists, gt_boxlists, classes, iou_thresh=0.5, eval_attributes=False, use_07_metric=False):
"""Evaluate on voc dataset.
Args:
pred_boxlists(list[BoxList]): pred boxlist, has labels and scores fields.
gt_boxlists(list[BoxList]): ground truth boxlist, has labels field.
iou_thresh: iou thresh
use_07_metric: boolean
Returns:
dict represents the results
"""
assert len(gt_boxlists) == len(
pred_boxlists
), "Length of gt and pred lists need to be same."
aps = []
nposs = []
thresh = []
for i, classname in enumerate(classes):
if classname == "__background__" or classname == "__no_attribute__":
continue
rec, prec, ap, scores, npos = calc_detection_voc_prec_rec(pred_boxlists=pred_boxlists, gt_boxlists=gt_boxlists, \
classindex=i, iou_thresh=iou_thresh,
eval_attributes=eval_attributes,
use_07_metric=use_07_metric)
# Determine per class detection thresholds that maximise f score
# if npos > 1:
if npos > 1 and type(scores) != np.int:
f = np.nan_to_num((prec * rec) / (prec + rec))
thresh += [scores[np.argmax(f)]]
else:
thresh += [0]
aps += [ap]
nposs += [float(npos)]
print('AP for {} = {:.4f} (npos={:,})'.format(classname, ap, npos))
# if pickle:
# with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f:
# cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap,
# 'scores': scores, 'npos':npos}, f)
# Set thresh to mean for classes with poor results
thresh = np.array(thresh)
avg_thresh = np.mean(thresh[thresh != 0])
thresh[thresh == 0] = avg_thresh
# if eval_attributes:
# filename = 'attribute_thresholds_' + self._image_set + '.txt'
# else:
# filename = 'object_thresholds_' + self._image_set + '.txt'
# path = os.path.join(output_dir, filename)
# with open(path, 'wt') as f:
# for i, cls in enumerate(classes[1:]):
# f.write('{:s} {:.3f}\n'.format(cls, thresh[i]))
weights = np.array(nposs)
weights /= weights.sum()
print('Mean AP = {:.4f}'.format(np.mean(aps)))
print('Weighted Mean AP = {:.4f}'.format(np.average(aps, weights=weights)))
print('Mean Detection Threshold = {:.3f}'.format(avg_thresh))
print('~~~~~~~~')
print('Results:')
for ap, npos in zip(aps, nposs):
print('{:.3f}\t{:.3f}'.format(ap, npos))
print('{:.3f}'.format(np.mean(aps)))
print('~~~~~~~~')
print('')
print('--------------------------------------------------------------')
print('Results computed with the **unofficial** PASCAL VOC Python eval code.')
print('--------------------------------------------------------------')
# pdb.set_trace()
return {"ap": aps, "map": np.mean(aps), "weighted map": np.average(aps, weights=weights)}
def calc_detection_voc_prec_rec(pred_boxlists, gt_boxlists, classindex, iou_thresh=0.5, eval_attributes=False,
use_07_metric=False):
"""Calculate precision and recall based on evaluation code of PASCAL VOC.
This function calculates precision and recall of
predicted bounding boxes obtained from a dataset which has :math:`N`
images.
The code is based on the evaluation code used in PASCAL VOC Challenge.
"""
class_recs = {}
npos = 0
image_ids = []
confidence = []
BB = []
for image_index, (gt_boxlist, pred_boxlist) in enumerate(zip(gt_boxlists, pred_boxlists)):
pred_bbox = pred_boxlist.bbox.numpy()
gt_bbox = gt_boxlist.bbox.numpy()
if eval_attributes:
gt_label = gt_boxlist.get_field("attributes").numpy()
pred_label = pred_boxlist.get_field("attr_labels").numpy()
pred_score = pred_boxlist.get_field("attr_scores").numpy()
else:
gt_label = gt_boxlist.get_field("labels").numpy()
pred_label = pred_boxlist.get_field("labels").numpy()
pred_score = pred_boxlist.get_field("scores").numpy()
# get the ground truth information for this class
if eval_attributes:
gt_mask_l = np.array([classindex in i for i in gt_label])
else:
gt_mask_l = gt_label == classindex
gt_bbox_l = gt_bbox[gt_mask_l]
gt_difficult_l = np.zeros(gt_bbox_l.shape[0], dtype=bool)
det = [False] * gt_bbox_l.shape[0]
npos = npos + sum(~gt_difficult_l)
class_recs[image_index] = {'bbox': gt_bbox_l,
'difficult': gt_difficult_l,
'det': det}
# prediction output for each class
# pdb.set_trace()
if eval_attributes:
pred_mask_l = np.logical_and(pred_label == classindex, np.not_equal(pred_score, 0.0)).nonzero()
pred_bbox_l = pred_bbox[pred_mask_l[0]]
pred_score_l = pred_score[pred_mask_l]
else:
pred_mask_l = pred_label == classindex
pred_bbox_l = pred_bbox[pred_mask_l]
pred_score_l = pred_score[pred_mask_l]
for bbox_tmp, score_tmp in zip(pred_bbox_l, pred_score_l):
image_ids.append(image_index)
confidence.append(float(score_tmp))
BB.append([float(z) for z in bbox_tmp])
if npos == 0:
# No ground truth examples
return 0, 0, 0, 0, npos
if len(confidence) == 0:
# No detection examples
return 0, 0, 0, 0, npos
confidence = np.array(confidence)
BB = np.array(BB)
# sort by confidence
sorted_ind = np.argsort(-confidence)
sorted_scores = -np.sort(-confidence)
BB = BB[sorted_ind, :]
image_ids = [image_ids[x] for x in sorted_ind]
# go down dets and mark TPs and FPs
nd = len(image_ids)
tp = np.zeros(nd)
fp = np.zeros(nd)
for d in range(nd):
R = class_recs[image_ids[d]]
bb = BB[d, :].astype(float)
ovmax = -np.inf
BBGT = R['bbox'].astype(float)
if BBGT.size > 0:
# compute overlaps
# intersection
ixmin = np.maximum(BBGT[:, 0], bb[0])
iymin = np.maximum(BBGT[:, 1], bb[1])
ixmax = np.minimum(BBGT[:, 2], bb[2])
iymax = np.minimum(BBGT[:, 3], bb[3])
iw = np.maximum(ixmax - ixmin + 1., 0.)
ih = np.maximum(iymax - iymin + 1., 0.)
inters = iw * ih
# union
uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
(BBGT[:, 2] - BBGT[:, 0] + 1.) *
(BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
overlaps = inters / uni
ovmax = np.max(overlaps)
jmax = np.argmax(overlaps)
if ovmax > iou_thresh:
if not R['difficult'][jmax]:
if not R['det'][jmax]:
tp[d] = 1.
R['det'][jmax] = 1
else:
fp[d] = 1.
else:
fp[d] = 1.
# compute precision recall
fp = np.cumsum(fp)
tp = np.cumsum(tp)
rec = tp / float(npos)
# avoid divide by zero in case the first detection matches a difficult
# ground truth
prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
ap = voc_ap(rec, prec, use_07_metric)
return rec, prec, ap, sorted_scores, npos
def voc_ap(rec, prec, use_07_metric=False):
""" ap = voc_ap(rec, prec, [use_07_metric])
Compute VOC AP given precision and recall.
If use_07_metric is true, uses the
VOC 07 11 point method (default:False).
"""
if use_07_metric:
# 11 point metric
ap = 0.
for t in np.arange(0., 1.1, 0.1):
if np.sum(rec >= t) == 0:
p = 0
else:
p = np.max(prec[rec >= t])
ap = ap + p / 11.
else:
# correct AP calculation
# first append sentinel values at the end
mrec = np.concatenate(([0.], rec, [1.]))
mpre = np.concatenate(([0.], prec, [0.]))
# compute the precision envelope
for i in range(mpre.size - 1, 0, -1):
mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
# to calculate area under PR curve, look for points
# where X axis (recall) changes value
i = np.where(mrec[1:] != mrec[:-1])[0]
# and sum (\Delta recall) * prec
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
return ap
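# Worked example (illustrative comment, not part of the original code): for the toy
# PR curve rec = [0.5, 1.0], prec = [1.0, 0.5], the precision envelope is [1.0, 0.5],
# so the area-under-curve AP is 0.5 * 1.0 + 0.5 * 0.5 = 0.75, while the VOC07
# 11-point metric averages the max precision at recall thresholds 0.0, 0.1, ..., 1.0
# and gives (6 * 1.0 + 5 * 0.5) / 11 ~= 0.7727:
#   >>> import numpy as np
#   >>> voc_ap(np.array([0.5, 1.0]), np.array([1.0, 0.5]))
#   0.75
#   >>> round(voc_ap(np.array([0.5, 1.0]), np.array([1.0, 0.5]), use_07_metric=True), 4)
#   0.7727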
def calc_detection_voc_ap(prec, rec, use_07_metric=False):
"""Calculate average precisions based on evaluation code of PASCAL VOC.
This function calculates average precisions
from given precisions and recalls.
The code is based on the evaluation code used in PASCAL VOC Challenge.
Args:
prec (list of numpy.array): A list of arrays.
:obj:`prec[l]` indicates precision for class :math:`l`.
If :obj:`prec[l]` is :obj:`None`, this function returns
:obj:`numpy.nan` for class :math:`l`.
rec (list of numpy.array): A list of arrays.
:obj:`rec[l]` indicates recall for class :math:`l`.
If :obj:`rec[l]` is :obj:`None`, this function returns
:obj:`numpy.nan` for class :math:`l`.
use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
for calculating average precision. The default value is
:obj:`False`.
Returns:
~numpy.ndarray:
This function returns an array of average precisions.
The :math:`l`-th value corresponds to the average precision
for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is
:obj:`None`, the corresponding value is set to :obj:`numpy.nan`.
"""
n_fg_class = len(prec)
ap = np.empty(n_fg_class)
for l in range(n_fg_class):
if prec[l] is None or rec[l] is None:
ap[l] = np.nan
continue
if use_07_metric:
# 11 point metric
ap[l] = 0
for t in np.arange(0.0, 1.1, 0.1):
if np.sum(rec[l] >= t) == 0:
p = 0
else:
p = np.max(np.nan_to_num(prec[l])[rec[l] >= t])
ap[l] += p / 11
else:
# correct AP calculation
# first append sentinel values at the end
mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0]))
mrec = np.concatenate(([0], rec[l], [1]))
mpre = np.maximum.accumulate(mpre[::-1])[::-1]
# to calculate area under PR curve, look for points
# where X axis (recall) changes value
i = np.where(mrec[1:] != mrec[:-1])[0]
# and sum (\Delta recall) * prec
ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
return ap

View file

@ -11,6 +11,11 @@ from .utils.label_loader import LabelLoader
from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist
def sort_key_by_val(dic):
sorted_dic = sorted(dic.items(), key=lambda kv: kv[1])
return [kv[0] for kv in sorted_dic]
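# Example (illustrative comment, added for clarity): keys come back ordered by their
# integer ids, which is how self.classes / self.attributes / self.relations below are
# built from the labelmap dicts:
#   >>> sort_key_by_val({'person': 1, '__background__': 0, 'dog': 2})
#   ['__background__', 'person', 'dog']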
class RelationTSVDataset(TSVYamlDataset):
"""
Generic TSV dataset format for Object Detection.
@ -28,7 +33,7 @@ class RelationTSVDataset(TSVYamlDataset):
self.contrastive_loss_on = kwargs['args'].MODEL.ROI_RELATION_HEAD.CONTRASTIVE_LOSS.USE_FLAG if kwargs['args'] is not None else False
# construct maps
jsondict_file = find_file_path_in_yaml(self.cfg.get("labelmap", None), self.root)
jsondict_file = find_file_path_in_yaml(self.cfg.get("labelmap", self.cfg.get("jsondict", None)), self.root) # previous version use jsondict
jsondict = json.load(open(jsondict_file, 'r'))
self.labelmap = {}
@ -37,18 +42,21 @@ class RelationTSVDataset(TSVYamlDataset):
self.class_to_ind['__background__'] = 0
self.ind_to_class = {v:k for k, v in self.class_to_ind.items()}
self.labelmap['class_to_ind'] = self.class_to_ind
self.classes = sort_key_by_val(self.class_to_ind)
if self.attribute_on:
self.attribute_to_ind = jsondict['attribute_to_idx']
self.attribute_to_ind['__no_attribute__'] = 0
self.ind_to_attribute = {v:k for k, v in self.attribute_to_ind.items()}
self.labelmap['attribute_to_ind'] = self.attribute_to_ind
self.attributes = sort_key_by_val(self.attribute_to_ind)
if self.relation_on:
self.relation_to_ind = jsondict['predicate_to_idx']
self.relation_to_ind['__no_relation__'] = 0
self.ind_to_relation = {v:k for k, v in self.relation_to_ind.items()}
self.labelmap['relation_to_ind'] = self.relation_to_ind
self.relations = sort_key_by_val(self.relation_to_ind)
if self.is_load_label or self.detector_pre_calculated:
self.label_loader = LabelLoader(

View file

@ -237,7 +237,29 @@ def inference(
total_timer = Timer()
inference_timer = Timer()
total_timer.tic()
predictions = compute_on_dataset(model, data_loader, device, bbox_aug, inference_timer)
output_pth_name = 'predictions_forcebox.pth' if eval_attributes else 'predictions.pth'
if output_folder and os.path.isfile(os.path.join(output_folder, output_pth_name)):
logger.info("Predictions.pth file exist in {}, skip computation".format(
os.path.join(output_folder, output_pth_name)))
if not is_main_process():
return
if cfg.TEST.SAVE_RESULTS_TO_TSV or not cfg.TEST.SKIP_PERFORMANCE_EVAL:
predictions = torch.load(os.path.join(output_folder, output_pth_name))
else:
if eval_attributes:
# change to force_boxes=True mode
force_boxes_model = model.force_boxes
force_boxes_box = model.roi_heads.box.post_processor.force_boxes
model.force_boxes = True
model.roi_heads.box.post_processor.force_boxes = True
predictions = compute_on_dataset(model, data_loader, device, bbox_aug,
inference_timer)
# return to the original state
model.force_boxes = force_boxes_model
model.roi_heads.box.post_processor.force_boxes = force_boxes_box
else:
predictions = compute_on_dataset(model, data_loader, device, bbox_aug, inference_timer)
# wait for all processes to complete before measuring the time
synchronize()
total_time = total_timer.toc()
@ -262,7 +284,7 @@ def inference(
return
if output_folder and save_predictions:
torch.save(predictions, os.path.join(output_folder, "predictions.pth"))
torch.save(predictions, os.path.join(output_folder, output_pth_name))
if output_folder and cfg.TEST.SAVE_RESULTS_TO_TSV:
logger.info("Convert prediction results to tsv format and save.")
@ -281,11 +303,16 @@ def inference(
extra_args = dict(
box_only=box_only,
eval_attributes=eval_attributes,
iou_types=iou_types,
expected_results=expected_results,
expected_results_sigma_tol=expected_results_sigma_tol,
save_predictions=save_predictions
)
if hasattr(cfg.MODEL, 'RELATION_ON'):
extra_args['sg_eval'] = cfg.MODEL.RELATION_ON
else:
extra_args['sg_eval'] = False
return evaluate(dataset=dataset,
predictions=predictions,

View file

@ -13,8 +13,8 @@ from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.utils.comm import get_world_size, synchronize
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
from maskrcnn_benchmark.engine.inference import inference
from maskrcnn_benchmark.utils.amp import autocast, GradScaler
from apex import amp
def reduce_loss_dict(loss_dict):
"""
@ -63,6 +63,9 @@ def do_train(
start_training_time = time.time()
end = time.time()
if cfg.SOLVER.USE_AMP:
scaler = GradScaler()
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
@ -84,7 +87,11 @@ def do_train(
images = images.to(device)
# targets = [target.to(device) for target in targets]
loss_dict = model(images, targets)
if cfg.SOLVER.USE_AMP:
with autocast():
loss_dict = model(images, targets)
else:
loss_dict = model(images, targets)
# take care of additional metric besides loss returned from model
if type(loss_dict) == tuple:
@ -101,12 +108,13 @@ def do_train(
meters.update(loss=losses_reduced, **loss_dict_reduced)
optimizer.zero_grad()
# # Note: If mixed precision is not used, this ends up doing nothing
# # Otherwise apply loss scaling for mixed-precision recipe
# with amp.scale_loss(losses, optimizer) as scaled_losses:
# scaled_losses.backward()
losses.backward()
optimizer.step()
if cfg.SOLVER.USE_AMP:
scaler.scale(losses).backward()
scaler.step(optimizer)
scaler.update()
else:
losses.backward()
optimizer.step()
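# Added note (not in the original diff): this is the standard native torch.cuda.amp
# recipe that replaces apex.amp -- scaler.scale(losses).backward() backpropagates the
# scaled loss, scaler.step(optimizer) unscales the gradients and skips the update if
# they contain inf/NaN, and scaler.update() adapts the loss scale for the next
# iteration. With cfg.SOLVER.USE_AMP off, the loop is the usual fp32 backward/step.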
scheduler.step()
batch_time = time.time() - end
@ -116,7 +124,7 @@ def do_train(
eta_seconds = meters.time.global_avg * (max_iter - iteration)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if iteration % 1 == 0 or iteration == max_iter:
if iteration % cfg.LOG_LOSS_PERIOD == 0 or iteration == max_iter:
logger.info(
meters.delimiter.join(
[

View file

@ -2,6 +2,9 @@
import torch
from torch import nn
import torch.distributed as dist
import maskrcnn_benchmark.utils.comm as comm
from torch.autograd.function import Function
class FrozenBatchNorm2d(nn.Module):
"""
@ -17,15 +20,98 @@ class FrozenBatchNorm2d(nn.Module):
self.register_buffer("running_var", torch.ones(n))
def forward(self, x):
# Cast all fixed parameters to half() if necessary
if x.dtype == torch.float16:
self.weight = self.weight.half()
self.bias = self.bias.half()
self.running_mean = self.running_mean.half()
self.running_var = self.running_var.half()
scale = self.weight * self.running_var.rsqrt()
bias = self.bias - self.running_mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
return x * scale + bias
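# Added note: the two fused lines above are algebraically the frozen affine batch norm
#   y = (x - running_mean) / sqrt(running_var) * weight + bias
# rewritten with scale = weight * rsqrt(running_var) and bias' = bias - running_mean * scale
# (no eps term, matching the buffers registered in __init__).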
class AllReduce(Function):
@staticmethod
def forward(ctx, input):
input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())]
# Use allgather instead of allreduce since I don't trust in-place operations ..
dist.all_gather(input_list, input, async_op=False)
inputs = torch.stack(input_list, dim=0)
return torch.sum(inputs, dim=0)
@staticmethod
def backward(ctx, grad_output):
dist.all_reduce(grad_output, async_op=False)
return grad_output
class NaiveSyncBatchNorm2d(nn.BatchNorm2d):
"""
In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient
when the batch size on each worker is different.
(e.g., when scale augmentation is used, or when it is applied to mask head).
This is a slower but correct alternative to `nn.SyncBatchNorm`.
Note:
There isn't a single definition of Sync BatchNorm.
When ``stats_mode==""``, this module computes overall statistics by using
statistics of each worker with equal weight. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (N, H, W). This mode does not support inputs with zero batch size.
When ``stats_mode=="N"``, this module computes overall statistics by weighting
the statistics of each worker by their ``N``. The result is true statistics
of all samples (as if they are all on one worker) only when all workers
have the same (H, W). It is slower than ``stats_mode==""``.
Even though the result of this module may not be the true statistics of all samples,
it may still be reasonable because it might be preferable to assign equal weights
to all workers, regardless of their (H, W) dimension, instead of putting larger weight
on larger images. From preliminary experiments, little difference is found between such
a simplified implementation and an accurate computation of overall mean & variance.
"""
def __init__(self, *args, stats_mode="", **kwargs):
super().__init__(*args, **kwargs)
assert stats_mode in ["", "N"]
self._stats_mode = stats_mode
def forward(self, input):
if comm.get_world_size() == 1 or not self.training:
return super().forward(input)
B, C = input.shape[0], input.shape[1]
mean = torch.mean(input, dim=[0, 2, 3])
meansqr = torch.mean(input * input, dim=[0, 2, 3])
if self._stats_mode == "":
assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.'
vec = torch.cat([mean, meansqr], dim=0)
vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size())
mean, meansqr = torch.split(vec, C)
momentum = self.momentum
else:
if B == 0:
vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype)
vec = vec + input.sum() # make sure there is gradient w.r.t input
else:
vec = torch.cat(
[mean, meansqr, torch.ones([1], device=mean.device, dtype=mean.dtype)], dim=0
)
vec = AllReduce.apply(vec * B)
total_batch = vec[-1].detach()
momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0
total_batch = torch.max(total_batch, torch.ones_like(total_batch)) # avoid div-by-zero
mean, meansqr, _ = torch.split(vec / total_batch, C)
var = meansqr - mean * mean
invstd = torch.rsqrt(var + self.eps)
scale = self.weight * invstd
bias = self.bias - mean * scale
scale = scale.reshape(1, -1, 1, 1)
bias = bias.reshape(1, -1, 1, 1)
self.running_mean += momentum * (mean.detach() - self.running_mean)
self.running_var += momentum * (var.detach() - self.running_var)
return input * scale + bias
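# Illustrative usage sketch (assumption, not part of the original diff): the layer is a
# drop-in replacement for nn.BatchNorm2d in distributed training, e.g.
#   norm = NaiveSyncBatchNorm2d(256, stats_mode="N")
#   y = norm(torch.randn(2, 256, 32, 32))
# With world_size == 1, or in eval mode, forward() falls back to the plain BatchNorm2d
# path above, so single-GPU behaviour is unchanged.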

View file

@ -1,12 +1,12 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# from ._utils import _C
from maskrcnn_benchmark import _C
# from apex import amp
try:
import torchvision
from torchvision.ops import nms
except ImportError:
nms = _C.nms
# Only valid with fp32 inputs - give AMP the hint
# nms = amp.float_function(_C.nms)
nms = _C.nms
# nms.__doc__ = """
# This function performs Non-maximum suppresion"""

View file

@ -7,8 +7,6 @@ from torch.nn.modules.utils import _pair
from maskrcnn_benchmark import _C
# from apex import amp
class _ROIAlign(Function):
@staticmethod
def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
@ -44,8 +42,11 @@ class _ROIAlign(Function):
)
return grad_input, None, None, None, None
roi_align = _ROIAlign.apply
try:
import torchvision
from torchvision.ops import roi_align
except ImportError:
roi_align = _ROIAlign.apply
class ROIAlign(nn.Module):
def __init__(self, output_size, spatial_scale, sampling_ratio):
@ -54,7 +55,6 @@ class ROIAlign(nn.Module):
self.spatial_scale = spatial_scale
self.sampling_ratio = sampling_ratio
# @amp.float_function
def forward(self, input, rois):
return roi_align(
input, rois, self.output_size, self.spatial_scale, self.sampling_ratio

View file

@ -7,7 +7,6 @@ from torch.nn.modules.utils import _pair
from maskrcnn_benchmark import _C
# from apex import amp
class _ROIPool(Function):
@staticmethod
@ -53,7 +52,6 @@ class ROIPool(nn.Module):
self.output_size = output_size
self.spatial_scale = spatial_scale
# @amp.float_function
def forward(self, input, rois):
return roi_pool(input, rois, self.output_size, self.spatial_scale)

View file

@ -57,7 +57,6 @@ class SigmoidFocalLoss(nn.Module):
self.alpha = alpha
def forward(self, logits, targets):
device = logits.device
if logits.is_cuda:
loss_func = sigmoid_focal_loss_cuda
else:

View file

@ -8,6 +8,7 @@ from maskrcnn_benchmark.modeling import registry
from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform
from . import fpn as fpn_module
from . import resnet
from .msvit import build_msvit_backbone
@registry.BACKBONES.register("R-50-C4")
@ -73,6 +74,15 @@ def build_resnet_fpn_p3p7_backbone(cfg):
return model
@registry.BACKBONES.register("ViL-C4")
def build_vilc4_backbone(cfg):
assert len(cfg.MODEL.TRANSFORMER.OUT_FEATURES) == 1, "The number of OUT_FEATURES in ViL-C4 is not 1!"
body = build_msvit_backbone(cfg)
model = nn.Sequential(OrderedDict([("body", body)]))
model.out_channels = body.out_planes
return model
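# Illustrative config sketch (assumption; the exact defaults live elsewhere in the cfg):
# selecting this backbone would look like
#   MODEL.BACKBONE.CONV_BODY: "ViL-C4"
#   MODEL.TRANSFORMER.OUT_FEATURES: ["layer3"]   # exactly one stage, per the assert above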
def build_backbone(cfg):
assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \
"cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format(

View file

@ -0,0 +1,286 @@
# Copyright (c) 2021 Microsoft Corporation. Licensed under the MIT license.
# Written by Pengchuan Zhang, penzhan@microsoft.com
import random
import torch
from torch import nn, einsum
import torch.nn.functional as F
from einops import rearrange
from timm.models.layers import trunc_normal_
from .slidingchunk_2d import slidingchunk_2d, mask_invalid_locations, slidingchunk_2dautograd
class Long2DSCSelfAttention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., w=7, d=1,
autoregressive=False, sharew=False, nglo=1, only_glo=False, exact=0, autograd=False, rpe=False,
mode=0):
super().__init__()
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = qk_scale or self.head_dim ** -0.5
self.Nglo = nglo
self.only_glo = only_glo
if self.only_glo:
assert self.Nglo >= 1, "Nglo == 0 in the only global mode!"
self.query = nn.Linear(dim, dim, bias=qkv_bias)
self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
self.proj = nn.Linear(dim, dim)
if nglo >= 1:
if sharew:
self.query_global = self.query
self.kv_global = self.kv
self.proj_global = self.proj
else:
self.query_global = nn.Linear(dim, dim, bias=qkv_bias)
self.kv_global = nn.Linear(dim, dim * 2, bias=qkv_bias)
self.proj_global = nn.Linear(dim, dim)
self.attn_drop = nn.Dropout(attn_drop)
self.proj_drop = nn.Dropout(proj_drop)
self.attention_window = w
self.attention_dilation = d
self.autoregressive = autoregressive
assert self.attention_dilation == 1, "Dilation is not supported!"
assert not self.autoregressive, "Autoregressive is not supported yet!"
self.exact = exact
# use autograd or handgrad
self.longform2d_mm = slidingchunk_2dautograd if autograd else slidingchunk_2d
# Inspired by swin transformer:
# https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py#L88-L103
# define parameter tables for local and global relative position bias
self.rpe = rpe
if rpe:
self.local_relative_position_bias_table = nn.Parameter(
torch.zeros((2 * 2 * w - 1) * (2 * 2 * w - 1), num_heads)) # (4*w-1, 4*w-1, nH)
trunc_normal_(self.local_relative_position_bias_table, std=.02)
if nglo >= 1:
self.g2l_relative_position_bias = nn.Parameter(
torch.zeros(2, num_heads, nglo)) # (2, nH, nglo)
self.g2g_relative_position_bias = nn.Parameter(
torch.zeros(num_heads, nglo, nglo)) # (nH, nglo, nglo)
trunc_normal_(self.g2l_relative_position_bias, std=.02)
trunc_normal_(self.g2g_relative_position_bias, std=.02)
# get pair-wise relative position index
coords_h = torch.arange(-w, 2*w)
coords_w = torch.arange(-w, 2*w)
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, 3w, 3w
coords_unfold = rearrange(
coords, 'c (m x) (n y) -> c m n (x y)', x=w, y=w
) # 2, 3, 3, 9w^2
q_coords = coords_unfold[:, 1, 1, :] # 2, w^2
relative_coords = torch.cat([
# -1, -1
q_coords[:, :, None] - coords_unfold[:, 0, 0, :][:, None, :],
# -1, 0
q_coords[:, :, None] - coords_unfold[:, 0, 1, :][:, None, :],
# -1, 1
q_coords[:, :, None] - coords_unfold[:, 0, 2, :][:, None, :],
# 0,-1
q_coords[:, :, None] - coords_unfold[:, 1, 0, :][:, None, :],
# 0,0
q_coords[:, :, None] - q_coords[:, None, :],
# 0,1
q_coords[:, :, None] - coords_unfold[:, 1, 2, :][:, None, :],
# 1, -1
q_coords[:, :, None] - coords_unfold[:, 2, 0, :][:, None, :],
# 1, 0
q_coords[:, :, None] - coords_unfold[:, 2, 1, :][:, None, :],
# 1, 1
q_coords[:, :, None] - coords_unfold[:, 2, 2, :][:, None, :],
], dim=-1) # 2, w^2, 9w^2
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # w^2, 9w^2, 2
relative_coords[:, :, 0] += 2 * w - 1 # shift to start from 0
relative_coords[:, :, 1] += 2 * w - 1
relative_coords[:, :, 0] *= 2 * 2 * w - 1
relative_position_index = relative_coords.sum(-1) # w^2, 9w^2
self.register_buffer("relative_position_index", relative_position_index)
# mode to control the sampling strategy of neighbor blocks
# 0: all 8 blocks; -1: no neighbor block; >0: random sample one block
self.mode = mode
def forward(self, x, nx, ny):
B, N, C = x.shape
Nloc = nx * ny
Nglo, H, M, W = self.Nglo, self.num_heads, self.head_dim, self.attention_window
W2 = W ** 2
assert Nglo + Nloc == N, "Global dimension does not match!"
# get the mode of the longformer attention
mode = self.mode
kv_nums = 9 * W2
if self.mode > 0:
if self.training:
mode = random.randrange(1, 9) # 1 <= mode <= 8
kv_nums = 2 * W2
else:
mode = 0 # full during evaluation
elif mode == -1:
kv_nums = W2
# compute the local attention
q = self.scale * self.query(x[:, Nglo:]).reshape(B, Nloc, H, M).transpose(1, 2).contiguous()
kv = self.kv(x).reshape(B, N, 2, H, M).permute(2, 0, 3, 1, 4)
k, v = kv[0], kv[1] # make torchscript happy (cannot use tensor as tuple)
if self.only_glo:
# local to global attn10: (B, self.num_heads, Nloc, Nglo)
attn1 = torch.bmm(q.view(B*H, Nloc, M), k[:, :, :Nglo].reshape(B*H, Nglo, M).transpose(-2, -1)).view(B, H, Nloc, Nglo)
else:
(q_img, k_img, v_img) = map(
lambda t: rearrange(t, 'b h (x y) c -> (b h) c x y', x=nx),
(q, k[:, :, Nglo:], v[:, :, Nglo:]))
# pad 0's to make sure that nx % W == 0, ny % W == 0
(padx, pady) = map(lambda t: (W - t % W) % W, (nx, ny))
(mx, my) = map(lambda t: (t[0] + t[1]) // W,
((nx, padx), (ny, pady)))
if padx > 0 or pady > 0:
(q_img, k_img, v_img) = map(
lambda t: F.pad(t, (0, pady, 0, padx)), (q_img, k_img, v_img)
)
# unfold the padded tensor
(q_img, k_img, v_img) = map(
lambda t: rearrange(t, 'b c (m x) (n y) -> b c m n (x y)', x=W, y=W),
(q_img, k_img, v_img)
)
# local to global attn10: (B*H, mx, my, w^2, Nglo)
attn10 = einsum('b c m n l, b t c -> b m n l t', q_img,
k[:, :, :Nglo].reshape(B*H, Nglo, M))
# local to local attn11 (B*H, mx, my, W**2, 9*W**2), mode = 0
# attn11 (B*H, mx, my, W**2, W**2), mode = -1
# attn11 (B*H, mx, my, W**2, 2*W**2), mode > 0
attn11 = self.longform2d_mm(q_img, k_img, False, mode)
if self.rpe:
if Nglo >= 1:
# local to global bias
attn10 = attn10 + self.g2l_relative_position_bias[1].unsqueeze(0).expand(B, -1, -1).reshape(B*H, Nglo)[:, None, None, None, :]
# local to local bias
if mode == -1:
relative_position_index = self.relative_position_index[:, 4 * W2:5 * W2].contiguous()
elif mode == 0:
relative_position_index = self.relative_position_index
else: # mode > 0
chunk_id = mode if mode > 4 else mode - 1
relative_position_index = torch.cat([
self.relative_position_index[:, 4 * W2:5 * W2],
self.relative_position_index[:, chunk_id * W2:(chunk_id+1) * W2],
], dim=-1)
local_relative_position_bias = self.local_relative_position_bias_table[
relative_position_index.view(-1)].view(1, W2, kv_nums, -1) # w^2, kv_nums,H
local_relative_position_bias = local_relative_position_bias.permute(
0, 3, 1, 2).expand(B, -1, -1, -1).contiguous().view(B*H, W2, kv_nums) # B*H, w^2, kv_nums
attn11 = attn11 + local_relative_position_bias[:, None, None, :, :]
num_invalid = mask_invalid_locations(
attn11, mx, my, padx, pady, W, exact=self.exact, mode=mode
)
attn1 = torch.cat((attn10, attn11), dim=-1)
attn1 = (attn1 - torch.max(attn1, dim=-1, keepdim=True)[0]).softmax(dim=-1)
attn1 = self.attn_drop(attn1)
# update x1: (B, self.num_heads, Nloc, self.head_dim)
if self.only_glo:
x1 = torch.bmm(
attn1.view(B * H, Nloc, Nglo), v[:, :, :Nglo].reshape(B * H, Nglo, M)
).view(B, H, Nloc, M)
else:
attnl2g = attn1[:, :, :, :, :Nglo]
x1 = self.longform2d_mm(attn1[:, :, :, :, Nglo:Nglo+kv_nums], v_img, True, mode)
if Nglo >= 1:
x1 = x1 + einsum(
'b m n l t, b t c -> b c m n l', attnl2g,
v[:, :, :Nglo].reshape(B * H, Nglo, M)
)
x1 = rearrange(x1, 'b c m n (x y) -> b (m x) (n y) c', x=W)
x1 = x1[:, :nx, :ny].reshape(B, H, Nloc, M)
x1 = x1.transpose(1, 2).reshape(B, Nloc, C)
try:
x1 = self.proj(x1)
except RuntimeError as e:
# guard against possible half vs float error
x1 = self.proj(x1.float())
if Nglo == 0:
return self.proj_drop(x1)
# compute the global attention; same as vanilla multi-head attention
q_global = self.scale * self.query_global(x[:, :Nglo]).reshape(B, Nglo, H, M).transpose(1, 2)
kv_global = self.kv_global(x).reshape(B, N, 2, H, M).permute(2, 0, 3, 1, 4)
k_global, v_global = kv_global[0], kv_global[1] # make torchscript happy (cannot use tensor as tuple)
# attention matrix
attn0 = torch.bmm(q_global.reshape(B*H, Nglo, M), k_global.reshape(B*H, N, M).transpose(-2, -1))
if self.rpe:
# relative position embedding of global tokens
global_relative_position_bias = torch.cat([
self.g2g_relative_position_bias,
self.g2l_relative_position_bias[0].unsqueeze(-1).expand(-1, -1, Nloc)
], dim=-1) # nH, nglo, N
attn0 = attn0 + global_relative_position_bias.unsqueeze(0).expand(B, -1, -1, -1).reshape(B*H, Nglo, N)
attn0 = (attn0 - torch.max(attn0, dim=-1, keepdim=True)[0]).softmax(dim=-1)
attn0 = self.attn_drop(attn0)
# context vector
x0 = torch.bmm(attn0, v_global.reshape(B*H, N, M)).view(B, H, Nglo, M).transpose(1, 2).reshape(B, Nglo, C)
x0 = self.proj_global(x0)
return self.proj_drop(torch.cat((x0, x1), dim=1))
@staticmethod
def compute_macs(module, input, output):
# T: num_token
# S: num_token
input = input[0]
_, T, C = input.shape
S = T
Nglo, H, M, W = module.Nglo, module.num_heads, module.head_dim, module.attention_window
macs = 0
n_params = 0
# Sliding window scaled-dot-product macs
if module.only_glo:
# local to global
# [B x T x (C-Nglo)] x [B x C x Nglo] --> [B x T x Nglo]
num_macs_kq = (C - Nglo) * Nglo * C
else:
# local to local
# [B x T x (C-Nglo)] x [B x C x (S-Nglo)] --> [B x (C-Nglo) x (9 * W**2)]
num_macs_kq = (C-Nglo) * (9 * W**2) * C
# local to global
# [B x T x (C-Nglo)] x [B x C x Nglo] --> [B x T x Nglo]
num_macs_kq += (C-Nglo) * Nglo * C
# global to all
# [B x T x Nglo] x [B x C x S] --> [B x Nglo x S]
num_macs_kq += Nglo * S * C
# same computational cost for attn * v -> context
num_macs_v = num_macs_kq
macs += num_macs_kq + num_macs_v
# print('macs att', macs / 1e8)
# self attention: T should be equal to S
assert T == S
# by default, we share weights for local and global tokens
q_params = sum([p.numel() for p in module.query.parameters()])
kv_params = sum([p.numel() for p in module.kv.parameters()])
n_params += q_params + kv_params
# multiply by Seq length
macs += (q_params + kv_params) * T
# print('macs qkv', qkv_params * T / 1e8)
# by default, we share weights for local and global tokens
proj_params = sum([p.numel() for p in module.proj.parameters()])
n_params += proj_params
macs += (proj_params * T)
# print('macs proj', proj_params * T / 1e8)
module.__flops__ += macs
# return n_params, macs

View file

@ -0,0 +1,657 @@
import math
from functools import partial
import logging
import torch
from torch import nn
from timm.models.layers import DropPath, trunc_normal_, to_2tuple
from .longformer2d import Long2DSCSelfAttention
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None,
act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None,
attn_drop=0., proj_drop=0.,
rpe=False, wx=14, wy=14, nglo=1):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = qk_scale or head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
# Inspired by swin transformer:
# https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py#L88-L103
# define parameter tables for local and global relative position bias
self.rpe = rpe
if rpe:
self.wx = wx
self.wy = wy
self.nglo = nglo
self.local_relative_position_bias_table = nn.Parameter(
torch.zeros((2 * wx - 1) * (2 * wy - 1),
num_heads)) # (2*wx-1, 2*wy-1, nH)
trunc_normal_(self.local_relative_position_bias_table, std=.02)
if nglo >= 1:
self.g2l_relative_position_bias = nn.Parameter(
torch.zeros(2, num_heads, nglo)) # (2, nH, nglo)
self.g2g_relative_position_bias = nn.Parameter(
torch.zeros(num_heads, nglo, nglo)) # (nH, nglo, nglo)
trunc_normal_(self.g2l_relative_position_bias, std=.02)
trunc_normal_(self.g2g_relative_position_bias, std=.02)
# get pair-wise relative position index
coords_h = torch.arange(wx)
coords_w = torch.arange(wy)
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, wx, wy
coords_flatten = torch.flatten(coords, 1) # 2, Wx*Wy
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wx*Wy, Wx*Wy
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wx*Wy, Wx*Wy, 2
relative_coords[:, :, 0] += wx - 1 # shift to start from 0
relative_coords[:, :, 1] += wy - 1
relative_coords[:, :, 0] *= 2 * wy - 1
relative_position_index = relative_coords.sum(-1) # Wx*Wy, Wx*Wy
self.register_buffer("relative_position_index", relative_position_index)
def forward(self, x, nx=None, ny=None):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
attn = (q @ k.transpose(-2, -1)) * self.scale
if self.rpe:
assert N == self.nglo + self.wx*self.wy, "For relative position, N != self.nglo + self.wx*self.wy!"
local_relative_position_bias = self.local_relative_position_bias_table[
self.relative_position_index.view(-1)].view(
self.wx*self.wy, self.wx*self.wy, -1) # Wh*Ww, Wh*Ww,nH
relative_position_bias = local_relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
if self.nglo > 0:
# relative position embedding of global tokens
global_relative_position_bias = torch.cat([
self.g2g_relative_position_bias,
self.g2l_relative_position_bias[0].unsqueeze(-1).expand(-1, -1, self.wx*self.wy)
], dim=-1) # nH, nglo, N
# relative position embedding of local tokens
local_relative_position_bias = torch.cat([
self.g2l_relative_position_bias[1].unsqueeze(1).expand(-1, self.wx*self.wy, -1),
relative_position_bias,
], dim=-1) # nH, Wh*Ww, N
relative_position_bias = torch.cat([
global_relative_position_bias,
local_relative_position_bias,
], dim=1) # nH, N, N
attn = attn + relative_position_bias.unsqueeze(0)
attn = (attn - torch.max(attn, dim=-1, keepdim=True)[0]).softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
@staticmethod
def compute_macs(module, input, output):
# T: num_token
# S: num_token
input = input[0]
_, T, C = input.shape
S = T
macs = 0
n_params = 0
# Scaled-dot-product macs
# [B x T x C] x [B x C x S] --> [B x T x S]
# multiplication-addition is counted as 1 because operations can be fused
num_macs_kq = T * S * C
# [B x T x S] x [B x S x C] --> [B x T x C]
num_macs_v = T * C * S
macs += num_macs_kq + num_macs_v
# print('macs att', macs / 1e8)
# self attention: T should be equal to S
assert T == S
qkv_params = sum([p.numel() for p in module.qkv.parameters()])
n_params += qkv_params
# multiply by Seq length
macs += qkv_params * T
# print('macs qkv', qkv_params * T / 1e8)
proj_params = sum([p.numel() for p in module.proj.parameters()])
n_params += proj_params
macs += (proj_params * T)
# print('macs proj', proj_params * T / 1e8)
module.__flops__ += macs
# return n_params, macs
class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""
def __init__(self, patch_size, nx, ny, in_chans=3, embed_dim=768, nglo=1,
norm_layer=partial(nn.LayerNorm, eps=1e-6), norm_embed=True,
drop_rate=0.0, ape=True):
# maximal global/x-direction/y-direction tokens: nglo, nx, ny
super().__init__()
patch_size = to_2tuple(patch_size)
self.patch_size = patch_size
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size,
stride=patch_size)
self.norm_embed = norm_layer(embed_dim) if norm_embed else None
self.nx = nx
self.ny = ny
self.Nglo = nglo
if nglo >= 1:
self.cls_token = nn.Parameter(torch.zeros(1, nglo, embed_dim))
trunc_normal_(self.cls_token, std=.02)
else:
self.cls_token = None
self.ape = ape
if ape:
self.cls_pos_embed = nn.Parameter(torch.zeros(1, nglo, embed_dim))
self.x_pos_embed = nn.Parameter(torch.zeros(1, nx, embed_dim // 2))
self.y_pos_embed = nn.Parameter(torch.zeros(1, ny, embed_dim // 2))
trunc_normal_(self.cls_pos_embed, std=.02)
trunc_normal_(self.x_pos_embed, std=.02)
trunc_normal_(self.y_pos_embed, std=.02)
self.pos_drop = nn.Dropout(p=drop_rate)
def forward(self, xtuple):
x, nx, ny = xtuple
B = x.shape[0]
x = self.proj(x)
nx, ny = x.shape[-2:]
x = x.flatten(2).transpose(1, 2)
assert nx <= self.nx and ny <= self.ny, "Input size {} {} should <= {} {}!".format(nx, ny, self.nx, self.ny)
if self.norm_embed:
x = self.norm_embed(x)
# concat cls_token
if self.cls_token is not None:
cls_tokens = self.cls_token.expand(
B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
x = torch.cat((cls_tokens, x), dim=1)
if self.ape:
# add position embedding
i = torch.arange(nx, device=x.device)
j = torch.arange(ny, device=x.device)
x_emb = self.x_pos_embed[:, i, :]
y_emb = self.y_pos_embed[:, j, :]
pos_embed_2d = torch.cat([
x_emb.unsqueeze(2).expand(-1, -1, ny, -1),
y_emb.unsqueeze(1).expand(-1, nx, -1, -1),
], dim=-1).flatten(start_dim=1, end_dim=2)
x = x + torch.cat([self.cls_pos_embed, pos_embed_2d], dim=1).expand(
B, -1, -1)
x = self.pos_drop(x)
return x, nx, ny
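# Shape sketch (illustrative comment, added for clarity): with patch_size=4, nglo=1 and
# embed_dim=96, a (2, 3, 64, 64) input becomes 16x16 patch tokens plus one global token:
#   embed = PatchEmbed(4, 16, 16, in_chans=3, embed_dim=96, nglo=1)
#   tokens, nx, ny = embed((torch.randn(2, 3, 64, 64), None, None))
#   # tokens: (2, 1 + 16 * 16, 96), nx == ny == 16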
def init_(tensor):
dim = tensor.shape[-1]
std = 1 / math.sqrt(dim)
tensor.uniform_(-std, std)
return tensor
# for Performer, start
def get_module_device(module):
return next(module.parameters()).device
def find_modules(nn_module, type):
return [module for module in nn_module.modules() if isinstance(module, type)]
# for Performer, end
class AttnBlock(nn.Module):
""" Meta Attn Block
"""
def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, drop=0.,
attn_drop=0.,
drop_path=0., norm_layer=nn.LayerNorm,
attn_type='full', w=7, d=1, sharew=False, nglo=1,
only_glo=False,
seq_len=None, num_feats=256, share_kv=False, sw_exact=0,
rratio=2, rpe=False, wx=14, wy=14,
mode=0):
super().__init__()
self.norm = norm_layer(dim)
if attn_type == 'full':
self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias,
qk_scale=qk_scale, attn_drop=attn_drop,
proj_drop=drop,
rpe=rpe, wx=wx, wy=wy, nglo=nglo)
elif attn_type == 'longformerhand':
self.attn = Long2DSCSelfAttention(
dim, exact=sw_exact, num_heads=num_heads, qkv_bias=qkv_bias,
qk_scale=qk_scale, attn_drop=attn_drop,
proj_drop=drop, w=w, d=d, sharew=sharew,
nglo=nglo, only_glo=only_glo, autograd=False,
rpe=rpe, mode=mode
)
elif attn_type == 'longformerauto':
self.attn = Long2DSCSelfAttention(
dim, exact=sw_exact, num_heads=num_heads, qkv_bias=qkv_bias,
qk_scale=qk_scale, attn_drop=attn_drop,
proj_drop=drop, w=w, d=d, sharew=sharew,
nglo=nglo, only_glo=only_glo, autograd=True,
rpe=rpe, mode=mode
)
else:
raise ValueError(
"Not supported attention type {}".format(attn_type))
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
def forward(self, xtuple):
x, nx, ny = xtuple
x = x + self.drop_path(self.attn(self.norm(x), nx, ny))
return x, nx, ny
class MlpBlock(nn.Module):
""" Meta MLP Block
"""
def __init__(self, dim, out_dim=None, mlp_ratio=4., drop=0., drop_path=0.,
act_layer=nn.GELU, norm_layer=nn.LayerNorm):
super().__init__()
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
out_features=out_dim, act_layer=act_layer, drop=drop)
self.shortcut = nn.Identity()
if out_dim is not None and out_dim != dim:
self.shortcut = nn.Sequential(nn.Linear(dim, out_dim),
nn.Dropout(drop))
def forward(self, xtuple):
x, nx, ny = xtuple
x = self.shortcut(x) + self.drop_path(self.mlp(self.norm(x)))
return x, nx, ny
def parse_arch(layer_cfgstr):
layer_cfg = {'l': 1, 'h': 3, 'd': 192, 'n': 1, 's': 1, 'g': 1,
'p': 2, 'f': 7, 'a': 0} # defaults
for attr in layer_cfgstr.split(','):
layer_cfg[attr[0]] = int(attr[1:])
return layer_cfg
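# Example (illustrative comment, added for clarity): each '_'-separated stage string of
# the MsViT arch overrides these defaults, so
#   >>> parse_arch('l1,h3,d96,n2,p4')
#   {'l': 1, 'h': 3, 'd': 96, 'n': 2, 's': 1, 'g': 1, 'p': 4, 'f': 7, 'a': 0}
# i.e. stage 1 with 3 heads, dim 96, 2 blocks and patch stride 4, with the remaining
# fields (s, g, f, a) left at their defaults.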
class MsViT(nn.Module):
""" Multiscale Vision Transformer with support for patch or hybrid CNN input stage
"""
def __init__(self, arch, img_size=512, in_chans=3,
num_classes=1000,
qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
norm_embed=False, w=7, d=1, sharew=False, only_glo=False,
share_kv=False,
attn_type='longformerhand', sw_exact=0, mode=0,
out_features=None,
freeze_at=0, #detectron2
**args):
super().__init__()
self.num_classes = num_classes
if 'ln_eps' in args:
ln_eps = args['ln_eps']
self.norm_layer = partial(nn.LayerNorm, eps=ln_eps)
logging.info("Customized LayerNorm EPS: {}".format(ln_eps))
else:
self.norm_layer = norm_layer
self.drop_path_rate = drop_path_rate
self.attn_type = attn_type
self.attn_args = dict({
'attn_type': attn_type,
'qkv_bias': qkv_bias,
'qk_scale': qk_scale,
'drop': drop_rate,
'attn_drop': attn_drop_rate,
'w': w,
'd': d,
'sharew': sharew,
'only_glo': only_glo,
'share_kv': share_kv,
'sw_exact': sw_exact,
'norm_layer': norm_layer,
'mode': mode,
})
self.patch_embed_args = dict({
'norm_layer': norm_layer,
'norm_embed': norm_embed,
'drop_rate': drop_rate,
})
self.mlp_args = dict({
'mlp_ratio': 4.0,
'norm_layer': norm_layer,
'act_layer': nn.GELU,
'drop': drop_rate,
})
# Attributes for maskrcnn
assert out_features, "out_features is empty!"
self._out_feature_strides = []
self._out_feature_channels = []
self._out_features = out_features
self.frozen_stages = freeze_at
self.layer_cfgs = [parse_arch(layer) for layer in arch.split('_')]
self.num_layers = len(self.layer_cfgs)
self.depth = sum([cfg['n'] for cfg in self.layer_cfgs])
self.out_planes = self.layer_cfgs[-1]['d']
self.Nglos = [cfg['g'] for cfg in self.layer_cfgs]
self.avg_pool = args['avg_pool'] if 'avg_pool' in args else False
# ensure divisibility
stride = 1
down_strides = []
for cfg in self.layer_cfgs:
stride *= cfg['p']
down_strides.append(stride)
self._size_divisibility = stride
self.Nx = (img_size + (stride - 1)) // stride * stride
self.Ny = (img_size + (stride - 1)) // stride * stride
dprs = torch.linspace(0, drop_path_rate, self.depth).split(
[cfg['n'] for cfg in self.layer_cfgs]
) # stochastic depth decay rule
self.layer1 = self._make_layer(in_chans, self.layer_cfgs[0],
dprs=dprs[0], layerid=1)
if "layer1" in self._out_features:
self._out_feature_strides.append(down_strides[0])
self._out_feature_channels.append(self.layer_cfgs[0]['d'])
self.layer2 = self._make_layer(self.layer_cfgs[0]['d'],
self.layer_cfgs[1], dprs=dprs[1],
layerid=2)
if "layer2" in self._out_features:
self._out_feature_strides.append(down_strides[1])
self._out_feature_channels.append(self.layer_cfgs[1]['d'])
self.layer3 = self._make_layer(self.layer_cfgs[1]['d'],
self.layer_cfgs[2], dprs=dprs[2],
layerid=3)
if "layer3" in self._out_features:
self._out_feature_strides.append(down_strides[2])
self._out_feature_channels.append(self.layer_cfgs[2]['d'])
if self.num_layers == 3:
self.layer4 = None
elif self.num_layers == 4:
self.layer4 = self._make_layer(self.layer_cfgs[2]['d'],
self.layer_cfgs[3], dprs=dprs[3],
layerid=4)
if "layer4" in self._out_features:
self._out_feature_strides.append(down_strides[3])
self._out_feature_channels.append(self.layer_cfgs[3]['d'])
else:
raise ValueError("Numer of layers {} not implemented yet!".format(self.num_layers))
assert self._size_divisibility==stride, "Some stride down layer has been ignored!"
self.apply(self._init_weights)
def _freeze_stages(self):
if self.frozen_stages <= 0:
return
if self.frozen_stages >= 1:
# freeze the first patch embedding layer
self.layer1[0].eval()
for param in self.layer1[0].parameters():
param.requires_grad = False
if self.frozen_stages >= 2:
# freeze layer1 to layer{frozen_stages-1}
for i in range(1, self.frozen_stages):
m = getattr(self, "layer" + str(i))
m.eval()
for param in m.parameters():
param.requires_grad = False
def train(self, mode=True):
"""Convert the model into training mode while keep layers freezed."""
super(MsViT, self).train(mode)
self._freeze_stages()
def reset_vil_mode(self, mode):
longformer_attentions = find_modules(self, Long2DSCSelfAttention)
for longformer_attention in longformer_attentions:
mode_old = longformer_attention.mode
if mode_old != mode:
longformer_attention.mode = mode
logging.info(
"Change vil attention mode from {} to {} in " "layer {}"
.format(mode_old, mode, longformer_attention))
return
@property
def size_divisibility(self):
return self._size_divisibility
def _make_layer(self, in_dim, layer_cfg, dprs, layerid=0):
layer_id, num_heads, dim, num_block, is_sparse_attn, nglo, patch_size, num_feats, ape \
= layer_cfg['l'], layer_cfg['h'], layer_cfg['d'], layer_cfg['n'], \
layer_cfg['s'], layer_cfg['g'], layer_cfg['p'], layer_cfg['f'], \
layer_cfg['a']
assert layerid == layer_id, "Error in _make_layer: layerid {} does not equal layer_id {}".format(layerid, layer_id)
self.Nx = nx = self.Nx // patch_size
self.Ny = ny = self.Ny // patch_size
seq_len = nx * ny + nglo
self.attn_args['nglo'] = nglo
self.patch_embed_args['nglo'] = nglo
self.attn_args['num_feats'] = num_feats # shared for linformer and performer
self.attn_args['rratio'] = num_feats # srformer reuses this parameter
self.attn_args['w'] = num_feats # longformer reuses this parameter
if is_sparse_attn == 0:
self.attn_args['attn_type'] = 'full'
# patch embedding
layers = [
PatchEmbed(patch_size, nx, ny, in_chans=in_dim, embed_dim=dim, ape=ape,
**self.patch_embed_args)
]
for dpr in dprs:
layers.append(AttnBlock(
dim, num_heads, drop_path=dpr, seq_len=seq_len, rpe=not ape,
wx=nx, wy=ny,
**self.attn_args
))
layers.append(MlpBlock(dim, drop_path=dpr, **self.mlp_args))
return nn.Sequential(*layers)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay(self):
no_decay = {'pos_embed', 'cls_token',
'norm.weight', 'norm.bias',
'norm_embed', 'head.bias',
'relative_position'}
return no_decay
def get_classifier(self):
return self.head
def forward(self, x):
B = x.shape[0]
outputs = []
x, nx, ny = self.layer1((x, None, None))
if "layer1" in self._out_features:
outputs.append(
x[:, self.Nglos[0]:].transpose(-2, -1).reshape(B, -1, nx, ny)
)
x = x[:, self.Nglos[0]:].transpose(-2, -1).reshape(B, -1, nx, ny)
x, nx, ny = self.layer2((x, nx, ny))
if "layer2" in self._out_features:
outputs.append(
x[:, self.Nglos[1]:].transpose(-2, -1).reshape(B, -1, nx, ny)
)
x = x[:, self.Nglos[1]:].transpose(-2, -1).reshape(B, -1, nx, ny)
x, nx, ny = self.layer3((x, nx, ny))
if "layer3" in self._out_features:
outputs.append(
x[:, self.Nglos[2]:].transpose(-2, -1).reshape(B, -1, nx, ny)
)
if self.layer4 is not None:
x = x[:, self.Nglos[2]:].transpose(-2, -1).reshape(B, -1, nx, ny)
x, nx, ny = self.layer4((x, nx, ny))
if "layer4" in self._out_features:
outputs.append(
x[:, self.Nglos[3]:].transpose(-2, -1).reshape(B, -1, nx, ny)
)
return outputs
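# Shape note (illustrative comment): stage strides multiply, so for a 4-stage arch with
# patch strides p = 4, 2, 2, 2 and out_features = ["layer4"], a (B, 3, 512, 512) input
# yields a single stride-32 feature map of shape (B, d4, 16, 16), where d4 is the last
# stage's 'd' value (exposed as self.out_planes).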
def build_msvit_backbone(cfg):
args = dict(
img_size=cfg.INPUT.MAX_SIZE_TRAIN,
drop_rate=cfg.MODEL.TRANSFORMER.DROP,
drop_path_rate=cfg.MODEL.TRANSFORMER.DROP_PATH,
norm_embed=cfg.MODEL.TRANSFORMER.NORM_EMBED,
avg_pool=cfg.MODEL.TRANSFORMER.AVG_POOL,
freeze_at=cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT,
out_features=cfg.MODEL.TRANSFORMER.OUT_FEATURES
)
args['arch'] = cfg.MODEL.TRANSFORMER.MSVIT.ARCH
args['sharew'] = cfg.MODEL.TRANSFORMER.MSVIT.SHARE_W
args['attn_type'] = cfg.MODEL.TRANSFORMER.MSVIT.ATTN_TYPE
args['share_kv'] = cfg.MODEL.TRANSFORMER.MSVIT.SHARE_KV
args['only_glo'] = cfg.MODEL.TRANSFORMER.MSVIT.ONLY_GLOBAL
args['sw_exact'] = cfg.MODEL.TRANSFORMER.MSVIT.SW_EXACT
args['ln_eps'] = cfg.MODEL.TRANSFORMER.MSVIT.LN_EPS
args['mode'] = cfg.MODEL.TRANSFORMER.MSVIT.MODE
return MsViT(**args)
class ViTHead(nn.Module):
def __init__(
self,
in_dim, layer_cfgstr, input_size=14,
qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
norm_embed=False, **args
):
super(ViTHead, self).__init__()
if 'ln_eps' in args:
ln_eps = args['ln_eps']
self.norm_layer = partial(nn.LayerNorm, eps=ln_eps)
logging.info("Customized LayerNorm EPS: {}".format(ln_eps))
else:
self.norm_layer = norm_layer
self.drop_path_rate = drop_path_rate
self.attn_args = dict({
'attn_type': 'full', # full attention for head
'qkv_bias': qkv_bias,
'qk_scale': qk_scale,
'drop': drop_rate,
'attn_drop': attn_drop_rate,
'norm_layer': norm_layer,
'drop_path': drop_path_rate,
})
self.patch_embed_args = dict({
'norm_layer': norm_layer,
'norm_embed': norm_embed,
'drop_rate': drop_rate,
})
self.mlp_args = dict({
'mlp_ratio': 4.0,
'norm_layer': norm_layer,
'act_layer': nn.GELU,
'drop': drop_rate,
'drop_path': drop_path_rate,
})
layer_cfg = parse_arch(layer_cfgstr)
layer_id, num_heads, dim, num_block, is_sparse_attn, nglo, patch_size, num_feats, ape \
= layer_cfg['l'], layer_cfg['h'], layer_cfg['d'], layer_cfg['n'], \
layer_cfg['s'], layer_cfg['g'], layer_cfg['p'], layer_cfg['f'], \
layer_cfg['a']
self.input_size = input_size
self.nglo = nglo
assert input_size % patch_size == 0, "Input size is not divisible by the patch size in ViTHead!"
assert nglo == 0, "Number of global tokens in ViTHead is not 0!"
nx = self.input_size // patch_size
ny = self.input_size // patch_size
seq_len = nx * ny + nglo
# patch embedding
layers = [
PatchEmbed(patch_size, nx, ny, in_chans=in_dim, embed_dim=dim,
ape=ape, nglo=nglo, **self.patch_embed_args)
]
for block_id in range(num_block):
layers.append(AttnBlock(
dim, num_heads, seq_len=seq_len, rpe=not ape,
wx=nx, wy=ny, nglo=nglo,
**self.attn_args
))
layers.append(MlpBlock(dim, **self.mlp_args))
self.layer4 = nn.Sequential(*layers)
self.norm = norm_layer(dim)
self.out_channels = dim
def forward(self, x):
B, C, nx, ny = x.shape
assert nx == ny == self.input_size, "Input size does not match the initialized size in ViTHead!"
nglo = self.nglo
x, nx, ny = self.layer4((x, None, None))
x = self.norm(x)
x = x[:, nglo:].transpose(-2, -1).reshape(B, -1, nx, ny)
return x

View file

@ -0,0 +1,366 @@
# Copyright (c) 2021 Microsoft Corporation. Licensed under the MIT license.
# Written by Pengchuan Zhang, penzhan@microsoft.com
from functools import lru_cache
import torch
from torch import einsum
from torch.cuda.amp import autocast
class SlidingChunk2D(torch.autograd.Function):
"""
Class to encapsulate for sliding chunk implementation of vision longformer
"""
mode_dict = {
1: (1, 1), # -1, -1
2: (1, 0), # -1, 0
3: (1, -1), # -1, 1
4: (0, 1), # 0, -1
5: (0, -1), # 0, 1
6: (-1, 1), # 1, -1
7: (-1, 0), # 1, 0
8: (-1, -1), # 1, 1
}
@staticmethod
def slidingchunk_qk(q_img: torch.Tensor, k_img: torch.Tensor, mode: int):
'''
q_img x k_img = attn11 ==> Useful for query x key = attention_scores
The cyclic padding strategy
q_img, k_img: (B * H, M, mx, my, W**2)
attn11 (B*H, mx, my, W**2, 9*W**2), mode=0
(B*H, mx, my, W**2, W**2), mode=-1
(B*H, mx, my, W**2, 2*W**2), mode=i>0
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
'''
if mode == 0:
return torch.cat([
# -1, -1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=(1, 1), dims=(2, 3))),
# -1, 0
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=1, dims=2)),
# -1, 1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=(1, -1), dims=(2, 3))),
# 0, -1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=1, dims=3)),
# 0, 0
einsum('b c m n l, b c m n t -> b m n l t', q_img,
k_img),
# 0, 1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=-1, dims=3)),
# 1, -1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=(-1, 1), dims=(2, 3))),
# 1, 0
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=-1, dims=2)),
# 1, 1
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=(-1, -1), dims=(2, 3))),
], dim=-1)
elif mode == -1:
return einsum(
'b c m n l, b c m n t -> b m n l t', q_img, k_img
) * 1.0
else:
shift = SlidingChunk2D.mode_dict[mode]
return torch.cat([
# 0, 0
einsum('b c m n l, b c m n t -> b m n l t', q_img, k_img),
# x, x
einsum('b c m n l, b c m n t -> b m n l t', q_img,
torch.roll(k_img, shifts=shift, dims=(2, 3))),
], dim=-1)
@staticmethod
def slidingchunk_av(attn: torch.Tensor, v_img: torch.Tensor, mode: int):
'''
attn x v_img = x ==> Useful for attn x value = context
The cyclic padding strategy
v_img, context: (B * H, M, mx, my, W**2)
attn (B*H, mx, my, W**2, 9*W**2), mode=0
(B*H, mx, my, W**2, W**2), mode=-1
(B*H, mx, my, W**2, 2*W**2), mode=i>0
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
'''
w2 = v_img.shape[-1]
if mode == 0:
attnn1n1, attnn10, attnn11, attn0n1, attn00, attn01, attn1n1, attn10, attn11 = torch.split(
attn, w2, dim=-1
)
elif mode == -1:
attn00 = attn
else:
attn00, attnxx = torch.split(
attn, w2, dim=-1
)
output = einsum('b m n l t, b c m n t -> b c m n l', attn00, v_img) # 0,0
if mode == 0:
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnn1n1,
torch.roll(v_img, shifts=(1, 1), dims=(2, 3))) # -1,-1
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnn10,
torch.roll(v_img, shifts=1, dims=2)) # -1,0
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnn11,
torch.roll(v_img, shifts=(1, -1), dims=(2, 3))) # -1,1
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn0n1,
torch.roll(v_img, shifts=1, dims=3)) # 0,-1
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn01,
torch.roll(v_img, shifts=-1, dims=3)) # 0,1
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn1n1,
torch.roll(v_img, shifts=(-1, 1), dims=(2, 3))) # 1,-1
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn10,
torch.roll(v_img, shifts=-1, dims=2)) # 1,0
output = output + einsum('b m n l t, b c m n t -> b c m n l', attn11,
torch.roll(v_img, shifts=(-1, -1), dims=(2, 3))) # 1,1
elif mode > 0:
shift = SlidingChunk2D.mode_dict[mode]
output = output + einsum('b m n l t, b c m n t -> b c m n l', attnxx,
torch.roll(v_img, shifts=shift, dims=(2, 3)))  # the sampled neighbor block
else:
output = output * 1.0
return output
@staticmethod
def slidingchunk_agrad(attn: torch.Tensor, grad_x: torch.Tensor, mode: int):
'''
attn.t() x grad_x = grad_v ==> Useful for backpropagating the output gradient to the value
The cyclic padding strategy
grad_x, grad_v: (B * H, M, mx, my, W**2)
attn (B*H, mx, my, W**2, 9*W**2), mode=0
(B*H, mx, my, W**2, W**2), mode=-1
(B*H, mx, my, W**2, 2*W**2), mode=i>0
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
'''
w2 = grad_x.shape[-1]
if mode == 0:
attnn1n1, attnn10, attnn11, attn0n1, attn00, attn01, attn1n1, attn10, attn11 = torch.split(
attn, w2, dim=-1
)
elif mode == -1:
attn00 = attn
else:
attn00, attnxx = torch.split(
attn, w2, dim=-1
)
# 0,0
output = einsum('b m n l t, b c m n l -> b c m n t', attn00, grad_x)
if mode == 0:
# -1,-1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attnn1n1, grad_x),
shifts=(-1, -1), dims=(2, 3))
# -1,0
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attnn10, grad_x),
shifts=-1, dims=2)
# -1,1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attnn11, grad_x),
shifts=(-1, 1), dims=(2, 3))
# 0,-1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attn0n1, grad_x),
shifts=-1, dims=3)
# 0,1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attn01, grad_x),
shifts=1, dims=3)
# 1,-1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attn1n1, grad_x),
shifts=(1, -1), dims=(2, 3))
# 1,0
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attn10, grad_x),
shifts=1, dims=2)
# 1,1
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attn11, grad_x),
shifts=(1, 1), dims=(2, 3))
elif mode > 0:
shift = SlidingChunk2D.mode_dict[mode]
shift = (-shift[0], -shift[1])
output = output + torch.roll(
einsum('b m n l t, b c m n l -> b c m n t', attnxx, grad_x),
shifts=shift, dims=(2, 3))
else:
output = output * 1.0
return output
@staticmethod
@autocast() # comment this out if AMP is not used
def forward(ctx, t1: torch.Tensor, t2: torch.Tensor,
is_t1_diagonaled: bool = False, mode: int = 0) -> torch.Tensor:
"""Compuates sliding chunk mm of t1 and t2.
args:
t1: torch.Tensor = (B * H, M, mx, my, W**2) if is_t1_diagonaled = false,
= (B*H, mx, my, W**2, 9*W**2) if is_t1_diagonaled = true, mode=0.
= (B*H, mx, my, W**2, W**2) if is_t1_diagonaled = true, mode=-1.
= (B*H, mx, my, W**2, 2*W**2) if is_t1_diagonaled = true, mode=i>0.
t2: torch.Tensor = (B * H, M, mx, my, W**2). This is always a
non-diagonaled tensor, e.g. `key_layer` or `value_layer`
is_t1_diagonaled: is t1 a diagonaled or a regular tensor
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
returns:
is_t1_diagonaled = true:
torch.Tensor = (B * H, M, mx, my, W**2)
mode=0, is_t1_diagonaled = false:
torch.Tensor = (B*H, mx, my, W**2, 9*W**2)
mode=-1, is_t1_diagonaled = false:
torch.Tensor = (B*H, mx, my, W**2, W**2)
mode=i>0, is_t1_diagonaled = false:
torch.Tensor = (B*H, mx, my, W**2, 2*W**2)
"""
ctx.save_for_backward(t1, t2)
ctx.is_t1_diagonaled = is_t1_diagonaled
ctx.mode = mode
if is_t1_diagonaled:
return SlidingChunk2D.slidingchunk_av(t1, t2, mode)
else:
return SlidingChunk2D.slidingchunk_qk(t1, t2, mode)
@staticmethod
@autocast() # comment this out if AMP is not used
def backward(ctx, grad_output):
t1, t2 = ctx.saved_tensors
is_t1_diagonaled = ctx.is_t1_diagonaled
mode = ctx.mode
if is_t1_diagonaled:
grad_t1 = SlidingChunk2D.slidingchunk_qk(grad_output, t2, mode)
grad_t2 = SlidingChunk2D.slidingchunk_agrad(t1, grad_output, mode)
else:
grad_t1 = SlidingChunk2D.slidingchunk_av(grad_output, t2, mode)
grad_t2 = SlidingChunk2D.slidingchunk_agrad(grad_output, t1, mode)
return grad_t1, grad_t2, None, None
@lru_cache()
def _get_invalid_locations_mask_cyclic(nx: int, ny: int, padx: int, pady: int,
w: int, device: str):
w2 = w ** 2
mask = torch.BoolTensor([
[
(i // ny + (j // w2) // 3 == nx and
(nx - 1) * w + (j % w2) // w >= nx * w - padx) or
(i % ny + (j // w2) % 3 == ny and
(ny - 1) * w + (j % w2) % w >= ny * w - pady)
for j in range(9 * w2)
]
for i in range(nx * ny)
], device='cpu')
# We should count the w2 in the query here
num_invalid = w2 * mask.sum()
return mask.to(device), num_invalid.to(device)
@lru_cache()
def _get_invalid_locations_mask_zero(nx: int, ny: int, padx: int, pady: int,
w: int, device: str):
w2 = w ** 2
mask = torch.BoolTensor([
[
i // ny + (j // w2) // 3 - 1 < 0 or
i // ny + (j // w2) // 3 - 1 >= nx or
(i // ny + (j // w2) // 3 - 1) * w + (j % w2) // w >= nx * w - padx or
i % ny + (j // w2) % 3 - 1 < 0 or
i % ny + (j // w2) % 3 - 1 >= ny or
(i % ny + (j // w2) % 3 - 1) * w + (j % w2) % w >= ny * w - pady
for j in range(9 * w2)
]
for i in range(nx * ny)
], device='cpu')
# We should count the w2 in the query here
num_invalid = w2 * mask.sum()
return mask.to(device), num_invalid.to(device)
@lru_cache()
def _get_invalid_locations_mask_exact(nx: int, ny: int, padx: int, pady: int,
w: int, device: str):
w2 = w ** 2
nx_max = nx * w - 1 - padx
ny_max = ny * w - 1 - pady
mask = torch.BoolTensor([
[
[
(i // ny + (j // w2) // 3 - 1) * w + (j % w2) // w < max(0, (
i // ny - 1) * w + l // w) or
(i // ny + (j // w2) // 3 - 1) * w + (j % w2) // w > min(
nx_max, (i // ny + 1) * w + l // w) or
(i % ny + (j // w2) % 3 - 1) * w + (j % w2) % w < max(0, (
i % ny - 1) * w + l % w) or
(i % ny + (j // w2) % 3 - 1) * w + (j % w2) % w > min(
ny_max, (i % ny + 1) * w + l % w)
for j in range(9 * w2)
]
for l in range(w2)
]
for i in range(nx * ny)
], device='cpu')
num_invalid = mask.sum()
return mask.to(device), num_invalid.to(device)
def mask_invalid_locations(input_tensor: torch.Tensor, nx: int, ny: int,
padx: int, pady: int, w: int,
exact: int, mode: int = 0) -> torch.Tensor:
"""exact
1: exact sliding window
0: blockwise sliding chunk with zero padding
-1: blockwise sliding chunk with cyclic padding
mode: 0 -> full, -1 -> only self, i (>0) -> self+block_i
"""
w2 = w ** 2
if exact == 1 and mode == 0:
mask, num_invalid = _get_invalid_locations_mask_exact(
nx, ny, padx, pady, w, input_tensor.device)
mask = mask.view(1, nx, ny, w2, -1).expand(input_tensor.size())
else:
if exact == 0:
mask, num_invalid = _get_invalid_locations_mask_zero(
nx, ny, padx, pady, w, input_tensor.device)
elif exact == -1:
mask, num_invalid = _get_invalid_locations_mask_cyclic(
nx, ny, padx, pady, w, input_tensor.device)
else:
raise ValueError("longsc exact should be in [0,1,-1]!")
if mode == -1:
mask = mask[:, 4 * w2:5 * w2]
num_invalid = w2 * mask.sum()
elif mode > 0:
chunk_id = mode if mode > 4 else mode - 1
mask = torch.cat([
mask[:, 4 * w2:5 * w2],
mask[:, chunk_id * w2:(chunk_id+1) * w2],
], dim=-1)
num_invalid = w2 * mask.sum()
mask = mask.view(1, nx, ny, 1, -1).expand(input_tensor.size())
input_tensor.masked_fill_(mask, -float('inf'))
return num_invalid
def slidingchunk_2dautograd(t1: torch.Tensor, t2: torch.Tensor,
is_t1_diagonaled: bool = False, mode: int = 0) -> torch.Tensor:
if is_t1_diagonaled:
return SlidingChunk2D.slidingchunk_av(t1, t2, mode)
else:
return SlidingChunk2D.slidingchunk_qk(t1, t2, mode)
slidingchunk_2d = SlidingChunk2D.apply
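# A minimal self-contained sketch of how the pieces above fit together (toy sizes;
# the exact call sequence inside the attention layer is an assumption):
# 2 = batch*heads, 8 = head dim, a 4x4 grid of chunks, window w=3 (so W**2 = 9).
if __name__ == "__main__":
    q = torch.randn(2, 8, 4, 4, 9)
    k = torch.randn(2, 8, 4, 4, 9)
    v = torch.randn(2, 8, 4, 4, 9)
    attn = slidingchunk_2dautograd(q, k, False, 0)   # scores: (2, 4, 4, 9, 9*9)
    mask_invalid_locations(attn, 4, 4, 0, 0, 3, exact=0, mode=0)  # -inf on padded keys
    attn = attn.softmax(dim=-1)
    out = slidingchunk_2dautograd(attn, v, True, 0)  # context: (2, 8, 4, 4, 9)
    # slidingchunk_2d (the autograd.Function) takes the same arguments and uses the
    # hand-written backward above instead of relying on autograd.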

Просмотреть файл

@ -35,8 +35,8 @@ class BalancedPositiveNegativeSampler(object):
pos_idx = []
neg_idx = []
for matched_idxs_per_image in matched_idxs:
positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1)
negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1)
positive = torch.nonzero(matched_idxs_per_image >= 1, as_tuple=False).squeeze(1)
negative = torch.nonzero(matched_idxs_per_image == 0, as_tuple=False).squeeze(1)
num_pos = int(self.batch_size_per_image * self.positive_fraction)
# protect against not enough positive examples

Просмотреть файл

@ -6,6 +6,7 @@ Implements the Generalized R-CNN framework
import torch
from torch import nn
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.image_list import to_image_list
from ..backbone import build_backbone
@ -29,6 +30,7 @@ class GeneralizedRCNN(nn.Module):
self.backbone = build_backbone(cfg)
self.rpn = build_rpn(cfg, self.backbone.out_channels)
self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels)
self.force_boxes = cfg.MODEL.RPN.FORCE_BOXES
def forward(self, images, targets=None):
"""
@ -45,9 +47,30 @@ class GeneralizedRCNN(nn.Module):
"""
if self.training and targets is None:
raise ValueError("In training mode, targets should be passed")
if self.force_boxes and targets is None:
# note: targets cannot be None here, but may contain 0 boxes.
raise ValueError("In force_boxes setting, targets should be passed")
images = to_image_list(images)
features = self.backbone(images.tensors)
proposals, proposal_losses = self.rpn(images, features, targets)
if targets:
targets = [target.to(self.device)
for target in targets if target is not None]
if self.force_boxes:
proposals = [BoxList(target.bbox, target.size, target.mode)
for target in targets]
if self.training:
# note: we still need to compute a loss over all rpn
# named parameters, otherwise distributed training
# raises an unused-parameters error.
null_loss = 0
for key, param in self.rpn.named_parameters():
null_loss += 0.0 * param.sum()
proposal_losses = {'rpn_null_loss': null_loss}
else:
proposals, proposal_losses = self.rpn(images, features, targets)
if self.roi_heads:
x, result, detector_losses = self.roi_heads(features, proposals, targets)
else:

Просмотреть файл

@ -101,7 +101,7 @@ class Matcher(object):
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
# Find highest quality match available, even if it is low, including ties
gt_pred_pairs_of_highest_quality = torch.nonzero(
match_quality_matrix == highest_quality_foreach_gt[:, None]
match_quality_matrix == highest_quality_foreach_gt[:, None], as_tuple=False
)
# Example gt_pred_pairs_of_highest_quality:
# tensor([[ 0, 39796],

Просмотреть файл

@ -114,7 +114,7 @@ class Pooler(nn.Module):
device=device,
)
for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)):
idx_in_level = torch.nonzero(levels == level).squeeze(1)
idx_in_level = torch.nonzero(levels == level, as_tuple=False).squeeze(1)
rois_per_level = rois[idx_in_level]
result[idx_in_level] = pooler(per_level_feature, rois_per_level).to(dtype)

Просмотреть файл

@ -201,7 +201,7 @@ class PostProcessor(nn.Module):
inds_all = scores > self.score_thresh
boxlist_empty = self.prepare_empty_boxlist(boxlist)
for j in range(1, num_classes):
inds = inds_all[:, j].nonzero().squeeze(1)
inds = inds_all[:, j].nonzero(as_tuple=False).squeeze(1)
if len(inds)>0:
scores_j = scores[inds, j]
@ -239,7 +239,7 @@ class PostProcessor(nn.Module):
cls_scores.cpu(), number_of_detections - self.detections_per_img + 1
)
keep = cls_scores >= image_thresh.item()
keep = torch.nonzero(keep).squeeze(1)
keep = torch.nonzero(keep, as_tuple=False).squeeze(1)
result = result[keep]
return result
@ -273,7 +273,7 @@ class PostProcessor(nn.Module):
# filter duplicate boxes
scores_pre, labels_pre = dists_all.max(1)
inds_pre = scores_pre.nonzero()
inds_pre = scores_pre.nonzero(as_tuple=False)
assert inds_pre.dim() != 0
inds_pre = inds_pre.squeeze(1)
@ -331,7 +331,7 @@ class PostProcessor(nn.Module):
hs = (y2 - y1).squeeze(1)
keep = (
(ws >= 0) & (hs >= 0) & (scores > self.score_thresh * 0.01)
).nonzero().squeeze(1)
).nonzero(as_tuple=False).squeeze(1)
del ws, hs
# apply nms to the previous low-thresholded results

Просмотреть файл

@ -118,7 +118,7 @@ class FastRCNNLossComputation(object):
for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
zip(sampled_pos_inds, sampled_neg_inds)
):
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img, as_tuple=False).squeeze(1)
proposals_per_image = proposals[img_idx][img_sampled_inds]
proposals[img_idx] = proposals_per_image
@ -182,7 +182,7 @@ class FastRCNNLossComputation(object):
# get indices that correspond to the regression targets for
# the corresponding ground truth labels, to be used with
# advanced indexing
sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1)
sampled_pos_inds_subset = torch.nonzero(labels > 0, as_tuple=False).squeeze(1)
labels_pos = labels[sampled_pos_inds_subset]
if self.cls_agnostic_bbox_reg:
map_inds = torch.tensor([4, 5, 6, 7], device=device)

Просмотреть файл

@ -9,6 +9,7 @@ from maskrcnn_benchmark.modeling.backbone import resnet
from maskrcnn_benchmark.modeling.poolers import Pooler
from maskrcnn_benchmark.modeling.make_layers import group_norm
from maskrcnn_benchmark.modeling.make_layers import make_fc
from maskrcnn_benchmark.modeling.backbone.msvit import ViTHead
@registry.ROI_BOX_FEATURE_EXTRACTORS.register("ResNet50Conv5ROIFeatureExtractor")
@ -158,6 +159,41 @@ class FPNXconv1fcFeatureExtractor(nn.Module):
return x
@registry.ROI_BOX_FEATURE_EXTRACTORS.register("ViTHeadFeatureExtractor")
class ViTHeadFeatureExtractor(nn.Module):
def __init__(self, config, in_channels):
super(ViTHeadFeatureExtractor, self).__init__()
resolution = config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
scales = config.MODEL.ROI_BOX_HEAD.POOLER_SCALES
sampling_ratio = config.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
pooler = Pooler(
output_size=(resolution, resolution),
scales=scales,
sampling_ratio=sampling_ratio,
)
# VIT head
args = dict(
input_size=config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION,
drop_rate=config.MODEL.TRANSFORMER.DROP,
drop_path_rate=config.MODEL.TRANSFORMER.DROP_PATH,
norm_embed=config.MODEL.TRANSFORMER.NORM_EMBED,
layer_cfgstr=config.MODEL.TRANSFORMER.VITHEADARCH,
ln_eps=config.MODEL.TRANSFORMER.MSVIT.LN_EPS,
)
head = ViTHead(in_dim=in_channels, **args)
self.pooler = pooler
self.head = head
self.out_channels = head.out_channels
def forward(self, x, proposals):
x = self.pooler(x, proposals)
x = self.head(x)
return x
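# This extractor is selected by setting
#   MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
# (and the matching ROI_ATTRIBUTE_HEAD entry) in the config, as the ViL-C4 yaml
# files in this commit do; make_roi_box_feature_extractor below then builds it with
# the backbone's out_channels, and self.out_channels feeds the box predictor.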
def make_roi_box_feature_extractor(cfg, in_channels):
func = registry.ROI_BOX_FEATURE_EXTRACTORS[
cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR

Просмотреть файл

@ -135,7 +135,7 @@ class KeypointRCNNLossComputation(object):
for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
zip(sampled_pos_inds, sampled_neg_inds)
):
img_sampled_inds = torch.nonzero(pos_inds_img).squeeze(1)
img_sampled_inds = torch.nonzero(pos_inds_img, as_tuple=False).squeeze(1)
proposals_per_image = proposals[img_idx][img_sampled_inds]
proposals[img_idx] = proposals_per_image
@ -155,7 +155,7 @@ class KeypointRCNNLossComputation(object):
keypoint_targets = cat(heatmaps, dim=0)
valid = cat(valid, dim=0).to(dtype=torch.bool)
valid = torch.nonzero(valid).squeeze(1)
valid = torch.nonzero(valid, as_tuple=False).squeeze(1)
# torch.mean (in binary_cross_entropy_with_logits) doesn't
# accept empty tensors, so handle it separately

Просмотреть файл

@ -83,7 +83,7 @@ class MaskRCNNLossComputation(object):
labels_per_image[neg_inds] = 0
# mask scores are only computed on positive samples
positive_inds = torch.nonzero(labels_per_image > 0).squeeze(1)
positive_inds = torch.nonzero(labels_per_image > 0, as_tuple=False).squeeze(1)
segmentation_masks = matched_targets.get_field("masks")
segmentation_masks = segmentation_masks[positive_inds]
@ -114,7 +114,7 @@ class MaskRCNNLossComputation(object):
labels = cat(labels, dim=0)
mask_targets = cat(mask_targets, dim=0)
positive_inds = torch.nonzero(labels > 0).squeeze(1)
positive_inds = torch.nonzero(labels > 0, as_tuple=False).squeeze(1)
labels_pos = labels[positive_inds]
# torch.mean (in binary_cross_entropy_with_logits) doesn't

Просмотреть файл

@ -27,7 +27,7 @@ def keep_only_positive_boxes(boxes):
for boxes_per_image in boxes:
labels = boxes_per_image.get_field("labels")
inds_mask = labels > 0
inds = inds_mask.nonzero().squeeze(1)
inds = inds_mask.nonzero(as_tuple=False).squeeze(1)
positive_boxes.append(boxes_per_image[inds])
positive_inds.append(inds_mask)
return positive_boxes, positive_inds

Просмотреть файл

@ -104,8 +104,8 @@ class RPNLossComputation(object):
anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors]
labels, regression_targets = self.prepare_targets(anchors, targets)
sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)
sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0), as_tuple=False).squeeze(1)
sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0), as_tuple=False).squeeze(1)
sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)

Просмотреть файл

@ -103,7 +103,7 @@ class RetinaNetPostProcessor(RPNPostProcessor):
per_box_cls.topk(per_pre_nms_top_n, sorted=False)
per_candidate_nonzeros = \
per_candidate_inds.nonzero()[top_k_indices, :]
per_candidate_inds.nonzero(as_tuple=False)[top_k_indices, :]
per_box_loc = per_candidate_nonzeros[:, 0]
per_class = per_candidate_nonzeros[:, 1]
@ -138,7 +138,7 @@ class RetinaNetPostProcessor(RPNPostProcessor):
result = []
# skip the background
for j in range(1, self.num_classes):
inds = (labels == j).nonzero().view(-1)
inds = (labels == j).nonzero(as_tuple=False).view(-1)
scores_j = scores[inds]
boxes_j = boxes[inds, :].view(-1, 4)
@ -167,7 +167,7 @@ class RetinaNetPostProcessor(RPNPostProcessor):
number_of_detections - self.fpn_post_nms_top_n + 1
)
keep = cls_scores >= image_thresh.item()
keep = torch.nonzero(keep).squeeze(1)
keep = torch.nonzero(keep, as_tuple=False).squeeze(1)
result = result[keep]
results.append(result)
return results

Просмотреть файл

@ -61,7 +61,7 @@ class RetinaNetLossComputation(RPNLossComputation):
labels = torch.cat(labels, dim=0)
regression_targets = torch.cat(regression_targets, dim=0)
pos_inds = torch.nonzero(labels > 0).squeeze(1)
pos_inds = torch.nonzero(labels > 0, as_tuple=False).squeeze(1)
retinanet_regression_loss = smooth_l1_loss(
box_regression[pos_inds],

Просмотреть файл

@ -1,4 +1,4 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from .build import make_optimizer
from .build import make_optimizer, make_optimizer_d2
from .build import make_lr_scheduler
from .lr_scheduler import WarmupMultiStepLR

Просмотреть файл

@ -1,4 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import itertools
import torch
from .lr_scheduler import WarmupMultiStepLR
@ -20,6 +21,65 @@ def make_optimizer(cfg, model):
return optimizer
def make_optimizer_d2(cfg, model):
# default no decay parameters for resnets
no_decay = ['bn.bias', 'bn.weight', 'bn1.bias', 'bn1.weight',
'bn2.bias', 'bn2.weight', 'bn3.bias', 'bn3.weight']
if hasattr(model.backbone.body, 'no_weight_decay'):
no_decay = list(model.backbone.body.no_weight_decay())
params = []
memo = set()
for key, value in model.named_parameters(recurse=True):
if not value.requires_grad:
continue
# Avoid duplicating parameters
if value in memo:
continue
memo.add(value)
lr = cfg.SOLVER.BASE_LR
weight_decay = cfg.SOLVER.WEIGHT_DECAY
if "bias" in key:
lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
if any(nd in key for nd in no_decay):
weight_decay = 0.0
params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]
def maybe_add_full_model_gradient_clipping(optim): # optim: the optimizer class
# detectron2 doesn't have full model gradient clipping now
clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
enable = (
cfg.SOLVER.CLIP_GRADIENTS.ENABLED
and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
and clip_norm_val > 0.0
)
class FullModelGradientClippingOptimizer(optim):
def step(self, closure=None):
all_params = itertools.chain(*[x["params"] for x in self.param_groups])
torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
super().step(closure=closure)
return FullModelGradientClippingOptimizer if enable else optim
optimizer_type = cfg.SOLVER.OPTIMIZER
if optimizer_type == "SGD":
optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
)
elif optimizer_type == "ADAMW":
optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
params, cfg.SOLVER.BASE_LR
)
else:
raise NotImplementedError(f"no optimizer type {optimizer_type}")
return optimizer
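# A standalone sketch of the clipping pattern used above (hypothetical toy model,
# SGD with a global-norm clip of 1.0): step() clips across *all* parameter groups
# before the update.
#   import itertools, torch
#   class ClippedSGD(torch.optim.SGD):
#       def step(self, closure=None):
#           all_params = itertools.chain(*[g["params"] for g in self.param_groups])
#           torch.nn.utils.clip_grad_norm_(all_params, max_norm=1.0)
#           super().step(closure=closure)
#   model = torch.nn.Linear(10, 2)
#   opt = ClippedSGD(model.parameters(), lr=0.01, momentum=0.9)
#   model(torch.randn(4, 10)).sum().backward()
#   opt.step()  # gradients are clipped to global norm 1.0 before the parameter update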
def make_lr_scheduler(cfg, optimizer):
return WarmupMultiStepLR(
optimizer,

Просмотреть файл

@ -45,7 +45,7 @@ def remove_small_boxes(boxlist, min_size):
_, _, ws, hs = xywh_boxes.unbind(dim=1)
keep = (
(ws >= min_size) & (hs >= min_size)
).nonzero().squeeze(1)
).nonzero(as_tuple=False).squeeze(1)
return boxlist[keep]

Просмотреть файл

@ -457,7 +457,7 @@ class PolygonList(object):
# advanced indexing on a single dimension
selected_polygons = []
if isinstance(item, torch.Tensor) and item.dtype == torch.bool:
item = item.nonzero()
item = item.nonzero(as_tuple=False)
item = item.squeeze(1) if item.numel() > 0 else item
item = item.tolist()
for i in item:

Просмотреть файл

@ -0,0 +1,14 @@
from contextlib import contextmanager
@contextmanager
def nullcontext(enter_result=None, **kwargs):
yield enter_result
try:
from torch.cuda.amp import autocast, GradScaler, custom_fwd, custom_bwd
except ImportError:
print('[Warning] torch.cuda.amp was not found; automatic mixed precision is disabled!')
GradScaler = nullcontext
autocast = nullcontext
custom_fwd = nullcontext
custom_bwd = nullcontext
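# A sketch of the standard torch.cuda.amp training step these aliases support
# (assumes the import above succeeded and a CUDA device is available; toy model):
#   model = torch.nn.Linear(10, 2).cuda()
#   optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
#   scaler = GradScaler()
#   with autocast():                      # forward pass in mixed precision
#       loss = model(torch.randn(4, 10, device="cuda")).sum()
#   scaler.scale(loss).backward()         # scale the loss to avoid fp16 underflow
#   scaler.step(optimizer)
#   scaler.update()
# With the nullcontext fallback, autocast() degrades to a harmless no-op.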

Просмотреть файл

@ -1,13 +1,38 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from collections import OrderedDict
import logging
import math
import torch
from maskrcnn_benchmark.utils.imports import import_file
def align_and_update_state_dicts(model_state_dict, loaded_state_dict):
def resize_pos_embed_1d(posemb, shape_new):
# Rescale the grid of position embeddings when loading from state_dict.
ntok_old = posemb.shape[1]
if ntok_old > 1:
ntok_new = shape_new[1]
posemb_grid = posemb.permute(0, 2, 1).unsqueeze(dim=-1)
posemb_grid = torch.nn.functional.interpolate(posemb_grid, size=[ntok_new, 1], mode='bilinear')
posemb_grid = posemb_grid.squeeze(dim=-1).permute(0, 2, 1)
posemb = posemb_grid
return posemb
def resize_pos_embed_2d(posemb, shape_new):
# Rescale the grid of position embeddings when loading from state_dict. Adapted from
# https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
ntok_new = shape_new[0]
gs_old = int(math.sqrt(len(posemb))) # 2 * w - 1
gs_new = int(math.sqrt(ntok_new)) # 2 * w - 1
posemb_grid = posemb.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
posemb_grid = torch.nn.functional.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bilinear')
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(gs_new * gs_new, -1)
return posemb_grid
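# Shape sketch for the two helpers above (illustrative sizes): a 196-token 1-D
# embedding resized to 576 tokens, and a 13x13 relative-position table (w=7,
# (2*7-1)**2 = 169 entries) resized to 29x29 (w=15, 841 entries):
#   old_1d = torch.randn(1, 196, 384)                 # (1, tokens, dim)
#   resize_pos_embed_1d(old_1d, (1, 576, 384)).shape  # -> (1, 576, 384)
#   old_2d = torch.randn(169, 12)                     # (table entries, num_heads)
#   resize_pos_embed_2d(old_2d, (841, 12)).shape      # -> (841, 12)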
def align_and_update_state_dicts(model_state_dict, loaded_state_dict, skip_unmatched_layers=True):
"""
Strategy: suppose that the models that we will create will have prefixes appended
to each of its keys, for example due to an extra level of nesting that the original
@ -41,11 +66,47 @@ def align_and_update_state_dicts(model_state_dict, loaded_state_dict):
max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1
log_str_template = "{: <{}} loaded from {: <{}} of shape {}"
logger = logging.getLogger(__name__)
# print out no match
uninitialized_keys = [current_keys[idx_new] for idx_new, idx_old in enumerate(idxs.tolist()) if idx_old == -1]
logger.info("Parameters not initialized from checkpoint: {}\n".format(
','.join(uninitialized_keys)
))
for idx_new, idx_old in enumerate(idxs.tolist()):
if idx_old == -1:
continue
key = current_keys[idx_new]
key_old = loaded_keys[idx_old]
if model_state_dict[key].shape != loaded_state_dict[
key_old].shape and skip_unmatched_layers:
if 'x_pos_embed' in key or 'y_pos_embed' in key:
shape_old = loaded_state_dict[key_old].shape
shape_new = model_state_dict[key].shape
new_val = resize_pos_embed_1d(loaded_state_dict[key_old],
shape_new)
if shape_new == new_val.shape:
model_state_dict[key] = new_val
logger.info("[RESIZE] {} {} -> {} {}".format(
key_old, shape_old, key, shape_new))
else:
logger.info("[WARNING]", "{} {} != {} {}, skip".format(
key_old, new_val.shape, key, shape_new))
elif 'local_relative_position_bias_table' in key:
shape_old = loaded_state_dict[key_old].shape
shape_new = model_state_dict[key].shape
new_val = resize_pos_embed_2d(loaded_state_dict[key_old],
shape_new)
if shape_new == new_val.shape:
model_state_dict[key] = new_val
logger.info("[RESIZE] {} {} -> {} {}".format(
key_old, shape_old, key, shape_new))
else:
logger.info("[WARNING]", "{} {} != {} {}, skip".format(
key_old, new_val.shape, key, shape_new))
else:
# if the layer weights do not match in size, skip this layer
logger.info(
"SKIPPING LAYER {} because of size mismatch".format(key))
continue
model_state_dict[key] = loaded_state_dict[key_old]
logger.info(
log_str_template.format(

Просмотреть файл

@ -5,6 +5,7 @@ Implements the FRCNN with Attribute Head
import numpy as np
import torch
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.image_list import to_image_list
from maskrcnn_benchmark.modeling.detector.generalized_rcnn import \
GeneralizedRCNN
@ -56,6 +57,9 @@ class AttrRCNN(GeneralizedRCNN):
"""
if self.training and targets is None:
raise ValueError("In training mode, targets should be passed")
if self.force_boxes and targets is None:
# note: targets cannot be None here, but may contain 0 boxes.
raise ValueError("In force_boxes setting, targets should be passed")
images = to_image_list(images)
images = images.to(self.device)
@ -65,7 +69,20 @@ class AttrRCNN(GeneralizedRCNN):
targets = [target.to(self.device)
for target in targets if target is not None]
proposals, proposal_losses = self.rpn(images, features, targets)
if self.force_boxes:
proposals = [BoxList(target.bbox, target.size, target.mode)
for target in targets]
if self.training:
# note: we still need to compute a loss over all rpn
# named parameters, otherwise distributed training
# raises an unused-parameters error.
null_loss = 0
for key, param in self.rpn.named_parameters():
null_loss += 0.0 * param.sum()
proposal_losses = {'rpn_null_loss': null_loss}
else:
proposals, proposal_losses = self.rpn(images, features, targets)
x, predictions, detector_losses = self.roi_heads(features,
proposals, targets)

Просмотреть файл

@ -48,9 +48,9 @@ class AttributeRCNNLossComputation(object):
# prepare attribute targets
sim_attributes = attribute_logits.new(attribute_logits.size()).zero_()
for i in range(len(attributes)):
if len(torch.nonzero(attributes[i])) > 0:
sim_attributes[i][attributes[i][torch.nonzero(attributes[i])].long()] = 1.0 / len(
torch.nonzero(attributes[i]))
if len(torch.nonzero(attributes[i], as_tuple=False)) > 0:
sim_attributes[i][attributes[i][torch.nonzero(attributes[i], as_tuple=False)].long()] = 1.0 / len(
torch.nonzero(attributes[i], as_tuple=False))
# TODO: do we need to ignore the all zero vector?
attribute_loss = self.cross_entropy(attribute_logits, sim_attributes, loss_type="softmax")
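# Concretely (toy numbers): if attributes[i] = [3, 7, 0, 0] and there are 10
# attribute classes, the loop above yields a soft target with 0.5 at indices 3 and 7
# and 0 elsewhere, i.e. a uniform distribution over the annotated attributes that
# the softmax cross-entropy above is trained to match.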

Просмотреть файл

@ -15,6 +15,10 @@ registry.ROI_ATTRIBUTE_FEATURE_EXTRACTORS.register(
"FPNXconv1fcFeatureExtractor", FPNXconv1fcFeatureExtractor
)
registry.ROI_ATTRIBUTE_FEATURE_EXTRACTORS.register(
"ViTHeadFeatureExtractor", ViTHeadFeatureExtractor
)
def make_roi_attribute_feature_extractor(cfg, in_channels):
func = registry.ROI_ATTRIBUTE_FEATURE_EXTRACTORS[

Просмотреть файл

@ -34,8 +34,8 @@ class BalancedPositiveNegativePairSampler(object):
pos_idx = []
neg_idx = []
for matched_idxs_per_image in matched_idxs:
positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1)
negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1)
positive = torch.nonzero(matched_idxs_per_image >= 1, as_tuple=False).squeeze(1)
negative = torch.nonzero(matched_idxs_per_image == 0, as_tuple=False).squeeze(1)
num_pos = int(self.batch_size_per_image * self.positive_fraction)
# protect against not enough positive examples

Просмотреть файл

@ -55,7 +55,7 @@ class FastRCNNLossComputation(object):
match_j = match_quality_matrix[j].view(1, -1)
match_ij = ((match_i + match_j) / 2)
# remove duplicate index
non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero().view(-1).to(match_ij.device)
non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero(as_tuple=False).view(-1).to(match_ij.device)
match_ij = match_ij.view(-1) # [::match_quality_matrix.shape[1]] = 0
match_ij = match_ij[non_duplicate_idx]
temp.append(match_ij)
@ -79,7 +79,7 @@ class FastRCNNLossComputation(object):
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposal.bbox.device)
proposal_idx_pairs = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero()
non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False)
proposal_box_pairs = proposal_box_pairs[non_duplicate_idx.view(-1)]
proposal_idx_pairs = proposal_idx_pairs[non_duplicate_idx.view(-1)]
proposal_pairs = BoxPairList(proposal_box_pairs, proposal.size, proposal.mode)
@ -167,7 +167,7 @@ class FastRCNNLossComputation(object):
for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
zip(sampled_pos_inds, sampled_neg_inds)
):
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img, as_tuple=False).squeeze(1)
proposal_pairs_per_image = proposal_pairs[img_idx][img_sampled_inds]
proposal_pairs[img_idx] = proposal_pairs_per_image
@ -245,13 +245,13 @@ class FastRCNNLossComputation(object):
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposals[0].bbox.device)
proposal_idx_pairs_per_image = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
keep_idx = (proposal_idx_pairs_per_image[:, 0] != proposal_idx_pairs_per_image[:, 1]).nonzero().view(-1)
keep_idx = (proposal_idx_pairs_per_image[:, 0] != proposal_idx_pairs_per_image[:, 1]).nonzero(as_tuple=False).view(-1)
# if we filter non overlap bounding boxes
if cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
ious = boxlist_iou(proposals[0], proposals[0]).view(-1)
ious = ious[keep_idx]
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
# proposal_idx_pairs_per_image = proposal_idx_pairs_per_image[keep_idx]
proposal_box_pairs_per_image = proposal_box_pairs_per_image[keep_idx]
proposal_box_pairs.append(proposal_box_pairs_per_image)
@ -361,7 +361,7 @@ class FastRCNNLossComputation(object):
labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0)
# import pdb; pdb.set_trace()
rel_fg_cnt = len(labels.nonzero())
rel_fg_cnt = len(labels.nonzero(as_tuple=False))
rel_bg_cnt = labels.shape[0] - rel_fg_cnt
ce_weights = labels.new(class_logits.size(1)).fill_(1).float()
ce_weights[0] = float(rel_fg_cnt) / (rel_bg_cnt + 1e-5)

Просмотреть файл

@ -113,7 +113,7 @@ class MSDN_BASE(nn.Module):
requires_grad=True).type_as(target_features)
feature_data.append(temp)
else:
transfer_list = (select_mat.data > 0).nonzero()
transfer_list = (select_mat.data > 0).nonzero(as_tuple=False)
source_indices = Variable(transfer_list[:, 1])
target_indices = Variable(transfer_list[:, 0])
source_f = torch.index_select(source_features, 0, source_indices)
@ -122,7 +122,7 @@ class MSDN_BASE(nn.Module):
for f_id in range(target_features.size()[0]):
if select_mat[f_id, :].data.sum() > 0:
feature_indices = (transfer_list[:, 0] == f_id).nonzero()[0]
feature_indices = (transfer_list[:, 0] == f_id).nonzero(as_tuple=False)[0]
indices = Variable(feature_indices)
features = torch.index_select(transferred_features, 0,
indices).mean(0).view(-1)

Просмотреть файл

@ -93,7 +93,7 @@ class PairMatcher(object):
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
# Find highest quality match available, even if it is low, including ties
gt_pred_pairs_of_highest_quality = torch.nonzero(
match_quality_matrix == highest_quality_foreach_gt[:, None]
match_quality_matrix == highest_quality_foreach_gt[:, None], as_tuple=False
)
# Example gt_pred_pairs_of_highest_quality:
# tensor([[ 0, 39796],

Просмотреть файл

@ -98,13 +98,13 @@ class ROIRelationHead(torch.nn.Module):
proposal_label_pairs = torch.cat(
(label_subj.view(-1, 1), label_obj.view(-1, 1)), 1)
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero().view(-1)
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False).view(-1)
# if we filter non overlap bounding boxes
if self.cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
ious = boxlist_iou(proposals_per_image, proposals_per_image).view(-1)
ious = ious[keep_idx]
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
proposal_idx_pairs = proposal_idx_pairs[keep_idx]
proposal_box_pairs = proposal_box_pairs[keep_idx]
proposal_label_pairs = proposal_label_pairs[keep_idx]

Просмотреть файл

@ -46,7 +46,7 @@ class RelPN(nn.Module):
match_ij = ((match_i + match_j) / 2)
# remove duplicate index
match_ij = match_ij.view(-1) # [::match_quality_matrix.shape[1]] = 0
# non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero().view(-1).to(match_ij.device)
# non_duplicate_idx = (torch.eye(match_ij.shape[0]).view(-1) == 0).nonzero(as_tuple=False).view(-1).to(match_ij.device)
# match_ij = match_ij[non_duplicate_idx]
temp.append(match_ij)
boxi = target.bbox[i]; boxj = target.bbox[j]
@ -68,7 +68,7 @@ class RelPN(nn.Module):
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposal.bbox.device)
proposal_idx_pairs = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
# non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero()
# non_duplicate_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False)
# proposal_box_pairs = proposal_box_pairs[non_duplicate_idx.view(-1)]
# proposal_idx_pairs = proposal_idx_pairs[non_duplicate_idx.view(-1)]
@ -184,13 +184,13 @@ class RelPN(nn.Module):
idx_obj = torch.arange(box_obj.shape[0]).view(1, -1, 1).repeat(box_subj.shape[0], 1, 1).to(proposals_per_image.bbox.device)
proposal_idx_pairs = torch.cat((idx_subj.view(-1, 1), idx_obj.view(-1, 1)), 1)
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero().view(-1)
keep_idx = (proposal_idx_pairs[:, 0] != proposal_idx_pairs[:, 1]).nonzero(as_tuple=False).view(-1)
# if we filter non overlap bounding boxes
if self.cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
ious = boxlist_iou(proposals_per_image, proposals_per_image).view(-1)
ious = ious[keep_idx]
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
proposal_idx_pairs = proposal_idx_pairs[keep_idx]
proposal_box_pairs = proposal_box_pairs[keep_idx]
proposal_pairs_per_image = BoxPairList(proposal_box_pairs, proposals_per_image.size, proposals_per_image.mode)
@ -212,11 +212,11 @@ class RelPN(nn.Module):
obj_logits = proposals_per_image.get_field('scores_all')
obj_bboxes = proposals_per_image.bbox
relness = self.relationshipness(obj_logits, obj_bboxes, proposals_per_image.size)
keep_idx = (1 - torch.eye(obj_logits.shape[0]).to(relness.device)).view(-1).nonzero().view(-1)
keep_idx = (1 - torch.eye(obj_logits.shape[0]).to(relness.device)).view(-1).nonzero(as_tuple=False).view(-1)
if self.cfg.MODEL.ROI_RELATION_HEAD.FILTER_NON_OVERLAP:
ious = boxlist_iou(proposals_per_image, proposals_per_image).view(-1)
ious = ious[keep_idx]
keep_idx = keep_idx[(ious > 0).nonzero().view(-1)]
keep_idx = keep_idx[(ious > 0).nonzero(as_tuple=False).view(-1)]
relness = relness.view(-1)[keep_idx]
relness_sorted, order = torch.sort(relness.view(-1), descending=True)
@ -266,7 +266,7 @@ class RelPN(nn.Module):
proposals = self._proposal_pairs
labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0)
rel_fg_cnt = len(labels.nonzero())
rel_fg_cnt = len(labels.nonzero(as_tuple=False))
rel_bg_cnt = labels.shape[0] - rel_fg_cnt
ce_weights = labels.new(class_logits.size(1)).fill_(1).float()
ce_weights[0] = float(rel_fg_cnt) / (rel_bg_cnt + 1e-5)

Просмотреть файл

@ -1,6 +1,6 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
WEIGHT: "/home/xiaothan/c/Users/xiaothan/Downloads/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
WEIGHT: "pretrained_models/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
BACKBONE:
CONV_BODY: "R-152-FPN"
RESNETS:

Просмотреть файл

@ -0,0 +1,70 @@
MODEL:
META_ARCHITECTURE: "AttrRCNN"
WEIGHT: "/mnt/model_storage/msvit/IN22kpretrained/deepbase_relative/model_best.pth"
BACKBONE:
CONV_BODY: "ViL-C4"
TRANSFORMER:
DROP: 0.0
DROP_PATH: 0.3
NORM_EMBED: True
OUT_FEATURES: ["layer3"]
VITHEADARCH: "l4,h12,d768,n1,s0,g0,p2,f7,a0"
MSVIT:
ARCH: "l1,h3,d96,n1,s1,g1,p4,f7,a0_l2,h3,d192,n8,s1,g1,p2,f7,a0_l3,h6,d384,n24,s1,g1,p2,f7,a0"
ATTN_TYPE: longformerhand
ONLY_GLOBAL: False
SHARE_KV: True
SHARE_W: True
SW_EXACT: 0
RPN:
PRE_NMS_TOP_N_TEST: 6000
POST_NMS_TOP_N_TEST: 300
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 384 # 512
POSITIVE_FRACTION: 0.5 # 0.25
SCORE_THRESH: 0.05 # 0.0001
DETECTIONS_PER_IMG: 100 # 600
MIN_DETECTIONS_PER_IMG: 10
ROI_BOX_HEAD:
NUM_CLASSES: 1595
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ROI_ATTRIBUTE_HEAD:
NUM_ATTRIBUTES: 525
POSTPROCESS_ATTRIBUTES_THRESHOLD: 0.0
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ATTRIBUTE_ON: False
INPUT:
MIN_SIZE_TEST: 600
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:
BASE_LR: 0.00008
WEIGHT_DECAY: 0.05
STEPS: (75000, 100000)
MAX_ITER: 170000
IMS_PER_BATCH: 1
CHECKPOINT_PERIOD: 5000
OPTIMIZER: "ADAMW"
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
TEST:
IMS_PER_BATCH: 1
SKIP_PERFORMANCE_EVAL: False
SAVE_PREDICTIONS: True
SAVE_RESULTS_TO_TSV: True
TSV_SAVE_SUBSET: ['rect', 'class', 'conf']
GATHER_ON_CPU: False
OUTPUT_DIR: "./output/vilc4_test"
DATA_DIR: "./datasets"
DISTRIBUTED_BACKEND: 'nccl'

Просмотреть файл

@ -0,0 +1,70 @@
MODEL:
META_ARCHITECTURE: "AttrRCNN"
WEIGHT: "/mnt/model_storage/msvit/IN22kpretrained/villarge_relative/model_best.pth"
BACKBONE:
CONV_BODY: "ViL-C4"
TRANSFORMER:
DROP: 0.0
DROP_PATH: 0.5
NORM_EMBED: True
OUT_FEATURES: ["layer3"]
VITHEADARCH: "l4,h24,d1536,n1,s0,g0,p2,f7,a0"
MSVIT:
ARCH: "l1,h3,d192,n1,s1,g1,p4,f7,a0_l2,h6,d384,n8,s1,g1,p2,f7,a0_l3,h12,d768,n24,s1,g1,p2,f7,a0"
ATTN_TYPE: longformerhand
ONLY_GLOBAL: False
SHARE_KV: True
SHARE_W: True
SW_EXACT: 0
RPN:
PRE_NMS_TOP_N_TEST: 6000
POST_NMS_TOP_N_TEST: 300
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 384 # 512
POSITIVE_FRACTION: 0.5 # 0.25
SCORE_THRESH: 0.05 # 0.0001
DETECTIONS_PER_IMG: 100 # 600
MIN_DETECTIONS_PER_IMG: 10
ROI_BOX_HEAD:
NUM_CLASSES: 1595
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ROI_ATTRIBUTE_HEAD:
NUM_ATTRIBUTES: 525
POSTPROCESS_ATTRIBUTES_THRESHOLD: 0.0
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ATTRIBUTE_ON: False
INPUT:
MIN_SIZE_TEST: 600
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:
BASE_LR: 0.00008
WEIGHT_DECAY: 0.05
STEPS: (75000, 100000)
MAX_ITER: 170000
IMS_PER_BATCH: 1
CHECKPOINT_PERIOD: 5000
OPTIMIZER: "ADAMW"
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
TEST:
IMS_PER_BATCH: 1
SKIP_PERFORMANCE_EVAL: False
SAVE_PREDICTIONS: True
SAVE_RESULTS_TO_TSV: True
TSV_SAVE_SUBSET: ['rect', 'class', 'conf']
GATHER_ON_CPU: False
OUTPUT_DIR: "./output/vilc4_test"
DATA_DIR: "./datasets"
DISTRIBUTED_BACKEND: 'nccl'

Просмотреть файл

@ -0,0 +1,70 @@
MODEL:
META_ARCHITECTURE: "AttrRCNN"
WEIGHT: "/mnt/model_storage/msvit/visionlongformer/longtiny1191_ape0_exact0_nglo1_mode1_swith075/model_best.pth"
BACKBONE:
CONV_BODY: "ViL-C4"
TRANSFORMER:
DROP: 0.0
DROP_PATH: 0.1
NORM_EMBED: True
OUT_FEATURES: ["layer3"]
VITHEADARCH: "l4,h12,d768,n1,s0,g0,p2,f7,a0"
MSVIT:
ARCH: "l1,h3,d96,n1,s1,g1,p4,f7,a0_l2,h3,d192,n2,s1,g1,p2,f7,a0_l3,h6,d384,n8,s1,g1,p2,f7,a0"
ATTN_TYPE: longformerhand
ONLY_GLOBAL: False
SHARE_KV: True
SHARE_W: True
SW_EXACT: 0
RPN:
PRE_NMS_TOP_N_TEST: 6000
POST_NMS_TOP_N_TEST: 300
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 384 # 512
POSITIVE_FRACTION: 0.5 # 0.25
SCORE_THRESH: 0.05 # 0.0001
DETECTIONS_PER_IMG: 100 # 600
MIN_DETECTIONS_PER_IMG: 10
ROI_BOX_HEAD:
NUM_CLASSES: 1595
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ROI_ATTRIBUTE_HEAD:
NUM_ATTRIBUTES: 525
POSTPROCESS_ATTRIBUTES_THRESHOLD: 0.0
FEATURE_EXTRACTOR: "ViTHeadFeatureExtractor"
ATTRIBUTE_ON: False
INPUT:
MIN_SIZE_TEST: 600
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:
BASE_LR: 0.0001
WEIGHT_DECAY: 0.05
STEPS: (75000, 100000)
MAX_ITER: 170000
IMS_PER_BATCH: 1
CHECKPOINT_PERIOD: 5000
OPTIMIZER: "ADAMW"
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
TEST:
IMS_PER_BATCH: 1
SKIP_PERFORMANCE_EVAL: False
SAVE_PREDICTIONS: True
SAVE_RESULTS_TO_TSV: True
TSV_SAVE_SUBSET: ['rect', 'class', 'conf']
GATHER_ON_CPU: False
OUTPUT_DIR: "./output/vilc4_test"
DATA_DIR: "./datasets"
DISTRIBUTED_BACKEND: 'nccl'

Просмотреть файл

@ -28,9 +28,13 @@ INPUT:
MAX_SIZE_TEST: 1000
PIXEL_MEAN: [103.530, 116.280, 123.675]
DATASETS:
FACTORY_TEST: ("ODTSVDataset",)
TEST: ("flickr30k/tsv/flickr30k.yaml",)
# FACTORY_TEST: ("ODTSVDataset",)
# TEST: ("flickr30k/tsv/flickr30k.yaml",)
LABELMAP_FILE: "visualgenome/VG-SGG-dicts-vgoi6-clipped.json"
TRAIN: ("visualgenome/train_vgoi6_clipped.yaml",)
TEST: ("visualgenome/test_vgoi6_clipped.yaml",)
FACTORY_TRAIN: ("VGTSVDataset",)
FACTORY_TEST: ("VGTSVDataset",)
DATALOADER:
NUM_WORKERS: 0
SOLVER:

Просмотреть файл

@ -1,7 +1,6 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
WEIGHT: "pretrained_model/RX152FPN_reldn_oi_best.pth"
# WEIGHT: "/home/xiaothan/c/Users/xiaothan/Downloads/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
USE_FREQ_PRIOR: False
FREQ_PRIOR: "openimages_v5c/vrd/vrd_frequency_prior_include_background.npy"
BACKBONE:

Просмотреть файл

@ -1,7 +1,6 @@
MODEL:
META_ARCHITECTURE: "SceneParser"
WEIGHT: "pretrained_model/RX152FPN_reldn_oi_best.pth"
# WEIGHT: "/home/xiaothan/c/Users/xiaothan/Downloads/frcnn_x152fpn_4sets.yaml_pos0.5_lr0.005_bsz16.pth"
USE_FREQ_PRIOR: False
FREQ_PRIOR: "openimages_v5c/vrd/vrd_frequency_prior_include_background.npy"
BACKBONE:

Просмотреть файл

@ -18,12 +18,6 @@ from maskrcnn_benchmark.utils.comm import synchronize, get_rank
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.miscellaneous import mkdir
# Check if we can enable mixed-precision via apex.amp
try:
from apex import amp
except ImportError:
raise ImportError('Use APEX for mixed precision via apex.amp')
def main():
parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference")
@ -73,10 +67,6 @@ def main():
model = build_detection_model(cfg)
model.to(cfg.MODEL.DEVICE)
# Initialize mixed-precision if necessary
use_mixed_precision = cfg.DTYPE == 'float16'
amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)
output_dir = cfg.OUTPUT_DIR
checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt

Просмотреть файл

@ -6,6 +6,7 @@ from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:sk
import argparse
import os
import json
import torch
from maskrcnn_benchmark.config import cfg
@ -21,11 +22,112 @@ from maskrcnn_benchmark.utils.comm import synchronize, get_rank
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.miscellaneous import mkdir
# Check if we can enable mixed-precision via apex.amp
# try:
# from apex import amp
# except ImportError:
# raise ImportError('Use APEX for mixed precision via apex.amp')
def run_test(cfg, model, distributed, model_name):
if distributed and hasattr(model, 'module'):
model = model.module
torch.cuda.empty_cache() # TODO check if it helps
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
if len(dataset_names) == 1:
output_folder = os.path.join(
cfg.OUTPUT_DIR, "inference",
os.path.splitext(model_name)[0]
)
mkdir(output_folder)
output_folders = [output_folder]
else:
for idx, dataset_name in enumerate(dataset_names):
dataset_name1 = dataset_name.replace('/', '_')
output_folder = os.path.join(
cfg.OUTPUT_DIR, "inference",
dataset_name1,
os.path.splitext(model_name)[0]
)
mkdir(output_folder)
output_folders[idx] = output_folder
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
labelmap_file = config_dataset_file(cfg.DATA_DIR, cfg.DATASETS.LABELMAP_FILE)
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
results = inference(
model,
cfg,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
bbox_aug=cfg.TEST.BBOX_AUG.ENABLED,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
skip_performance_eval=cfg.TEST.SKIP_PERFORMANCE_EVAL,
labelmap_file=labelmap_file,
save_predictions=cfg.TEST.SAVE_PREDICTIONS,
)
# renaming box_proposals metric to rpn_proposals if RPN_ONLY is True
if results and 'box_proposal' in results and cfg.MODEL.RPN_ONLY:
results['rpn_proposal'] = results.pop('box_proposal')
if results and output_folder:
results_path = os.path.join(output_folder, "results.json")
# checking if this file already exists and only updating tasks
# that are already present. This is useful for including
# e.g. RPN_ONLY metrics
if os.path.isfile(results_path):
with open(results_path, 'rt') as fin:
old_results = json.load(fin)
old_results.update(results)
results = old_results
with open(results_path, 'wt') as fout:
json.dump(results, fout)
synchronize()
# evaluate attribute detection
if not cfg.MODEL.RPN_ONLY and cfg.MODEL.ATTRIBUTE_ON and (not cfg.TEST.SKIP_PERFORMANCE_EVAL):
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
for output_folder, dataset_name, data_loader_val in zip(
output_folders, dataset_names, data_loaders_val
):
results_attr = inference(
model,
cfg,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
skip_performance_eval=cfg.TEST.SKIP_PERFORMANCE_EVAL,
labelmap_file=labelmap_file,
save_predictions=cfg.TEST.SAVE_PREDICTIONS,
eval_attributes=True,
)
if results_attr and output_folder:
results_path = os.path.join(output_folder, "results.json")
# checking if this file already exists and only updating tasks
# that are already present. This is useful for including
# e.g. RPN_ONLY metrics
if os.path.isfile(results_path):
with open(results_path, 'rt') as fin:
old_results = json.load(fin)
old_results.update(results_attr)
results_attr = old_results
with open(results_path, 'wt') as fout:
json.dump(results_attr, fout)
synchronize()
def main():
@ -52,7 +154,7 @@ def main():
args = parser.parse_args()
num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
distributed = num_gpus > 1
args.distributed = num_gpus > 1
cfg.set_new_allowed(True)
cfg.merge_from_other_cfg(sg_cfg)
@ -61,7 +163,7 @@ def main():
cfg.merge_from_list(args.opts)
cfg.freeze()
if distributed:
if args.distributed:
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(
backend=cfg.DISTRIBUTED_BACKEND, init_method="env://"
@ -82,47 +184,13 @@ def main():
model = AttrRCNN(cfg)
model.to(cfg.MODEL.DEVICE)
# Initialize mixed-precision if necessary
# use_mixed_precision = cfg.DTYPE == 'float16'
# amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)
output_dir = cfg.OUTPUT_DIR
checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
ckpt = cfg.MODEL.WEIGHT if args.ckpt is None else args.ckpt
_ = checkpointer.load(ckpt, use_latest=args.ckpt is None)
model_name = os.path.basename(ckpt)
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
for idx, dataset_name in enumerate(dataset_names):
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
mkdir(output_folder)
output_folders[idx] = output_folder
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
labelmap_file = config_dataset_file(cfg.DATA_DIR, cfg.DATASETS.LABELMAP_FILE)
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
inference(
model,
cfg,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
bbox_aug=cfg.TEST.BBOX_AUG.ENABLED,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
skip_performance_eval=cfg.TEST.SKIP_PERFORMANCE_EVAL,
labelmap_file=labelmap_file,
save_predictions=cfg.TEST.SAVE_PREDICTIONS,
)
synchronize()
run_test(cfg, model, args.distributed, model_name)
if __name__ == "__main__":

Просмотреть файл

@ -15,12 +15,14 @@ import torch
from maskrcnn_benchmark.config import cfg
from scene_graph_benchmark.config import sg_cfg
from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.data.datasets.utils.load_files import config_dataset_file
from maskrcnn_benchmark.solver import make_lr_scheduler
from maskrcnn_benchmark.solver import make_optimizer
from maskrcnn_benchmark.solver import make_optimizer, make_optimizer_d2
from maskrcnn_benchmark.engine.inference import inference
from maskrcnn_benchmark.engine.trainer import do_train
from maskrcnn_benchmark.modeling.detector import build_detection_model
from scene_graph_benchmark.scene_parser import SceneParser
from scene_graph_benchmark.AttrRCNN import AttrRCNN
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.utils.collect_env import collect_env_info
from maskrcnn_benchmark.utils.comm import synchronize, get_rank
@ -28,13 +30,7 @@ from maskrcnn_benchmark.utils.imports import import_file
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
from maskrcnn_benchmark.utils.miscellaneous import mkdir, save_config
# See if we can use apex.DistributedDataParallel instead of the torch default,
# and enable mixed-precision via apex.amp
try:
from apex import amp
except ImportError:
raise ImportError('Use APEX for multi-precision via apex.amp')
from tools.test_sg_net import run_test
import random
import numpy as np
@ -50,23 +46,24 @@ torch.backends.cudnn.deterministic = True
def train(cfg, local_rank, distributed):
model = SceneParser(cfg)
if cfg.MODEL.META_ARCHITECTURE == "SceneParser":
model = SceneParser(cfg)
elif cfg.MODEL.META_ARCHITECTURE == "AttrRCNN":
model = AttrRCNN(cfg)
device = torch.device(cfg.MODEL.DEVICE)
model.to(device)
optimizer = make_optimizer(cfg, model)
if cfg.MODEL.BACKBONE.CONV_BODY.startswith("ViL"):
optimizer = make_optimizer_d2(cfg, model)
else:
optimizer = make_optimizer(cfg, model)
scheduler = make_lr_scheduler(cfg, optimizer)
# # Initialize mixed-precision training
# use_mixed_precision = cfg.DTYPE == "float16"
# amp_opt_level = 'O1' if use_mixed_precision else 'O0'
# model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)
if distributed:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[local_rank], output_device=local_rank,
# this should be removed if we update BatchNorm stats
broadcast_buffers=False,
broadcast_buffers=False, find_unused_parameters=True
)
arguments = {}
@ -116,39 +113,6 @@ def train(cfg, local_rank, distributed):
return model
def run_test(cfg, model, distributed):
if distributed:
model = model.module
torch.cuda.empty_cache() # TODO check if it helps
iou_types = ("bbox",)
if cfg.MODEL.MASK_ON:
iou_types = iou_types + ("segm",)
if cfg.MODEL.KEYPOINT_ON:
iou_types = iou_types + ("keypoints",)
output_folders = [None] * len(cfg.DATASETS.TEST)
dataset_names = cfg.DATASETS.TEST
if cfg.OUTPUT_DIR:
for idx, dataset_name in enumerate(dataset_names):
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
mkdir(output_folder)
output_folders[idx] = output_folder
data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
inference(
model,
data_loader_val,
dataset_name=dataset_name,
iou_types=iou_types,
box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
bbox_aug=cfg.TEST.BBOX_AUG.ENABLED,
device=cfg.MODEL.DEVICE,
expected_results=cfg.TEST.EXPECTED_RESULTS,
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=output_folder,
)
synchronize()
def main():
parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
parser.add_argument(
@ -216,7 +180,7 @@ def main():
model = train(cfg, args.local_rank, args.distributed)
if not args.skip_test:
run_test(cfg, model, args.distributed)
run_test(cfg, model, args.distributed, model_name="model_final")
if __name__ == "__main__":