import time import torch import math import torch.nn as nn import torch.nn.functional as F from torch.autograd import Variable from utils import * def build_targets(pred_corners, target, num_keypoints, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, sil_thresh, seen): nB = target.size(0) nA = num_anchors nC = num_classes conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale coord_mask = torch.zeros(nB, nA, nH, nW) cls_mask = torch.zeros(nB, nA, nH, nW) txs = list() tys = list() for i in range(num_keypoints): txs.append(torch.zeros(nB, nA, nH, nW)) tys.append(torch.zeros(nB, nA, nH, nW)) tconf = torch.zeros(nB, nA, nH, nW) tcls = torch.zeros(nB, nA, nH, nW) num_labels = 2 * num_keypoints + 3 # +2 for width, height and +1 for class within label files nAnchors = nA*nH*nW nPixels = nH*nW for b in range(nB): cur_pred_corners = pred_corners[b*nAnchors:(b+1)*nAnchors].t() cur_confs = torch.zeros(nAnchors) for t in range(50): if target[b][t*num_labels+1] == 0: break g = list() for i in range(num_keypoints): g.append(target[b][t*num_labels+2*i+1]) g.append(target[b][t*num_labels+2*i+2]) cur_gt_corners = torch.FloatTensor(g).repeat(nAnchors,1).t() # 16 x nAnchors cur_confs = torch.max(cur_confs, corner_confidences(cur_pred_corners, cur_gt_corners)).view_as(conf_mask[b]) # some irrelevant areas are filtered, in the same grid multiple anchor boxes might exceed the threshold conf_mask[b][cur_confs>sil_thresh] = 0 nGT = 0 nCorrect = 0 for b in range(nB): for t in range(50): if target[b][t*num_labels+1] == 0: break # Get gt box for the current label nGT = nGT + 1 gx = list() gy = list() gt_box = list() for i in range(num_keypoints): gt_box.extend([target[b][t*num_labels+2*i+1], target[b][t*num_labels+2*i+2]]) gx.append(target[b][t*num_labels+2*i+1] * nW) gy.append(target[b][t*num_labels+2*i+2] * nH) if i == 0: gi0 = int(gx[i]) gj0 = int(gy[i]) # Update masks best_n = 0 # 1 anchor box pred_box = pred_corners[b*nAnchors+best_n*nPixels+gj0*nW+gi0] conf = corner_confidence(gt_box, pred_box) coord_mask[b][best_n][gj0][gi0] = 1 cls_mask[b][best_n][gj0][gi0] = 1 conf_mask[b][best_n][gj0][gi0] = object_scale # Update targets for i in range(num_keypoints): txs[i][b][best_n][gj0][gi0] = gx[i]- gi0 tys[i][b][best_n][gj0][gi0] = gy[i]- gj0 tconf[b][best_n][gj0][gi0] = conf tcls[b][best_n][gj0][gi0] = target[b][t*num_labels] # Update recall during training if conf > 0.5: nCorrect = nCorrect + 1 return nGT, nCorrect, coord_mask, conf_mask, cls_mask, txs, tys, tconf, tcls class RegionLoss(nn.Module): def __init__(self, num_keypoints=9, num_classes=1, anchors=[], num_anchors=1, pretrain_num_epochs=15): # Define the loss layer super(RegionLoss, self).__init__() self.num_classes = num_classes self.num_anchors = num_anchors # for single object pose estimation, there is only 1 trivial predictor (anchor) self.num_keypoints = num_keypoints self.coord_scale = 1 self.noobject_scale = 1 self.object_scale = 5 self.class_scale = 1 self.thresh = 0.6 self.seen = 0 self.pretrain_num_epochs = pretrain_num_epochs def forward(self, output, target, epoch): # Parameters t0 = time.time() nB = output.data.size(0) nA = self.num_anchors nC = self.num_classes nH = output.data.size(2) nW = output.data.size(3) num_keypoints = self.num_keypoints # Activation output = output.view(nB, nA, (num_keypoints*2+1+nC), nH, nW) x = list() y = list() x.append(torch.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW))) y.append(torch.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW))) for i in range(1,num_keypoints): x.append(output.index_select(2, Variable(torch.cuda.LongTensor([2 * i + 0]))).view(nB, nA, nH, nW)) y.append(output.index_select(2, Variable(torch.cuda.LongTensor([2 * i + 1]))).view(nB, nA, nH, nW)) conf = torch.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([2 * num_keypoints]))).view(nB, nA, nH, nW)) cls = output.index_select(2, Variable(torch.linspace(2*num_keypoints+1,2*num_keypoints+1+nC-1,nC).long().cuda())) cls = cls.view(nB*nA, nC, nH*nW).transpose(1,2).contiguous().view(nB*nA*nH*nW, nC) t1 = time.time() # Create pred boxes pred_corners = torch.cuda.FloatTensor(2*num_keypoints, nB*nA*nH*nW) grid_x = torch.linspace(0, nW-1, nW).repeat(nH,1).repeat(nB*nA, 1, 1).view(nB*nA*nH*nW).cuda() grid_y = torch.linspace(0, nH-1, nH).repeat(nW,1).t().repeat(nB*nA, 1, 1).view(nB*nA*nH*nW).cuda() for i in range(num_keypoints): pred_corners[2 * i + 0] = (x[i].data.view_as(grid_x) + grid_x) / nW pred_corners[2 * i + 1] = (y[i].data.view_as(grid_y) + grid_y) / nH gpu_matrix = pred_corners.transpose(0,1).contiguous().view(-1,2*num_keypoints) pred_corners = convert2cpu(gpu_matrix) t2 = time.time() # Build targets nGT, nCorrect, coord_mask, conf_mask, cls_mask, txs, tys, tconf, tcls = \ build_targets(pred_corners, target.data, num_keypoints, nA, nC, nH, nW, self.noobject_scale, self.object_scale, self.thresh, self.seen) cls_mask = (cls_mask == 1) nProposals = int((conf > 0.25).sum().data[0]) for i in range(num_keypoints): txs[i] = Variable(txs[i].cuda()) tys[i] = Variable(tys[i].cuda()) tconf = Variable(tconf.cuda()) tcls = Variable(tcls[cls_mask].long().cuda()) coord_mask = Variable(coord_mask.cuda()) conf_mask = Variable(conf_mask.cuda().sqrt()) cls_mask = Variable(cls_mask.view(-1, 1).repeat(1,nC).cuda()) cls = cls[cls_mask].view(-1, nC) t3 = time.time() # Create loss loss_xs = list() loss_ys = list() for i in range(num_keypoints): loss_xs.append(self.coord_scale * nn.MSELoss(size_average=False)(x[i]*coord_mask, txs[i]*coord_mask)/2.0) loss_ys.append(self.coord_scale * nn.MSELoss(size_average=False)(y[i]*coord_mask, tys[i]*coord_mask)/2.0) loss_conf = nn.MSELoss(size_average=False)(conf*conf_mask, tconf*conf_mask)/2.0 loss_x = np.sum(loss_xs) loss_y = np.sum(loss_ys) if epoch > self.pretrain_num_epochs: loss = loss_x + loss_y + loss_conf # in single object pose estimation, there is no classification loss else: # pretrain initially without confidence loss # once the coordinate predictions get better, start training for confidence as well loss = loss_x + loss_y t4 = time.time() if False: print('-----------------------------------') print(' activation : %f' % (t1 - t0)) print(' create pred_corners : %f' % (t2 - t1)) print(' build targets : %f' % (t3 - t2)) print(' create loss : %f' % (t4 - t3)) print(' total : %f' % (t4 - t0)) print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, conf %f, total %f' % (self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_conf.data[0], loss.data[0])) return loss