add refinedet 320

lzx1413 · lzx1413 · commit dd7b8e15bb06 · 2018-03-24T17:39:50.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -1 +1,3 @@
 *.so
+__pycache__
+build
diff --git a/README.md b/README.md
@@ -3,6 +3,7 @@
 * SSD [SSD: Single Shot Multibox  Detector](https://arxiv.org/abs/1512.02325)
 * FSSD [FSSD: Feature Fusion Single Shot Multibox Detector](https://arxiv.org/abs/1712.00960)
 * RFB-SSD[Receptive Field Block Net for Accurate and Fast Object Detection](https://arxiv.org/abs/1711.07767)
+* RefineDet [Single-Shot Refinement Neural Network for Object Detection](https://arxiv.org/pdf/1711.06897.pdf)
 
 ### VOC2007 Test
 | System                                   |  *mAP*   | **FPS** (Titan X Maxwell) |
diff --git a/data/config.py b/data/config.py
@@ -99,3 +99,21 @@
 
     'clip' : True,
 }
+
+VOC_320 = {
+    'feature_maps' : [40, 20, 10, 5],  # four levels; 40*8 = 20*16 = 10*32 = 5*64 = 320
+
+    'min_dim' : 320,  # equals feature_maps[k] * steps[k] for every k listed here
+
+    'steps' : [8, 16, 32, 64],
+
+    'min_sizes' : [32, 64, 128, 256],  # 4 * steps[k] at each level
+
+    'max_sizes' : [],  # empty. NOTE(review): presumably consumed by PriorBox's `if self.max_sizes:` guard - confirm
+
+    'aspect_ratios' : [[2], [2], [2], [2]],
+
+    'variance' : [0.1, 0.2],  # same values as the hard-coded self.variance in RefineMultiBoxLoss
+
+    'clip' : True,
+}
diff --git a/layers/functions/detection.py b/layers/functions/detection.py
@@ -3,7 +3,7 @@
 import torch.backends.cudnn as cudnn
 from torch.autograd import Function
 from torch.autograd import Variable
-from utils.box_utils import decode, nms
+from utils.box_utils import decode, nms,center_size
 
 
 class Detect(Function):
@@ -12,15 +12,16 @@ class Detect(Function):
     scores and threshold to a top_k number of output predictions for both
     confidence score and locations.
     """
-    def __init__(self, num_classes, bkg_label, cfg):
+    def __init__(self, num_classes, bkg_label, cfg,object_score = 0):
         self.num_classes = num_classes
         self.background_label = bkg_label
+        self.object_score = object_score
         #self.thresh = thresh
 
         # Parameters used in nms.
         self.variance = cfg['variance']
 
-    def forward(self, predictions, prior):
+    def forward(self, predictions, prior,arm_data = None):
         """
         Args:
             loc_data: (tensor) Loc preds from loc layers
@@ -32,28 +33,39 @@ def forward(self, predictions, prior):
         """
 
         loc, conf = predictions
-
         loc_data = loc.data
         conf_data = conf.data
         prior_data = prior.data
         num = loc_data.size(0)  # batch size
+        if arm_data:
+            arm_loc,arm_conf = arm_data
+            arm_loc_data = arm_loc.data
+            arm_conf_data = arm_conf.data
+            arm_object_conf = arm_conf_data[:,1:]
+            no_object_index = arm_object_conf<=self.object_score
+            conf_data[no_object_index.expand_as(conf_data)] = 0
+
         self.num_priors = prior_data.size(0)
-        self.boxes = torch.zeros(1, self.num_priors, 4)
-        self.scores = torch.zeros(1, self.num_priors, self.num_classes)
+        self.boxes = torch.zeros(num, self.num_priors, 4)
+        self.scores = torch.zeros(num, self.num_priors, self.num_classes)
 
         if num == 1:
             # size batch x num_classes x num_priors
             conf_preds = conf_data.unsqueeze(0)
 
         else:
-            conf_preds = conf_data.view(num, num_priors,
+            conf_preds = conf_data.view(num, self.num_priors,
                                         self.num_classes)
-            self.boxes.expand_(num, self.num_priors, 4)
-            self.scores.expand_(num, self.num_priors, self.num_classes)
-
+            self.boxes.expand(num, self.num_priors, 4)
+            self.scores.expand(num, self.num_priors, self.num_classes)
         # Decode predictions into bboxes.
         for i in range(num):
-            decoded_boxes = decode(loc_data[i], prior_data, self.variance)
+            if arm_data:
+                default = decode(arm_loc_data[i],prior_data,self.variance)
+                default = center_size(default)
+            else:
+                default = prior_data
+            decoded_boxes = decode(loc_data[i], default, self.variance)
             # For each class, perform nms
             conf_scores = conf_preds[i].clone()
             '''
diff --git a/layers/functions/prior_box.py b/layers/functions/prior_box.py
@@ -45,8 +45,9 @@ def forward(self):
 
                 # aspect_ratio: 1
                 # rel size: sqrt(s_k * s_(k+1))
-                s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size))
-                mean += [cx, cy, s_k_prime, s_k_prime]
+                if self.max_sizes:
+                    s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size))
+                    mean += [cx, cy, s_k_prime, s_k_prime]
 
                 # rest of aspect ratios
                 for ar in self.aspect_ratios[k]:
diff --git a/layers/modules/__init__.py b/layers/modules/__init__.py
@@ -1,4 +1,5 @@
 from .multibox_loss import MultiBoxLoss
+from .refine_multibox_loss import RefineMultiBoxLoss
 from .l2norm import L2Norm
 
 __all__ = ['MultiBoxLoss','L2Norm']
diff --git a/layers/modules/refine_multibox_loss.py b/layers/modules/refine_multibox_loss.py
@@ -0,0 +1,131 @@
+# coding=utf-8
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+from utils.box_utils import match,refine_match, log_sum_exp,decode
+GPU = False  # module-level flag; RefineMultiBoxLoss.forward reads it to move targets to CUDA
+if torch.cuda.is_available():
+    GPU = True
+    torch.set_default_tensor_type('torch.cuda.FloatTensor')  # import-time side effect: changes torch's global default tensor type
+
+
+class RefineMultiBoxLoss(nn.Module):
+    """SSD-style multibox loss, optionally matched against ARM-refined anchors.
+    Compute Targets:
+        1) Produce confidence target indices by matching ground truth boxes
+           with (default) 'priorboxes' whose jaccard index > `overlap_thresh`;
+           when `arm_data` is given, `refine_match` is used with the ARM loc preds.
+        2) Produce localization target by 'encoding' variance into offsets of ground
+           truth boxes and their matched 'priorboxes'.
+        3) Hard negative mining to filter the excessive number of negative examples
+           that comes with using a large number of default bounding boxes
+           (negative:positive ratio is `neg_pos`).
+    Objective Loss:
+        L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
+        Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss
+        weighted by α which is set to 1 by cross val.
+        c: class confidences, l: predicted boxes, g: ground truth boxes,
+        N: number of matched default boxes.
+        `forward` returns Lloc / N and Lconf / N separately; it does not sum them.
+        See: https://arxiv.org/pdf/1512.02325.pdf for more details.
+    Constructor args read by `forward`: num_classes, overlap_thresh, neg_pos,
+        object_score. The remaining ones (prior_for_matching, bkg_label, neg_mining,
+        neg_overlap, encode_target) are only stored on the instance here.
+    """
+
+    def __init__(self, num_classes,overlap_thresh,prior_for_matching,bkg_label,neg_mining,neg_pos,neg_overlap,encode_target,object_score = 0):
+        super(RefineMultiBoxLoss, self).__init__()
+        self.num_classes = num_classes
+        self.threshold = overlap_thresh  # matching threshold handed to match / refine_match
+        self.background_label = bkg_label  # stored only; not read in this class
+        self.encode_target = encode_target  # stored only; not read in this class
+        self.use_prior_for_matching  = prior_for_matching  # stored only; not read in this class
+        self.do_neg_mining = neg_mining  # stored only; the mining in forward always runs
+        self.negpos_ratio = neg_pos  # negatives kept per positive during mining
+        self.neg_overlap = neg_overlap  # stored only; not read in this class
+        self.object_score = object_score  # ARM score cut-off, used when filter_object is set
+        self.variance = [0.1,0.2]  # hard-coded; handed to match / refine_match
+
+    def forward(self, odm_data,priors, targets,arm_data = None,filter_object = False):
+        """Compute the localization and confidence losses for one batch.
+        Args:
+            odm_data (tuple): (loc, conf) predictions that are scored against the
+                matched targets; per the original notes their shapes are
+                conf shape: torch.size(batch_size,num_priors,num_classes)
+                loc shape: torch.size(batch_size,num_priors,4)
+            priors: prior boxes, shape: torch.size(num_priors,4); `.data` is read.
+            targets: ground truth boxes and labels for a batch,
+                shape: [batch_size,num_objs,5] (last idx is the label).
+            arm_data (tuple): optional ARM branch output (arm_loc, arm_conf).
+            filter_object: if True (and arm_data is given), drop positives whose
+                arm_conf[:, :, 1] is <= self.object_score.
+        Returns: (loss_l, loss_c), each divided by the number of positives (min 1).
+        """
+        loc_data,conf_data = odm_data
+        if arm_data:
+            arm_loc,arm_conf = arm_data
+        priors = priors.data
+        num = loc_data.size(0)  # batch size
+        num_priors = (priors.size(0))
+
+        # match priors (default boxes) and ground truth boxes
+        loc_t = torch.Tensor(num, num_priors, 4)  # uninitialised; presumably filled in place by match / refine_match
+        conf_t = torch.LongTensor(num, num_priors)
+        for idx in range(num):
+            truths = targets[idx][:,:-1].data  # every column but the last
+            labels = targets[idx][:,-1].data  # last column
+            # two-class (object vs. background) mode: every label > 0 becomes "object"
+            if self.num_classes == 2:
+                labels = labels > 0
+            if arm_data:
+                refine_match(self.threshold,truths,priors,self.variance,labels,loc_t,conf_t,idx,arm_loc[idx].data)
+            else:
+                match(self.threshold,truths,priors,self.variance,labels,loc_t,conf_t,idx)
+        if GPU:
+            loc_t = loc_t.cuda()
+            conf_t = conf_t.cuda()
+        # wrap targets
+        loc_t = Variable(loc_t, requires_grad=False)
+        conf_t = Variable(conf_t,requires_grad=False)
+        if arm_data and filter_object:
+            arm_conf_data = arm_conf.data[:,:,1]
+            pos = conf_t > 0
+            object_score_index = arm_conf_data <= self.object_score
+            pos[object_score_index] = 0  # priors the ARM scores at or below the cut-off are not positives
+
+        else:
+            pos = conf_t > 0
+
+        # Localization Loss (Smooth L1)
+        # Shape: [batch,num_priors,4]
+        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
+        loc_p = loc_data[pos_idx].view(-1,4)
+        loc_t = loc_t[pos_idx].view(-1,4)
+        loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False)
+
+        # Compute max conf across batch for hard negative mining
+        batch_conf = conf_data.view(-1,self.num_classes)
+        loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1,1))
+
+        # Hard Negative Mining
+        loss_c = loss_c.view(num, -1)  # reshape first so the mask below has the same shape as loss_c
+        loss_c[pos] = 0 # filter out pos boxes for now
+        _,loss_idx = loss_c.sort(1, descending=True)
+        _,idx_rank = loss_idx.sort(1)  # rank of every prior in the descending-loss order
+        num_pos = pos.long().sum(1,keepdim=True)
+        num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
+        neg = idx_rank < num_neg.expand_as(idx_rank)
+
+        # Confidence Loss Including Positive and Negative Examples
+        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
+        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
+        conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1,self.num_classes)
+        targets_weighted = conf_t[(pos+neg).gt(0)]
+        loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False)
+
+        # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
+        N = max(num_pos.data.sum(), 1)  # at least 1, so a batch with no positives gives 0 instead of NaN
+        loss_l/=N
+        loss_c/=N
+        return loss_l,loss_c
diff --git a/models/RefineSSD_vgg.py b/models/RefineSSD_vgg.py
diff --git a/refinedet_train_test.py b/refinedet_train_test.py
diff --git a/utils/box_utils.py b/utils/box_utils.py