From f85737ed6a68dc0548dd986d3a93549089940a72 Mon Sep 17 00:00:00 2001
From: vbvg2008
Date: Wed, 26 Jun 2019 06:04:03 -0700
Subject: [PATCH] training example for retinanet

---
 image_detection/retinanet_svhn.py | 105 ++++++++++++++++++++++++++++++
 image_detection/svhn_data.py      |  19 ++++--
 2 files changed, 118 insertions(+), 6 deletions(-)
 create mode 100644 image_detection/retinanet_svhn.py

diff --git a/image_detection/retinanet_svhn.py b/image_detection/retinanet_svhn.py
new file mode 100644
index 0000000..2facef2
--- /dev/null
+++ b/image_detection/retinanet_svhn.py
@@ -0,0 +1,105 @@
+from fastestimator.pipeline.dynamic.preprocess import AbstractPreprocessing as AbstractPreprocessingD
+from fastestimator.architecture.retinanet import RetinaNet, get_fpn_anchor_box, get_target
+from fastestimator.pipeline.dynamic.preprocess import ImageReader
+from fastestimator.pipeline.static.preprocess import Minmax
+from fastestimator.estimator.estimator import Estimator
+from fastestimator.pipeline.pipeline import Pipeline
+from fastestimator.estimator.trace import Accuracy
+import tensorflow as tf
+import numpy as np
+import svhn_data
+import cv2
+
+class Network:
+    def __init__(self):
+        self.model = RetinaNet(input_shape=(64, 64, 3), num_classes=10)
+        self.optimizer = tf.optimizers.Adam()
+        self.loss = MyLoss()
+
+    def train_op(self, batch):
+        with tf.GradientTape() as tape:
+            predictions = self.model(batch["image"])
+            loss = self.loss((batch["target_cls"], batch["target_loc"]), predictions)
+        gradients = tape.gradient(loss, self.model.trainable_variables)
+        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
+        return predictions, loss
+
+    def eval_op(self, batch):
+        predictions = self.model(batch["image"], training=False)
+        loss = self.loss((batch["target_cls"], batch["target_loc"]), predictions)
+        return predictions, loss
+
+class MyPipeline(Pipeline):
+    def edit_feature(self, feature):
+        height, width = feature["image"].shape[0], feature["image"].shape[1]
+        feature["x1"], feature["y1"], feature["x2"], feature["y2"] = feature["x1"]/width, feature["y1"]/height, feature["x2"]/width, feature["y2"]/height
+        feature["image"] = cv2.resize(feature["image"], (64, 64))
+        anchorbox = get_fpn_anchor_box(input_shape=feature["image"].shape)
+        target_cls, target_loc = get_target(anchorbox, feature["label"], feature["x1"], feature["y1"], feature["x2"], feature["y2"], num_classes=10)
+        feature["target_cls"], feature["target_loc"] = target_cls, target_loc
+        return feature
+
+class String2List(AbstractPreprocessingD):
+    # converts a string such as '[1, 2, 3]' into np.array([1, 2, 3])
+    def transform(self, data):
+        data = np.array([int(x) for x in data[1:-1].split(',')])
+        return data
+
+class MyLoss(tf.losses.Loss):
+    def call(self, y_true, y_pred):
+        cls_gt, loc_gt = tuple(y_true)
+        cls_pred, loc_pred = tuple(y_pred)
+        focal_loss, obj_idx = self.focal_loss(cls_gt, cls_pred, num_classes=10)
+        smooth_l1_loss = self.smooth_l1(loc_gt, loc_pred, obj_idx)
+        return focal_loss + smooth_l1_loss
+
+    def focal_loss(self, cls_gt, cls_pred, num_classes, alpha=0.25, gamma=2.0):
+        # cls_gt has shape [B, A], cls_pred has shape [B, A, K]
+        obj_idx = tf.where(tf.greater_equal(cls_gt, 0))  # indices of object anchors
+        obj_bg_idx = tf.where(tf.greater_equal(cls_gt, -1))  # indices of object and background anchors
+        cls_gt = tf.one_hot(cls_gt, num_classes)
+        cls_gt = tf.gather_nd(cls_gt, obj_bg_idx)
+        cls_pred = tf.gather_nd(cls_pred, obj_bg_idx)
+        # get the selected anchor count for each image in the batch
+        _, idx, count = tf.unique_with_counts(obj_bg_idx[:, 0])
+        object_count = tf.gather_nd(count, tf.reshape(idx, (-1, 1)))
+        object_count = tf.tile(tf.reshape(object_count, (-1, 1)), [1, num_classes])
+        object_count = tf.cast(object_count, tf.float32)
+        # reshape everything to a single column
+        cls_gt = tf.reshape(cls_gt, (-1, 1))
+        cls_pred = tf.reshape(cls_pred, (-1, 1))
+        object_count = tf.reshape(object_count, (-1, 1))
+        # compute the focal weight on each selected anchor box
+        alpha_factor = tf.ones_like(cls_gt) * alpha
+        alpha_factor = tf.where(tf.equal(cls_gt, 1), alpha_factor, 1 - alpha_factor)
+        focal_weight = tf.where(tf.equal(cls_gt, 1), 1 - cls_pred, cls_pred)
+        focal_weight = alpha_factor * focal_weight ** gamma / object_count
+        focal_loss = tf.losses.BinaryCrossentropy()(cls_gt, cls_pred, sample_weight=focal_weight)
+        return focal_loss, obj_idx
+
+    def smooth_l1(self, loc_gt, loc_pred, obj_idx):
+        # loc_gt and loc_pred have shape [B, A, 4]
+        loc_gt = tf.gather_nd(loc_gt, obj_idx)
+        loc_pred = tf.gather_nd(loc_pred, obj_idx)
+        loc_gt = tf.reshape(loc_gt, (-1, 1))
+        loc_pred = tf.reshape(loc_pred, (-1, 1))
+        loc_diff = tf.abs(loc_gt - loc_pred)
+        smooth_l1_loss = tf.where(tf.less(loc_diff, 1), 0.5 * loc_diff ** 2, loc_diff - 0.5)
+        smooth_l1_loss = tf.reduce_mean(smooth_l1_loss)
+        return smooth_l1_loss
+
+def get_estimator():
+    train_csv, test_csv, path = svhn_data.load_data()
+
+    pipeline = MyPipeline(batch_size=256,
+                          feature_name=["image", "label", "x1", "y1", "x2", "y2", "target_cls", "target_loc"],
+                          train_data=train_csv,
+                          validation_data=test_csv,
+                          transform_dataset=[[ImageReader(parent_path=path)], [String2List()], [String2List()], [String2List()], [String2List()], [String2List()], [], []],
+                          transform_train=[[Minmax()], [], [], [], [], [], [], []],
+                          padded_batch=True)
+
+    estimator = Estimator(network=Network(),
+                          pipeline=pipeline,
+                          epochs=10)
+    return estimator
\ No newline at end of file
diff --git a/image_detection/svhn_data.py b/image_detection/svhn_data.py
index d823d95..12dda75 100644
--- a/image_detection/svhn_data.py
+++ b/image_detection/svhn_data.py
@@ -1,6 +1,8 @@
 import os
 import tarfile
 import tempfile
+from operator import add
+
 import h5py
 import numpy as np
 import pandas as pd
@@ -16,8 +18,7 @@ def get_bbox(index, hdf5_data):
     item = hdf5_data['digitStruct']['bbox'][index].item()
     for key in ['label', 'left', 'top', 'width', 'height']:
         attr = hdf5_data[item][key]
-        values = [hdf5_data[attr.value[i].item()].value[0][0]
-                  for i in range(len(attr))] if len(attr) > 1 else [attr.value[0][0]]
+        values = [int(hdf5_data[attr.value[i].item()].value[0][0]) for i in range(len(attr))] if len(attr) > 1 else [int(attr.value[0][0])]
         attrs[key] = values
     return attrs
 
@@ -31,10 +32,15 @@
         if j % logging_interval == 0:
             print("retrieving bounding box for %s: %f%%" % (mode, j/num_example*100))
         img_name = get_name(j, f)
-        row_dict = get_bbox(j, f)
-        row_dict['img_name'] = os.path.join(mode, img_name)
+        bbox = get_bbox(j, f)
+        row_dict = {'image': os.path.join(mode, img_name),
+                    'label': bbox["label"],
+                    'x1': bbox["left"],
+                    'y1': bbox["top"],
+                    'x2': list(map(add, bbox["left"], bbox["width"])),
+                    'y2': list(map(add, bbox["top"], bbox["height"]))}
         row_list.append(row_dict)
-    bbox_df = pd.DataFrame(row_list, columns=['img_name','label','left','top','width','height'])
+    bbox_df = pd.DataFrame(row_list, columns=['image','label','x1','y1','x2','y2'])
     bbox_df.to_csv(csv_path, index=False)
     return bbox_df
 
@@ -44,7 +50,7 @@ def load_data(path=None):
     if not os.path.exists(path):
         os.mkdir(path)
     train_csv = os.path.join(path, "train_data.csv")
-    test_csv = os.path.join(path, "eval_data.csv")
+    test_csv = os.path.join(path, "test_data.csv")
     if not (os.path.exists(os.path.join(path, "train.tar.gz")) and os.path.exists(os.path.join(path, "test.tar.gz"))):
         print("downloading data to %s" % path)
         wget.download('http://ufldl.stanford.edu/housenumbers/train.tar.gz', path)
@@ -57,6 +63,7 @@
         test_file.extractall(path=path)
         train_file.extractall(path=path)
     if not (os.path.exists(train_csv) and os.path.exists(test_csv)):
+        print("constructing bounding box data...")
        train_folder = os.path.join(path, "train")
        test_folder = os.path.join(path, "test")
        img_boundingbox_data_constructor(train_folder, "train", train_csv)
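
For reference, MyLoss above implements the RetinaNet objective of Lin et al., "Focal Loss for Dense Object Detection": a focal classification term plus a smooth-L1 box regression term. The sample_weight passed to BinaryCrossentropy carries the focal modulation (alpha_factor selects alpha_t, focal_weight selects 1 - p_t), so the classification term works out to

    \mathrm{FL}(p_t) = -\alpha_t \, (1 - p_t)^{\gamma} \log(p_t),
    \qquad p_t = \begin{cases} p & \text{if } y = 1 \\ 1 - p & \text{otherwise} \end{cases}

with the paper's defaults alpha = 0.25 and gamma = 2.0, while the box term, applied only to anchors matched to an object (obj_idx), is

    \mathrm{smooth}_{L_1}(x) = \begin{cases} 0.5\,x^2 & \text{if } |x| < 1 \\ |x| - 0.5 & \text{otherwise} \end{cases}

One detail worth noting: the focal weight here is divided by each image's count of non-ignored anchors (object plus background), a slight departure from the paper, which normalizes by the number of object anchors alone.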
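
On the svhn_data.py side, the CSV now stores corner coordinates rather than the raw (left, top, width, height) attributes from digitStruct, i.e. x2 = left + width and y2 = top + height. For a digit with left=10, top=20, width=5, height=8, the row becomes x1=10, y1=20, x2=15, y2=28, which is the corner format MyPipeline.edit_feature expects when it normalizes boxes by the image size.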
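
To exercise the example end to end, a driver along the lines below should be all that is needed. This is a minimal sketch, not part of the patch; it assumes the Estimator returned by get_estimator() exposes a fit() entry point, so treat that method name as an assumption rather than something this diff defines.

    # driver.py -- minimal sketch, not part of the patch above
    from retinanet_svhn import get_estimator

    if __name__ == "__main__":
        estimator = get_estimator()  # first run downloads SVHN via svhn_data.load_data()
        estimator.fit()              # assumed training entry point on Estimator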