nilboy · deepindeed2022 · Mar 13, 2017 · Mar 13, 2017 · Mar 17, 2017 · Mar 24, 2017
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+*/*.pyc
+*/*/*.pyc
diff --git a/demo.py b/demo.py
@@ -1,3 +1,5 @@
+#!/usr/bin/python 
+# -*- coding: utf-8 -*- 
 import sys
 
 sys.path.append('./')
@@ -7,18 +9,32 @@
 import cv2
 import numpy as np
 
-classes_name =  ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train","tvmonitor"]
+classes_name =  ["aeroplane", "bicycle", "bird", "boat", "bottle", 
+                "bus", "car", "cat", "chair", "cow", 
+                "diningtable", "dog", "horse", "motorbike", "person", 
+                "pottedplant", "sheep", "sofa", "train","tvmonitor"]
 
+common_params = { 'image_size': 448, 
+                  'num_classes': 20, 
+                  'batch_size':1}
+
+net_params = {'cell_size': 7, 
+              'boxes_per_cell':2, 
+              'weight_decay': 0.0005}
 
 def process_predicts(predicts):
+  """
+  The normalized network output may be awkward for a caller to consume, so
+  this helper rewrites it as one box (xmin, ymin, xmax, ymax) and a class index.
+  """
   p_classes = predicts[0, :, :, 0:20]
   C = predicts[0, :, :, 20:22]
   coordinate = predicts[0, :, :, 22:]
-
+  # The trained model uses the net_params hyper-parameters; cell_size is 7, hence the 7x7 grid
   p_classes = np.reshape(p_classes, (7, 7, 1, 20))
   C = np.reshape(C, (7, 7, 2, 1))
 
-  P = C * p_classes
+  P = C * p_classes   # P size = (7, 7, 2, 20)
 
   #print P[5,1, 0, :]
 
@@ -51,36 +67,42 @@ def process_predicts(predicts):
 
   return xmin, ymin, xmax, ymax, class_num
 
-common_params = {'image_size': 448, 'num_classes': 20, 
-                'batch_size':1}
-net_params = {'cell_size': 7, 'boxes_per_cell':2, 'weight_decay': 0.0005}
-
-net = YoloTinyNet(common_params, net_params, test=True)
-
-image = tf.placeholder(tf.float32, (1, 448, 448, 3))
-predicts = net.inference(image)
+def main():
 
-sess = tf.Session()
 
-np_img = cv2.imread('cat.jpg')
-resized_img = cv2.resize(np_img, (448, 448))
-np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
+  net = YoloTinyNet(common_params, net_params, test=True)
+  # Declare the TensorFlow placeholder `image`; it reappears in feed_dict
+  # when sess.run is called below, which is where the input data comes from
+  image = tf.placeholder(tf.float32, (1, 448, 448, 3))
+  predicts = net.inference(image)
 
+  sess = tf.Session()
 
-np_img = np_img.astype(np.float32)
+  # Load the image, resize it to 448x448 and convert BGR to RGB
+  np_img = cv2.imread('cat.jpg')
+  resized_img = cv2.resize(np_img, (448, 448))
+  np_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
 
-np_img = np_img / 255.0 * 2 - 1
-np_img = np.reshape(np_img, (1, 448, 448, 3))
+  np_img = np_img.astype(np.float32)
+  # Scale pixel values from [0, 255] to [-1, 1] (a rescale, not whitening)
+  np_img = np_img / 255.0 * 2 - 1
+  np_img = np.reshape(np_img, (1, 448, 448, 3))
 
-saver = tf.train.Saver(net.trainable_collection)
+  saver = tf.train.Saver(net.trainable_collection)
 
-saver.restore(sess, 'models/pretrain/yolo_tiny.ckpt')
+  saver.restore(sess, 'models/pretrain/yolo_tiny.ckpt')
+  # The optional feed_dict argument allows the caller to override 
+  # the value of tensors in the graph. 
+  np_predict = sess.run(predicts, feed_dict={image: np_img})
 
-np_predict = sess.run(predicts, feed_dict={image: np_img})
+  xmin, ymin, xmax, ymax, class_num = process_predicts(np_predict)
+  class_name = classes_name[class_num]
+  # Draw the predicted box and label it with the predicted class name
+  cv2.rectangle(resized_img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 0, 255))
+  cv2.putText(resized_img, 
+              class_name, (int(xmin), int(ymin)), 2, 1.5, (0, 0, 255))
+  cv2.imwrite('cat_out.jpg', resized_img)
+  sess.close()
 
-xmin, ymin, xmax, ymax, class_num = process_predicts(np_predict)
-class_name = classes_name[class_num]
-cv2.rectangle(resized_img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 0, 255))
-cv2.putText(resized_img, class_name, (int(xmin), int(ymin)), 2, 1.5, (0, 0, 255))
-cv2.imwrite('cat_out.jpg', resized_img)
-sess.close()
+if __name__ == '__main__':
+  main()
diff --git a/tools/preprocess_pascal_voc.py b/tools/preprocess_pascal_voc.py
@@ -1,3 +1,5 @@
+#!/usr/bin/python 
+# -*- coding: utf-8 -*- 
 """preprocess pascal_voc data
 """
 import os

diff --git a/tools/train.py b/tools/train.py
@@ -1,3 +1,5 @@
+#!/usr/bin/python 
+# -*- coding: utf-8 -*- 
 import sys
 from optparse import OptionParser
 

diff --git a/yolo/dataset/dataset.py b/yolo/dataset/dataset.py
@@ -1,3 +1,6 @@
+#!/usr/bin/python 
+# -*- coding: utf-8 -*- 
+
 """DataSet  base class 
 """
 class DataSet(object):

diff --git a/yolo/dataset/text_dataset.py b/yolo/dataset/text_dataset.py
@@ -1,3 +1,5 @@
+#!/usr/bin/python 
+# -*- coding: utf-8 -*- 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -12,11 +14,13 @@
 
 from yolo.dataset.dataset import DataSet 
 
+
 class TextDataSet(DataSet):
   """TextDataSet
   process text input file dataset 
   text file format:
     image_path xmin1 ymin1 xmax1 ymax1 class1 xmin2 ymin2 xmax2 ymax2 class2
+  Design: producer/consumer pattern (one producer thread, thread_num consumer threads).
   """
 
   def __init__(self, common_params, dataset_params):
@@ -52,11 +56,12 @@ def __init__(self, common_params, dataset_params):
     self.record_number = len(self.record_list)
 
     self.num_batch_per_epoch = int(self.record_number / self.batch_size)
-
+    # Create and start the producer daemon thread
     t_record_producer = Thread(target=self.record_producer)
     t_record_producer.daemon = True 
     t_record_producer.start()
-
+
+    # Create and start thread_num consumer daemon threads
     for i in range(self.thread_num):
       t = Thread(target=self.record_customer)
       t.daemon = True
@@ -109,6 +114,12 @@ def record_process(self, record):
       labels[object_num] = [xcenter, ycenter, box_w, box_h, class_num]
       object_num += 1
       i += 5
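+      # TODO:
+      # Could this drop salient objects?
+      # self.max_objects is a user-defined limit, so only the first
+      # max_objects objects of a record are read and the rest are ignored.
+      # TODO: check how the objects of each image are ordered in the training
+      # data: by saliency, or by the position/size of their boxes.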
       if object_num >= self.max_objects:
         break
     return [image, labels, object_num]
@@ -124,9 +135,9 @@ def record_customer(self):
   def batch(self):
     """get batch
     Returns:
-      images: 4-D ndarray [batch_size, height, width, 3]
-      labels: 3-D ndarray [batch_size, max_objects, 5]
-      objects_num: 1-D ndarray [batch_size]
+      images: 4-D ndarray [batch_size, height, width, 3], image data of every image in the batch
+      labels: 3-D ndarray [batch_size, max_objects, 5], labels of all objects of every image in the batch
+      objects_num: 1-D ndarray [batch_size], number of objects in each image of the batch
     """
     images = []
     labels = []

diff --git a/yolo/net/net.py b/yolo/net/net.py
@@ -1,3 +1,5 @@
+#!/usr/bin/python 
+# -*- coding: utf-8 -*- 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -51,7 +53,6 @@ def _variable_with_weight_decay(self, name, shape, stddev, wd, pretrain=True, tr
       stddev: standard devision of a truncated Gaussian
       wd: add L2Loss weight decay multiplied by this float. If None, weight 
       decay is not added for this Variable.
-
    Returns:
       Variable Tensor 
     """
@@ -74,16 +75,19 @@ def conv2d(self, scope, input, kernel_size, stride=1, pretrain=True, train=True)
       output: 4-D tensor [batch_size, height/stride, width/stride, out_channels]
     """
     with tf.variable_scope(scope) as scope:
+      # Create the convolution kernel weights (with weight decay)
       kernel = self._variable_with_weight_decay('weights', 
                                       shape=kernel_size,
                                       stddev=5e-2,
                                       wd=self.weight_decay, pretrain=pretrain, train=train)
       conv = tf.nn.conv2d(input, kernel, [1, stride, stride, 1], padding='SAME')
+
+      # Biases are initialized to the constant 0.0
       biases = self._variable_on_cpu('biases', kernel_size[3:], tf.constant_initializer(0.0), pretrain, train)
-      bias = tf.nn.bias_add(conv, biases)
-      conv1 = self.leaky_relu(bias)
+      conv1 = tf.nn.bias_add(conv, biases)
+      output = self.leaky_relu(conv1)
 
-    return conv1
+    return output
 
 
   def max_pool(self, input, kernel_size, stride):
@@ -99,7 +103,7 @@ def max_pool(self, input, kernel_size, stride):
     return tf.nn.max_pool(input, ksize=[1, kernel_size[0], kernel_size[1], 1], strides=[1, stride, stride, 1],
                   padding='SAME')
 
-  def local(self, scope, input, in_dimension, out_dimension, leaky=True, pretrain=True, train=True):
+  def local(self, scope, _input, in_dimension, out_dimension, leaky=True, pretrain=True, train=True):
     """Fully connection layer
 
     Args:
@@ -110,7 +114,7 @@ def local(self, scope, input, in_dimension, out_dimension, leaky=True, pretrain=
       output: 2-D tensor [batch_size, out_dimension]
     """
     with tf.variable_scope(scope) as scope:
-      reshape = tf.reshape(input, [tf.shape(input)[0], -1])
+      reshape = tf.reshape(_input, [tf.shape(_input)[0], -1])
 
       weights = self._variable_with_weight_decay('weights', shape=[in_dimension, out_dimension],
                           stddev=0.04, wd=self.weight_decay, pretrain=pretrain, train=train)
@@ -137,6 +141,9 @@ def leaky_relu(self, x, alpha=0.1, dtype=tf.float32):
       y : Tensor
     """
     x = tf.cast(x, dtype=dtype)
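+    # Apply leaky ReLU to the input tensor:
+    # values > 0 pass through unchanged, all other values are scaled by alpha.
+    # Implemented with a float mask rather than a conditional; a pattern worth reusing.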
     bool_mask = (x > 0)
     mask = tf.cast(bool_mask, dtype=dtype)
     return 1.0 * mask * x + alpha * (1 - mask) * x
@@ -160,4 +167,13 @@ def loss(self, predicts, labels, objects_num):
       labels  : 3-D tensor of [batch_size, max_objects, 5]
       objects_num: 1-D tensor [batch_size]
     """
-    raise NotImplementedError
+    raise NotImplementedError
+
+'''
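+## weight decay:
+In machine learning and pattern recognition a model can overfit, and as a
+network overfits its weights tend to grow. To counter this, a penalty term is
+added to the loss, commonly the sum of the squared weights multiplied by a
+decay constant. It drives the weights toward small absolute values, because
+large weights cause overfitting and reduce generalization performance.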
+'''
diff --git a/yolo/net/yolo_net.py b/yolo/net/yolo_net.py
@@ -1,3 +1,5 @@
+#!/usr/bin/python 
+# -*- coding: utf-8 -*- 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -123,6 +125,7 @@ def iou(self, boxes1, boxes2):
     Return:
       iou: 3-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
     """
+    # Convert boxes from (x_center, y_center, w, h) to top-left / bottom-right corner coordinates
     boxes1 = tf.pack([boxes1[:, :, :, 0] - boxes1[:, :, :, 2] / 2, boxes1[:, :, :, 1] - boxes1[:, :, :, 3] / 2,
                       boxes1[:, :, :, 0] + boxes1[:, :, :, 2] / 2, boxes1[:, :, :, 1] + boxes1[:, :, :, 3] / 2])
     boxes1 = tf.transpose(boxes1, [1, 2, 3, 0])
@@ -134,6 +137,16 @@ def iou(self, boxes1, boxes2):
     rd = tf.minimum(boxes1[:, :, :, 2:], boxes2[2:])
 
     #intersection
+    '''
+    0, 0------------------------>
+    |    ————————————|
+    |    |     ——————|——————    
+    |    |     |     |      |
+    |    |—————|—————|      |
+    |          |____________| 
+    |
+    v
+    '''
     intersection = rd - lu 
 
     inter_square = intersection[:, :, :, 0] * intersection[:, :, :, 1]
@@ -285,7 +298,17 @@ def loss(self, predicts, labels, objects_num):
       label = labels[i, :, :]
       object_num = objects_num[i]
       nilboy = tf.ones([7,7,2])
-      tuple_results = tf.while_loop(self.cond1, self.body1, [tf.constant(0), object_num, [class_loss, object_loss, noobject_loss, coord_loss], predict, label, nilboy])
+      tuple_results = tf.while_loop(
+                        self.cond1, 
+                        self.body1, 
+                        [
+                            tf.constant(0), 
+                            object_num, 
+                            [class_loss, object_loss, noobject_loss, coord_loss], 
+                            predict, 
+                            label, 
+                            nilboy
+                        ])
       for j in range(4):
         loss[j] = loss[j] + tuple_results[2][j]
       nilboy = tuple_results[5]