Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Does your code contain the mask mAP computation? #139

Open
wants to merge 35 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
8b59e12
added mask visualization
souryuu Jul 4, 2017
b0f41c8
fixed multiple mask related issue
souryuu Jul 17, 2017
9b94db9
remove some comments
souryuu Jul 17, 2017
ab1195b
Changes in a network
souryuu Jul 19, 2017
702fae8
Changed training mask sampling method and IoU threshold
souryuu Jul 24, 2017
f5658d6
fixed some indentation
souryuu Jul 28, 2017
93523f4
clean up variable names and comments
souryuu Jul 31, 2017
66898af
fixed test.py
souryuu Aug 1, 2017
9c27716
merged inst.py to sample.py
souryuu Aug 1, 2017
0b30a16
merge to master
souryuu Aug 1, 2017
0195519
clean up comments
souryuu Aug 1, 2017
9adc8e4
comments
souryuu Aug 1, 2017
b79d5c1
fixed only_positive in sample_rpn_outputs
souryuu Aug 1, 2017
2d0514c
fixed only_positive in sample_rpn_outputs_wrt_gt
souryuu Aug 1, 2017
c2027db
changed some hyper params
souryuu Aug 1, 2017
8b96804
remove gt during testing
souryuu Aug 1, 2017
ae98c85
fixed nms in sampling during test
souryuu Aug 3, 2017
4adbc54
fixed conflict from variable names
souryuu Aug 3, 2017
9b351ae
last check before change anchor
souryuu Aug 7, 2017
10ffbd2
changed anchor from 3x3 to 5x3
souryuu Aug 7, 2017
2d1622d
changed anchor to match with MaskRCNN original paper
souryuu Aug 9, 2017
43d3992
speed up sorting in sample_rpn_outputs
souryuu Aug 10, 2017
f69a778
changed detail in config_v1.py
souryuu Aug 10, 2017
e197a0e
fixed number of test data
souryuu Aug 10, 2017
6b41fee
fix wrong sorting in sample_rpn_outputs
souryuu Aug 10, 2017
44f3c67
simplified config_v1 (no need to set _is_training)
souryuu Aug 10, 2017
038973e
Change only_positive to False for training from scratch
souryuu Aug 11, 2017
0a166dc
excluded crowd instances from dataset
souryuu Aug 31, 2017
4b31df9
failed version, don't use this one
souryuu Sep 8, 2017
ec156d6
commit before rollback
souryuu Sep 12, 2017
dca602c
roll back some part to v1
souryuu Sep 12, 2017
52115a5
test.py should now be able to test bounding box AP (mask is not included…
souryuu Sep 12, 2017
2f636d8
added segmentation evaluation
souryuu Sep 13, 2017
569a0aa
fixed shuffle and queue during training
souryuu Sep 25, 2017
eec8946
fixed some memory issues (use script/train.sh for training)
souryuu Sep 29, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion libs/boxes/anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,25 @@ def anchors_plane(height, width, stride = 1.0,
# ratios = kwargs.setdefault('ratios', [0.5, 1, 2.0])
# base = kwargs.setdefault('base', 16)
anc = anchors(scales, ratios, base)
all_anchors = cython_anchor.anchors_plane(height, width, stride, anc)
all_anchors = cython_anchor.anchors_plane(height, width, stride, anc).astype(np.float32)
return all_anchors

def jitter_gt_boxes(gt_boxes, jitter=0.1):
    """Randomly translate ground-truth boxes before adding them as RoIs.

    Each box receives an independent random shift of up to +/- jitter/2
    of its own width (in x) and height (in y); both opposite edges move
    together, so every box keeps its original size.  This makes the
    classification/regression heads more robust to imperfect proposals.

    gt_boxes: (G, 5) array of [x1, y1, x2, y2, class]
    jitter:   fraction of the box size used as the jitter range
    Returns a new array; the input is left untouched.
    """
    out = gt_boxes.copy()
    num_boxes = out.shape[0]
    box_ws = out[:, 2] - out[:, 0] + 1.0
    box_hs = out[:, 3] - out[:, 1] + 1.0
    # Uniform in [-0.5, 0.5), scaled by jitter * box size.
    dx = (np.random.rand(num_boxes) - 0.5) * jitter * box_ws
    dy = (np.random.rand(num_boxes) - 0.5) * jitter * box_hs
    out[:, 0] += dx
    out[:, 2] += dx
    out[:, 1] += dy
    out[:, 3] += dy
    return out

# Written by Ross Girshick and Sean Bell
def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
scales=2 ** np.arange(3, 6)):
Expand Down
27 changes: 19 additions & 8 deletions libs/boxes/bbox_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,16 @@ def bbox_transform(ex_rois, gt_rois):

# warnings.catch_warnings()
# warnings.filterwarnings('error')
targets_dx = 10.0 * (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = 10.0 * (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = 5.0 * np.log(gt_widths / ex_widths)
targets_dh = 5.0 * np.log(gt_heights / ex_heights)

# targets_dx = 10.0 * (gt_ctr_x - ex_ctr_x) / ex_widths
# targets_dy = 10.0 * (gt_ctr_y - ex_ctr_y) / ex_heights
# targets_dw = 5.0 * np.log(gt_widths / ex_widths)
# targets_dh = 5.0 * np.log(gt_heights / ex_heights)

targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = np.log(gt_widths / ex_widths)
targets_dh = np.log(gt_heights / ex_heights)

targets = np.vstack(
(targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
Expand All @@ -51,10 +57,15 @@ def bbox_transform_inv(boxes, deltas):
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights

dx = deltas[:, 0::4] * 0.1
dy = deltas[:, 1::4] * 0.1
dw = deltas[:, 2::4] * 0.2
dh = deltas[:, 3::4] * 0.2
# dx = deltas[:, 0::4] * 0.1
# dy = deltas[:, 1::4] * 0.1
# dw = deltas[:, 2::4] * 0.2
# dh = deltas[:, 3::4] * 0.2

dx = deltas[:, 0::4]
dy = deltas[:, 1::4]
dw = deltas[:, 2::4]
dh = deltas[:, 3::4]

pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
Expand Down
45 changes: 30 additions & 15 deletions libs/configs/config_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# dataset
##########################
tf.app.flags.DEFINE_bool(
'update_bn', False,
'update_bn', True,
'Whether or not to update bacth normalization layer')

tf.app.flags.DEFINE_integer(
Expand All @@ -41,6 +41,10 @@
'dataset_split_name', 'train2014',
'The name of the train/test/val split.')

tf.app.flags.DEFINE_string(
'dataset_split_name_test', 'train2014',#val2014
'The name of the test/val split.')

tf.app.flags.DEFINE_string(
'dataset_dir', 'data/coco/',
'The directory where the dataset files are stored.')
Expand Down Expand Up @@ -75,7 +79,7 @@
######################

tf.app.flags.DEFINE_float(
'weight_decay', 0.00005, 'The weight decay on the model weights.')
'weight_decay', 0.00001, 'The weight decay on the model weights.')

tf.app.flags.DEFINE_string(
'optimizer', 'momentum',
Expand Down Expand Up @@ -114,23 +118,25 @@
'ftrl_l2', 0.0, 'The FTRL l2 regularization strength.')

tf.app.flags.DEFINE_float(
'momentum', 0.99,
'momentum', 0.9,
'The momentum for the MomentumOptimizer and RMSPropOptimizer.')

tf.app.flags.DEFINE_float('rmsprop_momentum', 0.99, 'Momentum.')

tf.app.flags.DEFINE_float('rmsprop_decay', 0.99, 'Decay term for RMSProp.')

tf.app.flags.DEFINE_float('batch_norm_decay', 0.9, 'Decay term for batch normalization.')

#######################
# Learning Rate Flags #
#######################

tf.app.flags.DEFINE_string(
'learning_rate_decay_type', 'exponential',
'learning_rate_decay_type', 'fixed',
'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
' or "polynomial"')

tf.app.flags.DEFINE_float('learning_rate', 0.002,
tf.app.flags.DEFINE_float('learning_rate', 0.0001,#0.0002
'Initial learning rate.')

tf.app.flags.DEFINE_float(
Expand Down Expand Up @@ -226,20 +232,21 @@
#######################
# BOX Flags #
#######################
tf.app.flags.DEFINE_float(
'rpn_bg_threshold', 0.3,
'Only regions which intersection is larger than fg_threshold are considered to be fg')

tf.app.flags.DEFINE_float(
'rpn_fg_threshold', 0.7,
'Only regions which intersection is larger than fg_threshold are considered to be fg')

tf.app.flags.DEFINE_float(
'fg_threshold', 0.7,
'rpn_bg_threshold', 0.3,
'Only regions which intersection is less than bg_threshold are considered to be fg')

tf.app.flags.DEFINE_float(
'fg_threshold', 0.5,
'Only regions which intersection is larger than fg_threshold are considered to be fg')

tf.app.flags.DEFINE_float(
'bg_threshold', 0.3,
'bg_threshold', 0.5,
'Only regions which intersection is less than bg_threshold are considered to be bg')

tf.app.flags.DEFINE_integer(
Expand All @@ -255,12 +262,12 @@
'Number of rois that should be sampled to train this network')

tf.app.flags.DEFINE_integer(
'rpn_batch_size', 500,
'rpn_batch_size', 256,
'Number of rpn anchors that should be sampled to train this network')

tf.app.flags.DEFINE_integer(
'allow_border', 10,
'How many pixels out of an image')
'allow_border', 0.0,
'Percentage of bounding box height and length that are allowed to be out of an image boundary')

##################################
# NMS #
Expand All @@ -274,9 +281,17 @@
'post_nms_top_n', 2000,
'Number of rpn anchors that should be sampled after nms')

tf.app.flags.DEFINE_integer(
'post_nms_inst_n', 300,
"Number of inst after NMS")

tf.app.flags.DEFINE_float(
'rpn_nms_threshold', 0.7,
'NMS threshold')
'NMS threshold in RPN')

tf.app.flags.DEFINE_float(
'mask_nms_threshold', 0.3,
'NMS threshold in mask network during testing')

##################################
# Mask #
Expand All @@ -290,7 +305,7 @@
'mask_threshold', 0.50,
'Least intersection of a positive mask')
tf.app.flags.DEFINE_integer(
'masks_per_image', 64,
'masks_per_image', 256,
'Number of rois that should be sampled to train this network')

tf.app.flags.DEFINE_float(
Expand Down
4 changes: 2 additions & 2 deletions libs/datasets/coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,12 @@ def _height_decoder(keys_to_tensors):
items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
num_classes=_NUM_CLASSES)

def read(tfrecords_filename):
def read(tfrecords_filename, is_training=False):

if not isinstance(tfrecords_filename, list):
tfrecords_filename = [tfrecords_filename]
filename_queue = tf.train.string_input_producer(
tfrecords_filename, num_epochs=100)
tfrecords_filename, shuffle=is_training)#, num_epochs=100

options = tf.python_io.TFRecordOptions(TFRecordCompressionType.ZLIB)
reader = tf.TFRecordReader(options=options)
Expand Down
6 changes: 3 additions & 3 deletions libs/datasets/dataset_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ def get_dataset(dataset_name, split_name, dataset_dir,
file_pattern = dataset_name + '_' + split_name + '*.tfrecord'

tfrecords = glob.glob(dataset_dir + '/records/' + file_pattern)
image, ih, iw, gt_boxes, gt_masks, num_instances, img_id = coco.read(tfrecords)
image, ih, iw, gt_boxes, gt_masks, num_instances, img_id = coco.read(tfrecords, is_training=is_training)

image, gt_boxes, gt_masks = coco_preprocess.preprocess_image(image, gt_boxes, gt_masks, is_training)
image, new_ih, new_iw, gt_boxes, gt_masks = coco_preprocess.preprocess_image(image, gt_boxes, gt_masks, is_training)
#visualize_input(gt_boxes, image, tf.expand_dims(gt_masks, axis=3))

return image, ih, iw, gt_boxes, gt_masks, num_instances, img_id
return image, ih, iw, new_ih, new_iw, gt_boxes, gt_masks, num_instances, img_id

31 changes: 26 additions & 5 deletions libs/datasets/download_and_convert_coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,11 @@ def _get_coco_masks(coco, img_id, height, width, img_name):
if bboxes.shape[0] <= 0:
bboxes = np.zeros([0, 4], dtype=np.float32)
classes = np.zeros([0], dtype=np.float32)
print ('None Annotations %s' % img_name)
LOG('None Annotations %s' % img_name)
#print ('None Annotations %s' % img_name)
#LOG('None Annotations %s' % img_name)
no_annotation_flag = True
else:
no_annotation_flag = False
bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
gt_boxes = np.hstack((bboxes, classes[:, np.newaxis]))
Expand All @@ -228,7 +231,7 @@ def _get_coco_masks(coco, img_id, height, width, img_name):
mask = mask.astype(np.uint8)
assert masks.shape[0] == gt_boxes.shape[0], 'Shape Error'

return gt_boxes, masks, mask
return gt_boxes, masks, mask, no_annotation_flag



Expand Down Expand Up @@ -286,11 +289,24 @@ def _add_to_tfrecord(record_dir, image_dir, annotation_dir, split_name):

# jump over the damaged images
if str(img_id) == '320612':
sys.stdout.write('\r>> skipping image %d/%d shard %d\n' % (
i + 1, len(imgs), shard_id))
sys.stdout.flush()
continue

# process anns
height, width = imgs[i][1]['height'], imgs[i][1]['width']
gt_boxes, masks, mask = _get_coco_masks(coco, img_id, height, width, img_name)
if float(height)/float(width) > 3.02 or float(width)/float(height) > 3.02:
sys.stdout.write('\r>> skipping image %d/%d shard %d height:%d width:%d\n' % (
i + 1, len(imgs), shard_id, height, width))
sys.stdout.flush()
continue
gt_boxes, masks, mask, no_annotation_flag = _get_coco_masks(coco, img_id, height, width, img_name)
if no_annotation_flag is True:
sys.stdout.write('\r>> skipping image %d/%d shard %d no annotation \n' % (
i + 1, len(imgs), shard_id))
sys.stdout.flush()
continue

# read image as RGB numpy
img = np.array(Image.open(img_name))
Expand Down Expand Up @@ -402,7 +418,12 @@ def is_in_minival(img_id, minival):
height, width = imgs[i][1]['height'], imgs[i][1]['width']
coco = coco_train if i < num_of_train else coco_val

gt_boxes, masks, mask = _get_coco_masks(coco, img_id, height, width, img_name)
gt_boxes, masks, mask, no_annotation_flag = _get_coco_masks(coco, img_id, height, width, img_name)
if no_annotation_flag is True:
sys.stdout.write('\r>> skipping image %d/%d shard %d no annotation \n' % (
i + 1, len(imgs), shard_id))
sys.stdout.flush()
continue

# read image as RGB numpy
img = np.array(Image.open(img_name))
Expand Down
Loading