Skip to content

Commit a7e397a

Browse files
committed
Merge pull request #482 from shelhamer/rcnn-detector-example
Make R-CNN the Caffe detection example
2 parents 63c7429 + 9882d47 commit a7e397a

File tree

7 files changed

+801
-294
lines changed

7 files changed

+801
-294
lines changed

docs/getting_pretrained_models.md

+2
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,6 @@ This page will be updated as more models become available.
2424
- The best validation performance during training was iteration 358,000 with
2525
validation accuracy 57.258% and loss 1.83948.
2626

27+
**R-CNN (ILSVRC13)**: The pure Caffe instantiation of the [R-CNN](https://github.com/rbgirshick/rcnn) model for ILSVRC13 detection. Download the model (230.8MB) by running `examples/imagenet/get_caffe_rcnn_imagenet_model.sh` from the Caffe root directory. This model was made by transplanting the R-CNN SVM classifiers into a `fc-rcnn` classification layer, provided here as an off-the-shelf Caffe detector. Try the [detection example](http://nbviewer.ipython.org/github/BVLC/caffe/blob/master/examples/detection.ipynb) to see it in action. For the full details, refer to the R-CNN site. *N.B. For research purposes, make use of the official R-CNN package and not this example.*
28+
2729
Additionally, you will probably eventually need some auxiliary data (mean image, synset list, etc.): run `data/ilsvrc12/get_ilsvrc_aux.sh` from the root directory to obtain it.

examples/detection.ipynb

+467-283
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env sh
# This script downloads the Caffe R-CNN ImageNet model
# for ILSVRC13 detection.
#
# If the model file is already present its md5 is compared against CHECKSUM:
# a matching file is kept, a mismatching file is removed and fetched again.

MODEL=caffe_rcnn_imagenet_model
CHECKSUM=42c1556d2d47a9128c4a90e0a9c5341c

if [ -f "$MODEL" ]; then
  echo "Model already exists. Checking md5..."
  os=`uname -s`
  # Start from an empty checksum so that an unsupported OS (neither branch
  # below) compares as a mismatch instead of reusing an inherited variable.
  checksum=""
  if [ "$os" = "Linux" ]; then
    checksum=`md5sum "$MODEL" | awk '{ print $1 }'`
  elif [ "$os" = "Darwin" ]; then
    checksum=`cat "$MODEL" | md5`
  fi
  if [ "$checksum" = "$CHECKSUM" ]; then
    echo "Model checksum is correct. No need to download."
    exit 0
  else
    echo "Model checksum is incorrect. Need to download again."
    # wget does not overwrite an existing file; it would save the new copy
    # as "$MODEL.1" and leave the bad file in place. Remove it first.
    rm -f "$MODEL"
  fi
fi

echo "Downloading..."

# NOTE(review): --no-check-certificate disables TLS verification; the md5
# check on the next run is the only integrity guard for this download.
wget --no-check-certificate https://www.dropbox.com/s/0i3etlgmsmgf5ei/$MODEL

echo "Done. Please run this command again to verify that checksum = $CHECKSUM."
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
name: "R-CNN-ilsvrc13"
2+
input: "data"
3+
input_dim: 10
4+
input_dim: 3
5+
input_dim: 227
6+
input_dim: 227
7+
layers {
8+
name: "conv1"
9+
type: CONVOLUTION
10+
bottom: "data"
11+
top: "conv1"
12+
convolution_param {
13+
num_output: 96
14+
kernel_size: 11
15+
stride: 4
16+
}
17+
}
18+
layers {
19+
name: "relu1"
20+
type: RELU
21+
bottom: "conv1"
22+
top: "conv1"
23+
}
24+
layers {
25+
name: "pool1"
26+
type: POOLING
27+
bottom: "conv1"
28+
top: "pool1"
29+
pooling_param {
30+
pool: MAX
31+
kernel_size: 3
32+
stride: 2
33+
}
34+
}
35+
layers {
36+
name: "norm1"
37+
type: LRN
38+
bottom: "pool1"
39+
top: "norm1"
40+
lrn_param {
41+
local_size: 5
42+
alpha: 0.0001
43+
beta: 0.75
44+
}
45+
}
46+
layers {
47+
name: "conv2"
48+
type: CONVOLUTION
49+
bottom: "norm1"
50+
top: "conv2"
51+
convolution_param {
52+
num_output: 256
53+
pad: 2
54+
kernel_size: 5
55+
group: 2
56+
}
57+
}
58+
layers {
59+
name: "relu2"
60+
type: RELU
61+
bottom: "conv2"
62+
top: "conv2"
63+
}
64+
layers {
65+
name: "pool2"
66+
type: POOLING
67+
bottom: "conv2"
68+
top: "pool2"
69+
pooling_param {
70+
pool: MAX
71+
kernel_size: 3
72+
stride: 2
73+
}
74+
}
75+
layers {
76+
name: "norm2"
77+
type: LRN
78+
bottom: "pool2"
79+
top: "norm2"
80+
lrn_param {
81+
local_size: 5
82+
alpha: 0.0001
83+
beta: 0.75
84+
}
85+
}
86+
layers {
87+
name: "conv3"
88+
type: CONVOLUTION
89+
bottom: "norm2"
90+
top: "conv3"
91+
convolution_param {
92+
num_output: 384
93+
pad: 1
94+
kernel_size: 3
95+
}
96+
}
97+
layers {
98+
name: "relu3"
99+
type: RELU
100+
bottom: "conv3"
101+
top: "conv3"
102+
}
103+
layers {
104+
name: "conv4"
105+
type: CONVOLUTION
106+
bottom: "conv3"
107+
top: "conv4"
108+
convolution_param {
109+
num_output: 384
110+
pad: 1
111+
kernel_size: 3
112+
group: 2
113+
}
114+
}
115+
layers {
116+
name: "relu4"
117+
type: RELU
118+
bottom: "conv4"
119+
top: "conv4"
120+
}
121+
layers {
122+
name: "conv5"
123+
type: CONVOLUTION
124+
bottom: "conv4"
125+
top: "conv5"
126+
convolution_param {
127+
num_output: 256
128+
pad: 1
129+
kernel_size: 3
130+
group: 2
131+
}
132+
}
133+
layers {
134+
name: "relu5"
135+
type: RELU
136+
bottom: "conv5"
137+
top: "conv5"
138+
}
139+
layers {
140+
name: "pool5"
141+
type: POOLING
142+
bottom: "conv5"
143+
top: "pool5"
144+
pooling_param {
145+
pool: MAX
146+
kernel_size: 3
147+
stride: 2
148+
}
149+
}
150+
layers {
151+
name: "fc6"
152+
type: INNER_PRODUCT
153+
bottom: "pool5"
154+
top: "fc6"
155+
inner_product_param {
156+
num_output: 4096
157+
}
158+
}
159+
layers {
160+
name: "relu6"
161+
type: RELU
162+
bottom: "fc6"
163+
top: "fc6"
164+
}
165+
layers {
166+
name: "drop6"
167+
type: DROPOUT
168+
bottom: "fc6"
169+
top: "fc6"
170+
dropout_param {
171+
dropout_ratio: 0.5
172+
}
173+
}
174+
layers {
175+
name: "fc7"
176+
type: INNER_PRODUCT
177+
bottom: "fc6"
178+
top: "fc7"
179+
inner_product_param {
180+
num_output: 4096
181+
}
182+
}
183+
layers {
184+
name: "relu7"
185+
type: RELU
186+
bottom: "fc7"
187+
top: "fc7"
188+
}
189+
layers {
190+
name: "drop7"
191+
type: DROPOUT
192+
bottom: "fc7"
193+
top: "fc7"
194+
dropout_param {
195+
dropout_ratio: 0.5
196+
}
197+
}
198+
# R-CNN classification layer made from R-CNN ILSVRC13 SVMs.
199+
layers {
200+
name: "fc-rcnn"
201+
type: INNER_PRODUCT
202+
bottom: "fc7"
203+
top: "fc-rcnn"
204+
inner_product_param {
205+
num_output: 200
206+
}
207+
}

examples/images/fish-bike.jpg

46.5 KB
Loading

python/caffe/detector.py

+88-8
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@
1212
The selective_search_ijcv_with_python code required for the selective search
1313
proposal mode is available at
1414
https://github.com/sergeyk/selective_search_ijcv_with_python
15-
16-
TODO
17-
- R-CNN crop mode / crop with context.
18-
- Bundle with R-CNN model for example.
1915
"""
2016
import numpy as np
2117
import os
@@ -29,11 +25,14 @@ class Detector(caffe.Net):
2925
selective search proposals.
3026
"""
3127
def __init__(self, model_file, pretrained_file, gpu=False, mean_file=None,
32-
input_scale=None, channel_swap=None):
28+
input_scale=None, channel_swap=None, context_pad=None):
3329
"""
3430
Take
3531
gpu, mean_file, input_scale, channel_swap: convenience params for
3632
setting mode, mean, input scale, and channel order.
33+
context_pad: amount of surrounding context to take s.t. a `context_pad`
34+
sized border of pixels in the network input image is context, as in
35+
R-CNN feature extraction.
3736
"""
3837
caffe.Net.__init__(self, model_file, pretrained_file)
3938
self.set_phase_test()
@@ -50,6 +49,8 @@ def __init__(self, model_file, pretrained_file, gpu=False, mean_file=None,
5049
if channel_swap:
5150
self.set_channel_swap(self.inputs[0], channel_swap)
5251

52+
self.configure_crop(context_pad)
53+
5354

5455
def detect_windows(self, images_windows):
5556
"""
@@ -58,6 +59,7 @@ def detect_windows(self, images_windows):
5859
5960
Take
6061
images_windows: (image filename, window list) iterable.
62+
context_crop: size of context border to crop in pixels.
6163
6264
Give
6365
detections: list of {filename: image filename, window: crop coordinates,
@@ -68,8 +70,7 @@ def detect_windows(self, images_windows):
6870
for image_fname, windows in images_windows:
6971
image = caffe.io.load_image(image_fname).astype(np.float32)
7072
for window in windows:
71-
window_inputs.append(image[window[0]:window[2],
72-
window[1]:window[3]])
73+
window_inputs.append(self.crop(image, window))
7374

7475
# Run through the net (warping windows to input dimensions).
7576
caffe_in = np.asarray([self.preprocess(self.inputs[0], window_in)
@@ -106,6 +107,85 @@ def detect_selective_search(self, image_fnames):
106107
import selective_search_ijcv_with_python as selective_search
107108
# Make absolute paths so MATLAB can find the files.
108109
image_fnames = [os.path.abspath(f) for f in image_fnames]
109-
windows_list = selective_search.get_windows(image_fnames)
110+
windows_list = selective_search.get_windows(
111+
image_fnames,
112+
cmd='selective_search_rcnn'
113+
)
110114
# Run windowed detection on the selective search list.
111115
return self.detect_windows(zip(image_fnames, windows_list))
116+
117+
118+
def crop(self, im, window):
    """
    Crop a window from the image for detection. Include surrounding context
    according to the `context_pad` configuration.

    Without context padding the window is sliced out of the image as-is.
    With context padding the window is enlarged so that, once resized to the
    network input size, a `context_pad` wide border of the input is context.
    Parts of the enlarged box that fall outside the image keep the values of
    `self.crop_mean`.

    Take
    im: H x W x K image ndarray to crop.
    window: bounding box coordinates as ymin, xmin, ymax, xmax
        (ndarray, list, or tuple).

    Give
    crop: cropped window.
    """
    if not self.context_pad:
        # No context requested: the plain window is the crop.
        return im[window[0]:window[2], window[1]:window[3]]

    crop_size = self.blobs[self.inputs[0]].width  # assumes square
    scale = crop_size / (1. * crop_size - self.context_pad * 2)

    # Work on a float copy so the caller's window is never modified and
    # list / tuple windows are accepted as well as ndarrays.
    box = np.array(window, dtype=np.float64)

    # Grow the box about its center to take in the surrounding context.
    # NOTE(review): extents are computed inclusively (+ 1) while the slice
    # further down excludes the max row / column -- kept as in the original.
    half_h = (box[2] - box[0] + 1) / 2.
    half_w = (box[3] - box[1] + 1) / 2.
    center = (box[0] + half_h, box[1] + half_w)
    scaled_dims = scale * np.array((-half_h, -half_w, half_h, half_w))
    box = np.round(np.tile(center, 2) + scaled_dims)
    full_h = box[2] - box[0] + 1
    full_w = box[3] - box[1] + 1
    scale_h = crop_size / full_h
    scale_w = crop_size / full_w
    # Amount out-of-bounds on the top / left, in network input pixels.
    # Cast to int: these values are used as slice indices below.
    pad_y = int(round(max(0, -box[0]) * scale_h))
    pad_x = int(round(max(0, -box[1]) * scale_w))

    # Clip box to image dimensions.
    im_h, im_w = im.shape[:2]
    box = np.clip(box, 0., [im_h, im_w, im_h, im_w])
    clip_h = box[2] - box[0] + 1
    clip_w = box[3] - box[1] + 1
    assert(clip_h > 0 and clip_w > 0)
    crop_h = int(round(clip_h * scale_h))
    crop_w = int(round(clip_w * scale_w))
    # Rounding can overshoot the input size; trim so the paste below fits.
    if pad_y + crop_h > crop_size:
        crop_h = crop_size - pad_y
    if pad_x + crop_w > crop_size:
        crop_w = crop_size - pad_x

    # Collect with context padding and place in input with mean padding.
    # The box holds whole-number floats after rounding and clipping; NumPy
    # requires integer slice indices, so convert before indexing.
    ymin, xmin, ymax, xmax = box.astype(int)
    context_crop = im[ymin:ymax, xmin:xmax]
    context_crop = caffe.io.resize_image(context_crop, (crop_h, crop_w))
    crop = self.crop_mean.copy()
    crop[pad_y:(pad_y + crop_h), pad_x:(pad_x + crop_w)] = context_crop

    return crop
171+
172+
173+
def configure_crop(self, context_pad):
    """
    Configure amount of context for cropping.
    If context is included, make the special input mean for context padding.

    The mean stored for the first network input is copied and mapped back
    towards raw image space: axes are moved with transpose((1, 2, 0)), the
    configured channel swap (if any) is inverted, and the configured input
    scale (if any) is divided out. The result is kept as `self.crop_mean`.

    Take
    context_pad: amount of context for cropping. A falsy value (None or 0)
        disables context padding and no crop mean is prepared.
    """
    self.context_pad = context_pad
    if not self.context_pad:
        return

    in_ = self.inputs[0]
    # Both settings are optional; `.get` gives None when they were never set.
    input_scale = self.input_scale.get(in_)
    channel_order = self.channel_swap.get(in_)

    # Padding context crops needs the mean in unprocessed input space.
    # Copy first so the stored mean is never modified by the steps below.
    # NOTE(review): presumably the stored mean is K x H x W and this yields
    # H x W x K to match the images handed to `crop` -- confirm.
    crop_mean = self.mean[in_].copy().transpose((1, 2, 0))

    if channel_order is not None:
        # Invert the channel permutation: position i of the inverse is where
        # channel i ended up after the swap.
        order = list(channel_order)
        channel_order_inverse = [order.index(i)
                                 for i in range(crop_mean.shape[2])]
        crop_mean = crop_mean[:, :, channel_order_inverse]

    if input_scale is not None:
        crop_mean /= input_scale

    self.crop_mean = crop_mean

0 commit comments

Comments
 (0)