Skip to content


Separating the image nearest-neighbor convolution nn from semantic on…
Browse files Browse the repository at this point in the history
…e mm. Both results are multiplied now rather than the previous addition. Normalized cross-correlation sometimes affects patch selection diversity, and hence quality. This may also be causing the desaturation issues and lower quality than gram-based approaches too.
  • Loading branch information
alexjc committed Mar 25, 2016
1 parent 3bf9e72 commit b151858
Showing 1 changed file with 40 additions and 25 deletions.
65 changes: 40 additions & 25 deletions
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def setup_model(self):

# First network for the main image. These are convolution only, and stop at layer 4_2 (rest unused).
net['img'] = InputLayer((1, 3, None, None))
net['conv1_1'] = ConvLayer(net['img'], 64, 3, pad=1)
net['conv1_1'] = ConvLayer(net['img'], 64, 3, pad=1)
net['conv1_2'] = ConvLayer(net['conv1_1'], 64, 3, pad=1)
net['pool1'] = PoolLayer(net['conv1_2'], 2, mode='average_exc_pad')
net['conv2_1'] = ConvLayer(net['pool1'], 128, 3, pad=1)
Expand All @@ -124,17 +124,17 @@ def setup_model(self):

# Second network for the semantic layers. This dynamically downsamples the map and concatenates it.
net['map'] = InputLayer((1, 3, None, None))
net['map_2'] = PoolLayer(net['map'], 2, mode='average_exc_pad')
net['map_3'] = PoolLayer(net['map'], 4, mode='average_exc_pad')
net['map_4'] = PoolLayer(net['map'], 8, mode='average_exc_pad')

net['sem2_1'] = ConcatLayer([net['conv2_1'], net['map_2']])
net['sem3_1'] = ConcatLayer([net['conv3_1'], net['map_3']])
net['sem4_1'] = ConcatLayer([net['conv4_1'], net['map_4']])
net['map2_1'] = PoolLayer(net['map'], 2, mode='average_exc_pad')
net['map3_1'] = PoolLayer(net['map'], 4, mode='average_exc_pad')
net['map4_1'] = PoolLayer(net['map'], 8, mode='average_exc_pad')

# Third network for the nearest neighbors; it's a default size for now, updated once we know more.
net['nn3_1'] = ConvLayer(net['sem3_1'], 1, 3, b=None, pad=0)
net['nn4_1'] = ConvLayer(net['sem4_1'], 1, 3, b=None, pad=0)
net['nn2_1'] = ConvLayer(net['conv2_1'], 1, 3, b=None, pad=0)
net['mm2_1'] = ConvLayer(net['map2_1'], 1, 3, b=None, pad=0)
net['nn3_1'] = ConvLayer(net['conv3_1'], 1, 3, b=None, pad=0)
net['mm3_1'] = ConvLayer(net['map3_1'], 1, 3, b=None, pad=0)
net['nn4_1'] = ConvLayer(net['conv4_1'], 1, 3, b=None, pad=0)
net['mm4_1'] = ConvLayer(net['map4_1'], 1, 3, b=None, pad=0) = net

Expand Down Expand Up @@ -299,19 +299,29 @@ def prepare_style(self, scale=1.0):
flags = {}

# Compile a function to run on the GPU to extract patches for all layers at once.
required_layers = ['conv'+l for l in self.style_layers] + ['map'+l for l in self.style_layers]
extractor = theano.function(
[self.model.tensor_img, self.model.tensor_map],
self.extract_patches([self.model.tensor_outputs['sem'+l] for l in self.style_layers]),
self.extract_patches([self.model.tensor_outputs[l] for l in required_layers]),
result = extractor(self.style_image, self.style_map)

# For each layer, we now have a set of patches and their magnitude.
for layer, patches, norms in zip(self.style_layers, result[::2], result[1::2]):
l =['nn'+layer]
# For each layer, build it from set of patches and their magnitude.
def build(layer, prefix, name, patches, norms):
l =[prefix+layer]
l.N = theano.shared(norms)
l.num_filters = patches.shape[0]
print(' - Style layer sem{}: {} patches in {:,}kb.'.format(layer, patches.shape[0], patches.size//1000))
print(' - {} layer {}: {} patches in {:,}kb.'.format(name, layer, patches.shape[0], patches.size//1000))

result_nn = result[:len(self.style_layers)*2]
for layer, *data in zip(self.style_layers, result_nn[::2], result_nn[1::2]):
build(layer, 'nn', 'Style', *data)

result_mm = result[len(self.style_layers)*2:]
for layer, *data in zip(self.style_layers, result_mm[::2], result_mm[1::2]):
build(layer, 'mm', 'Semantic', *data)

def extract_patches(self, layers, size=3, stride=1):
"""This function builds a Theano expression that will get compiled and run on the GPU. It extracts 3x3 patches
Expand All @@ -322,10 +332,10 @@ def extract_patches(self, layers, size=3, stride=1):
# Use a Theano helper function to extract "neighbors" of specific size, seems a bit slower than doing
# it manually but much simpler!
patches = theano.tensor.nnet.neighbours.images2neibs(f, (size, size), (stride, stride), mode='valid')

# Make sure the patches are in the shape required to insert them into the model as another layer.
patches = patches.reshape((-1, patches.shape[0] // f.shape[1], size, size)).dimshuffle((1, 0, 2, 3))

# Calculate the magnitude that we'll use for normalization at runtime, then store...
norm = T.sqrt(T.sum(patches ** 2.0, axis=(1,2,3), keepdims=True))
results.extend([patches[:,:,::-1,::-1], norm])
Expand Down Expand Up @@ -380,7 +390,7 @@ def style_loss(self):
return style_loss

# Extract the patches from the current image, as well as their magnitude.
result = self.extract_patches([self.model.tensor_outputs['sem'+l] for l in self.style_layers])
result = self.extract_patches([self.model.tensor_outputs['conv'+l] for l in self.style_layers])

# Multiple style layers are optimized separately, usually sem3_1 and sem4_1.
for l, patches, norms in zip(self.style_layers, result[::2], result[1::2]):
Expand All @@ -390,13 +400,15 @@ def style_loss(self):
dist = self.model.tensor_outputs['nn'+l]
dist = dist.reshape((dist.shape[1], -1)) / norms.reshape((1,-1)) / layer.N.reshape((-1,1))

sem = self.model.tensor_outputs['mm'+l]
sem = sem.reshape((sem.shape[1], -1))

# Pick the best style patches for each patch in the current image, the result is an array of indices.
best = dist.argmax(axis=0)
best = (dist * sem).argmax(axis=0)

# Compute the mean squared error between the current patch and the best matching style patch.
# Ignore the last channels (from semantic map) so errors returned are indicative of image only.
channels = self.style_map_original.shape[2]
loss = T.mean((patches[:,:-channels] - layer.W[best,:-channels]) ** 2.0)
loss = T.mean((patches - layer.W[best]) ** 2.0)
style_loss.append(('style', l, args.style_weight * loss))

return style_loss
Expand All @@ -420,7 +432,7 @@ def evaluate(self, Xn):
# Adjust the representation to be compatible with the model before computing results.
current_img = Xn.reshape(self.content_image.shape).astype(np.float32) - self.model.pixel_mean
grads, *losses = self.compute_grad_and_losses(current_img, self.content_map)

if np.isnan(grads).any():
raise OverflowError("Optimization diverged; try using different device or parameters.")

Expand Down Expand Up @@ -467,13 +479,16 @@ def run(self):
.format(ansi.BLUE_B, i, int(shape[1]*scale), int(shape[0]*scale), scale, ansi.BLUE))

# Precompute all necessary data for the various layers, put patches in place into augmented network.
self.model.setup(layers=['sem'+l for l in self.style_layers] + ['conv'+l for l in self.content_layers])
self.model.setup(layers=['conv'+l for l in self.style_layers] +
['map'+l for l in self.style_layers] +
['conv'+l for l in self.content_layers])

# Now setup the model with the new data, ready for the optimization loop.
self.model.setup(layers=['sem'+l for l in self.style_layers] +
self.model.setup(layers=['conv'+l for l in self.style_layers] +
['nn'+l for l in self.style_layers] +
['mm'+l for l in self.style_layers] +
['conv'+l for l in self.content_layers])
Expand Down

0 comments on commit b151858

Please sign in to comment.