From 522b0b1fe3f828ff128f33077a1423ca2c999ae4 Mon Sep 17 00:00:00 2001
From: zsdonghao
Date: Fri, 18 Nov 2016 15:50:57 +0000
Subject: [PATCH] update gitignore

---
 .gitignore      |   4 +-
 train_txt2im.py | 688 ------------------------------------------------
 2 files changed, 3 insertions(+), 689 deletions(-)
 delete mode 100755 train_txt2im.py

diff --git a/.gitignore b/.gitignore
index 1abed302..a4467bb5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,12 @@
 checkpoint/*
 102flowers/*
 cub_birds_200_2011/*
-tensorlayer/*
+samples/main*
+backup/
 *.txt
 *.png
 *.pdf
 *.pyc
 .DS_Store
 ._*
+train_txt2im_wrong_image.py

diff --git a/train_txt2im.py b/train_txt2im.py
deleted file mode 100755
index ebd5f222..00000000
--- a/train_txt2im.py
+++ /dev/null
@@ -1,688 +0,0 @@
-#! /usr/bin/python
-# -*- coding: utf8 -*-
-
-
-import tensorflow as tf
-import tensorlayer as tl
-from tensorlayer.layers import *
-from tensorlayer.prepro import *
-import numpy as np
-import scipy
-from scipy.io import loadmat
-import time
-import os
-import re
-import nltk
-import random
-
-from utils import *
-
-
-""" Generative Adversarial Text to Image Synthesis
-
-Download the Oxford 102 flowers dataset and captions
-----------------------------------------------------
-Flowers  : http://www.robots.ox.ac.uk/%7Evgg/data/flowers/102/
-           paste it in 102flowers/102flowers/*jpg
-Captions : https://drive.google.com/file/d/0B0ywwgffWnLLcms2WWJQRFNSWXM/view
-           paste it in 102flowers/text_c10/class_*
-
-Code References
----------------
-- GAN-CLS by TensorFlow
-- https://github.com/paarthneekhara/text-to-image/blob/master/train.py
-- https://github.com/paarthneekhara/text-to-image/blob/master/model.py
-- https://github.com/paarthneekhara/text-to-image/blob/master/Utils/ops.py
-"""
-###======================== PREPARE DATA ====================================###
-## Directory of Oxford 102 flowers dataset
-if True:
-    """
-    images.shape = [8000, 64, 64, 3]
-    captions_ids = [80000, any]
-    """
-    cwd = os.getcwd()
-    img_dir = os.path.join(cwd, '102flowers/102flowers')
-    caption_dir = os.path.join(cwd, '102flowers/text_c10')
-    VOC_FIR = cwd + '/vocab.txt'
-
-    ## load captions
-    caption_sub_dir = load_folder_list( caption_dir )
-    captions_dict = {}
-    processed_capts = []
-    for sub_dir in caption_sub_dir: # get caption file list
-        with tl.ops.suppress_stdout():
-            files = tl.files.load_file_list(path=sub_dir, regx='^image_[0-9]+\.txt')
-        for i, f in enumerate(files):
-            file_dir = os.path.join(sub_dir, f)
-            key = int(re.findall('\d+', f)[0])
-            t = open(file_dir, 'r')
-            lines = []
-            for line in t:
-                lines.append(line.rstrip())  # remove \n
-                processed_capts.append(tl.nlp.process_sentence(line.rstrip(), start_word="<S>", end_word="</S>"))
-            assert len(lines) == 10, "Every flower image has 10 captions"
-            captions_dict[key] = lines
-    print(" * %d x %d captions found " % (len(captions_dict), len(lines)))
-
-    ## build vocab
-    _ = tl.nlp.create_vocab(processed_capts, word_counts_output_file=VOC_FIR, min_word_count=1)
-    vocab = tl.nlp.Vocabulary(VOC_FIR, start_word="<S>", end_word="</S>", unk_word="<UNK>")
-
-    ## store all caption ids in a list
-    # iterate in sorted key order so caption k belongs to image k // 10
-    for key, value in sorted(captions_dict.items()):
-        pass
-    captions_ids = []
-    for key, value in sorted(captions_dict.items()):
-        for v in value:
-            captions_ids.append( [vocab.word_to_id(word) for word in nltk.tokenize.word_tokenize(v)] )
-            # print(v)              # prominent purple stigma,petals are white inc olor
-            # print(captions_ids)   # [[152, 19, 33, 15, 3, 8, 14, 719, 723]]
-            # exit()
-    captions_ids = np.asarray(captions_ids)
-    print(" * tokenized %d captions" % len(captions_ids))
-
-    ## check
-    img_capt = captions_dict[1][1]
-    print("img_capt: %s" % img_capt)
-    print("nltk.tokenize.word_tokenize(img_capt): %s" % nltk.tokenize.word_tokenize(img_capt))
-    img_capt_ids = [vocab.word_to_id(word) for word in nltk.tokenize.word_tokenize(img_capt)]  # or: img_capt.split(' ')
-    print("img_capt_ids: %s" % img_capt_ids)
-    print("id_to_word: %s" % [vocab.id_to_word(id) for id in img_capt_ids])
-
-    ## load images
-    with tl.ops.suppress_stdout():  # get image file list
-        imgs_title_list = sorted(tl.files.load_file_list(path=img_dir, regx='^image_[0-9]+\.jpg'))
-    print(" * %d images found, start loading and resizing ..." % len(imgs_title_list))
-    s = time.time()
-    images = []
-    for name in imgs_title_list:
-        img = scipy.misc.imread( os.path.join(img_dir, name) )
-        img = tl.prepro.imresize(img, size=[64, 64])    # (64, 64, 3)
-        img = img.astype(np.float32)
-        images.append(img)
-    images = np.asarray(images)
-    print(" * loading and resizing took %ss" % (time.time()-s))
-
-    n_images = len(captions_dict)
-    n_captions = len(captions_ids)
-    n_captions_per_image = len(lines)  # 10
-
-    print("n_captions: %d n_images: %d n_captions_per_image: %d" % (n_captions, n_images, n_captions_per_image))
-
-    captions_ids_train, captions_ids_test = captions_ids[: 8000*n_captions_per_image], captions_ids[8000*n_captions_per_image :]
-    images_train, images_test = images[:8000], images[8000:]
-    n_images_train = len(images_train)
-    n_images_test = len(images_test)
-    n_captions_train = len(captions_ids_train)
-    n_captions_test = len(captions_ids_test)
-    print("n_images_train:%d n_captions_train:%d" % (n_images_train, n_captions_train))
-    print("n_images_test:%d n_captions_test:%d" % (n_images_test, n_captions_test))
-
-    ## check test images
-    # idexs = get_random_int(min=0, max=n_captions_test-1, number=64)
-    # temp_test_capt = captions_ids_test[idexs]
-    # for idx, ids in enumerate(temp_test_capt):
-    #     print("%d %s" % (idx, [vocab.id_to_word(id) for id in ids]))
-    # temp_test_img = images_train[np.floor(np.asarray(idexs).astype('float')/n_captions_per_image).astype('int')]
-    # save_images(temp_test_img, [8, 8], 'temp_test_img.png')
-    # exit()
-
-    # ## check the first example
-    # tl.visualize.frame(I=images[0], second=5, saveable=True, name='temp', cmap=None)
-    # for cap in captions_dict[1]:
-    #     print(cap)
-    # print(captions_ids[0:10])
-    # for ids in captions_ids[0:10]:
-    #     print([vocab.id_to_word(id) for id in ids])
-    # print_dict(captions_dict)
-
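Given the contiguous layout above (ten captions per image, in sorted image order), a caption index maps back to its image by integer division; this is the invariant the np.floor(.../n_captions_per_image) indexing further down depends on. A minimal sketch, where caption_to_image_index is a hypothetical helper and not part of the script:

    # Caption k belongs to image k // n_captions_per_image, which only holds
    # because captions_ids appends ten captions per image, in image order.
    def caption_to_image_index(k, n_captions_per_image=10):
        return k // n_captions_per_image

    assert [caption_to_image_index(k) for k in (0, 9, 10, 19)] == [0, 0, 1, 1]
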
-###======================== DEFINE MODEL ====================================###
-batch_size = 64
-vocab_size = 8000
-word_embedding_size = 512   # paper used 1024 for the char-CNN-RNN
-keep_prob = 0.5
-z_dim = 100        # noise dimension
-t_dim = 128 * 2    # text feature dimension; paper said 128
-image_size = 64    # 64 x 64
-c_dim = 3          # for RGB
-gf_dim = 64        # number of conv filters in the first layer of G
-df_dim = 64        # number of conv filters in the first layer of D
-# gfc_dim = 1024   # dimension of G units for the fully connected layer
-# caption_vector_length = 2400   # caption vector length 2400; Hao: I use a word-based dynamic_rnn
-
-# ## generate a random batch
-# idexs = get_random_int(0, n_captions, batch_size)
-# idexs = [i for i in range(0,100)]
-# print(idexs)
-# b_seqs = captions_ids[idexs]
-# b_images = images[np.floor(np.asarray(idexs).astype('float')/n_captions_per_image).astype('int')]
-# print("before padding %s" % b_seqs)
-# b_seqs = tl.prepro.pad_sequences(b_seqs, padding='post')
-# print("after padding %s" % b_seqs)
-# # print(input_images.shape)    # (64, 64, 64, 3)
-# for ids in b_seqs:
-#     print([vocab.id_to_word(id) for id in ids])
-# print(np.max(b_images), np.min(b_images), b_images.shape)
-# tl.visualize.images2d(b_images, second=5, saveable=True, name='temp2')
-# exit()
-
-# save_images(images[:64], [8, 8], 'temp.png')
-# flip_img = threading_data(images[:64], prepro_img, mode='debug')
-# save_images(flip_img, [8, 8], 'temp2.png')
-# exit()
-
-from tensorlayer.prepro import *  # (already imported above)
-def prepro_img(x, mode=None):
-    if mode == 'train':        # [0, 255] --> (-1, 1), random flip left and right
-        x = x / (255. / 2.)
-        x = x - 1.
-        # x = flip_axis(x, axis=1, is_random=True)
-    elif mode == 'rescale':    # (-1, 1) --> (0, 1)
-        x = (x + 1.) / 2.
-    elif mode == 'debug':
-        x = flip_axis(x, axis=1, is_random=True)
-    else:
-        raise Exception("Unsupported mode: %s" % mode)
-    return x
-
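A quick, self-contained check of the value ranges prepro_img is meant to produce (NumPy only, independent of the script):

    import numpy as np

    x = np.array([0., 127.5, 255.])
    # mode='train': [0, 255] --> [-1, 1]
    assert np.allclose(x / (255. / 2.) - 1., [-1., 0., 1.])
    # mode='rescale': [-1, 1] --> [0, 1]
    assert np.allclose((np.array([-1., 0., 1.]) + 1.) / 2., [0., 0.5, 1.])
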
-def rnn_embed(input_seqs, is_train, reuse):
-    """MY IMPLEMENTATION: the same weights are used for the word embedding and
-    RNN in both the discriminator and the generator.
-    """
-    w_init = tf.random_normal_initializer(stddev=0.02)
-    # w_init = tf.constant_initializer(value=0.0)
-    with tf.variable_scope("rnn", reuse=reuse):
-        tl.layers.set_name_reuse(reuse)
-        network = EmbeddingInputlayer(
-                     inputs = input_seqs,
-                     vocabulary_size = vocab_size,
-                     embedding_size = word_embedding_size,
-                     E_init = w_init,
-                     name = 'wordembed')
-        network = DynamicRNNLayer(network,
-                     cell_fn = tf.nn.rnn_cell.LSTMCell,
-                     n_hidden = word_embedding_size,
-                     dropout = (keep_prob if is_train else None),
-                     initializer = w_init,
-                     sequence_length = tl.layers.retrieve_seq_length_op2(input_seqs),
-                     return_last = True,
-                     name = 'dynamic')
-        # paper 4.1: reduce the dim of the description embedding in a (separate)
-        # FC layer followed by rectification
-        # network = DenseLayer(network, n_units=t_dim,
-        #         act=lambda x: tl.act.lrelu(x, 0.2), W_init=w_init, name='reduce_txt/dense')
-        return network
-
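How rnn_embed is shared further down: the first call builds the "rnn" variable scope, and every later call passes reuse=True, so D and G read the same embedding and LSTM weights. A comments-only sketch (it depends on placeholders defined below):

    # net_rnn  = rnn_embed(t_real_caption, is_train=True,  reuse=False)  # creates the variables
    # net_rnn2 = rnn_embed(t_real_caption, is_train=False, reuse=True)   # reuses the same weights
    # tf.variable_scope("rnn", reuse=True) is what makes the second call share parameters.
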
normal DCGAN") - - net_h0 = DenseLayer(net_in, gf_dim*8*s16*s16, act=tf.identity, - W_init=w_init, name='g_h0/dense') # (64, 8192) - net_h0 = ReshapeLayer(net_h0, [-1, s16, s16, gf_dim*8], name='g_h0/reshape') - net_h0 = BatchNormLayer(net_h0, act=tf.nn.relu, is_train=is_train, - gamma_init=gamma_init, name='g_h0/batch_norm') - - net_h1 = DeConv2d(net_h0, gf_dim*4, (5, 5), out_size=(s8, s8), strides=(2, 2), - padding='SAME', batch_size=batch_size, act=None, W_init=w_init, name='g_h1/decon2d') - net_h1 = BatchNormLayer(net_h1, act=tf.nn.relu, is_train=is_train, - gamma_init=gamma_init, name='g_h1/batch_norm') - - net_h2 = DeConv2d(net_h1, gf_dim*2, (5, 5), out_size=(s4, s4), strides=(2, 2), - padding='SAME', batch_size=batch_size, act=None, W_init=w_init, name='g_h2/decon2d') - net_h2 = BatchNormLayer(net_h2, act=tf.nn.relu, is_train=is_train, - gamma_init=gamma_init, name='g_h2/batch_norm') - - net_h3 = DeConv2d(net_h2, gf_dim, (5, 5), out_size=(s2, s2), strides=(2, 2), - padding='SAME', batch_size=batch_size, act=None, W_init=w_init, name='g_h3/decon2d') - net_h3 = BatchNormLayer(net_h3, act=tf.nn.relu, is_train=is_train, - gamma_init=gamma_init, name='g_h3/batch_norm') - - net_h4 = DeConv2d(net_h3, c_dim, (5, 5), out_size=(s, s), strides=(2, 2), - padding='SAME', batch_size=batch_size, act=None, W_init=w_init, name='g_h4/decon2d') - logits = net_h4.outputs - # net_h4.outputs = tf.nn.sigmoid(net_h4.outputs) # DCGAN uses tanh - net_h4.outputs = tf.nn.tanh(net_h4.outputs) - return net_h4, logits - -def discriminator_txt2img(input_images, net_rnn_embed=None, is_train=True, reuse=False): - # IMPLEMENTATION based on : https://github.com/paarthneekhara/text-to-image/blob/master/model.py - w_init = tf.random_normal_initializer(stddev=0.02) - gamma_init=tf.random_normal_initializer(1., 0.02) - - with tf.variable_scope("discriminator", reuse=reuse): - tl.layers.set_name_reuse(reuse) - - net_in = InputLayer(input_images, name='d_input/images') - net_h0 = Conv2d(net_in, df_dim, (5, 5), (2, 2), act=lambda x: tl.act.lrelu(x, 0.2), - padding='SAME', W_init=w_init, name='d_h0/conv2d') # (64, 32, 32, 64) - - net_h1 = Conv2d(net_h0, df_dim*2, (5, 5), (2, 2), act=None, - padding='SAME', W_init=w_init, name='d_h1/conv2d') - net_h1 = BatchNormLayer(net_h1, act=lambda x: tl.act.lrelu(x, 0.2), - is_train=is_train, gamma_init=gamma_init, name='d_h1/batchnorm') # (64, 16, 16, 128) - - net_h2 = Conv2d(net_h1, df_dim*4, (5, 5), (2, 2), act=None, - padding='SAME', W_init=w_init, name='d_h2/conv2d') - net_h2 = BatchNormLayer(net_h2, act=lambda x: tl.act.lrelu(x, 0.2), - is_train=is_train, gamma_init=gamma_init, name='d_h2/batchnorm') # (64, 8, 8, 256) - - net_h3 = Conv2d(net_h2, df_dim*8, (5, 5), (2, 2), act=None, - padding='SAME', W_init=w_init, name='d_h3/conv2d') - net_h3 = BatchNormLayer(net_h3, act=lambda x: tl.act.lrelu(x, 0.2), - is_train=is_train, gamma_init=gamma_init, name='d_h3/batchnorm') # (64, 4, 4, 512) paper 4.1: when the spatial dim of the D is 4x4, we replicate the description embedding spatially and perform a depth concatenation - - if net_rnn_embed is not None: - # paper : reduce the dim of description embedding in (seperate) FC layer followed by rectification - net_reduced_text = DenseLayer(net_rnn_embed, n_units=t_dim, - act=lambda x: tl.act.lrelu(x, 0.2), - W_init=w_init, name='d_reduce_txt/dense') - # net_reduced_text = net_rnn_embed # if reduce_txt in rnn_embed - net_reduced_text.outputs = tf.expand_dims(net_reduced_text.outputs, 1) - net_reduced_text.outputs = 
-def discriminator_txt2img(input_images, net_rnn_embed=None, is_train=True, reuse=False):
-    # IMPLEMENTATION based on: https://github.com/paarthneekhara/text-to-image/blob/master/model.py
-    w_init = tf.random_normal_initializer(stddev=0.02)
-    gamma_init = tf.random_normal_initializer(1., 0.02)
-
-    with tf.variable_scope("discriminator", reuse=reuse):
-        tl.layers.set_name_reuse(reuse)
-
-        net_in = InputLayer(input_images, name='d_input/images')
-        net_h0 = Conv2d(net_in, df_dim, (5, 5), (2, 2), act=lambda x: tl.act.lrelu(x, 0.2),
-                padding='SAME', W_init=w_init, name='d_h0/conv2d')    # (64, 32, 32, 64)
-
-        net_h1 = Conv2d(net_h0, df_dim*2, (5, 5), (2, 2), act=None,
-                padding='SAME', W_init=w_init, name='d_h1/conv2d')
-        net_h1 = BatchNormLayer(net_h1, act=lambda x: tl.act.lrelu(x, 0.2),
-                is_train=is_train, gamma_init=gamma_init, name='d_h1/batchnorm')    # (64, 16, 16, 128)
-
-        net_h2 = Conv2d(net_h1, df_dim*4, (5, 5), (2, 2), act=None,
-                padding='SAME', W_init=w_init, name='d_h2/conv2d')
-        net_h2 = BatchNormLayer(net_h2, act=lambda x: tl.act.lrelu(x, 0.2),
-                is_train=is_train, gamma_init=gamma_init, name='d_h2/batchnorm')    # (64, 8, 8, 256)
-
-        net_h3 = Conv2d(net_h2, df_dim*8, (5, 5), (2, 2), act=None,
-                padding='SAME', W_init=w_init, name='d_h3/conv2d')
-        net_h3 = BatchNormLayer(net_h3, act=lambda x: tl.act.lrelu(x, 0.2),
-                is_train=is_train, gamma_init=gamma_init, name='d_h3/batchnorm')
-        # (64, 4, 4, 512) paper 4.1: when the spatial dim of D is 4x4, we replicate
-        # the description embedding spatially and perform a depth concatenation
-
-        if net_rnn_embed is not None:
-            # paper: reduce the dim of the description embedding in a (separate) FC layer followed by rectification
-            net_reduced_text = DenseLayer(net_rnn_embed, n_units=t_dim,
-                    act=lambda x: tl.act.lrelu(x, 0.2),
-                    W_init=w_init, name='d_reduce_txt/dense')
-            # net_reduced_text = net_rnn_embed    # if reduce_txt is done in rnn_embed
-            net_reduced_text.outputs = tf.expand_dims(net_reduced_text.outputs, 1)
-            net_reduced_text.outputs = tf.expand_dims(net_reduced_text.outputs, 2)
-            net_reduced_text.outputs = tf.tile(net_reduced_text.outputs, [1, 4, 4, 1], name='d_tiled_embeddings')
-
-            net_h3_concat = ConcatLayer([net_h3, net_reduced_text], concat_dim=3, name='d_h3_concat')    # (64, 4, 4, 768) with t_dim=256
-            # net_h3_concat = net_h3    # no text info
-            net_h3 = Conv2d(net_h3_concat, df_dim*8, (1, 1), (1, 1), padding='SAME', W_init=w_init, name='d_h3/conv2d_2')
-            # paper 4.1: perform a 1x1 conv followed by rectification and a 4x4 conv to compute the final score from D
-            net_h3 = BatchNormLayer(net_h3, act=lambda x: tl.act.lrelu(x, 0.2),
-                    is_train=is_train, gamma_init=gamma_init, name='d_h3/batch_norm_2')    # (64, 4, 4, 512)
-        else:
-            print("No text info will be used, i.e. a normal DCGAN")
-
-        net_h4 = FlattenLayer(net_h3, name='d_h4/flatten')    # (64, 8192)
-        net_h4 = DenseLayer(net_h4, n_units=1, act=tf.identity,
-                W_init=w_init, name='d_h4/dense')
-        logits = net_h4.outputs
-        net_h4.outputs = tf.nn.sigmoid(net_h4.outputs)    # (64, 1)
-        return net_h4, logits
-
-
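A NumPy sketch of the text-conditioning step above: the reduced embedding (t_dim = 256 here) is broadcast to every 4x4 spatial position and depth-concatenated with the 512-channel feature map, so the concatenated tensor has 768 channels:

    import numpy as np

    emb  = np.zeros((64, 256))          # reduced text embedding
    feat = np.zeros((64, 4, 4, 512))    # d_h3 feature map
    tiled = np.tile(emb[:, None, None, :], (1, 4, 4, 1))
    assert np.concatenate([feat, tiled], axis=3).shape == (64, 4, 4, 768)
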
-# def generator_simplified_api(inputs, net_rnn_embed=None, is_train=True, reuse=False):
-#     image_size = 64
-#     s2, s4, s8, s16 = int(image_size/2), int(image_size/4), int(image_size/8), int(image_size/16)
-#     gf_dim = 64        # dimension of gen filters in first conv layer [64]
-#     c_dim = 3          # FLAGS.c_dim : n_color 3
-#     batch_size = 64    # FLAGS.batch_size : 64
-#
-#     w_init = tf.random_normal_initializer(stddev=0.02)
-#     gamma_init = tf.random_normal_initializer(1., 0.02)
-#
-#     with tf.variable_scope("generator", reuse=reuse):
-#         tl.layers.set_name_reuse(reuse)
-#
-#         net_in = InputLayer(inputs, name='g/in')
-#         net_h0 = DenseLayer(net_in, n_units=gf_dim*8*s16*s16, W_init=w_init,
-#                 act=tf.identity, name='g/h0/lin')
-#         net_h0 = ReshapeLayer(net_h0, shape=[-1, s16, s16, gf_dim*8], name='g/h0/reshape')
-#         net_h0 = BatchNormLayer(net_h0, act=tf.nn.relu, is_train=is_train,
-#                 gamma_init=gamma_init, name='g/h0/batch_norm')
-#
-#         net_h1 = DeConv2d(net_h0, gf_dim*4, (5, 5), out_size=(s8, s8), strides=(2, 2),
-#                 padding='SAME', batch_size=batch_size, act=None, W_init=w_init, name='g/h1/decon2d')
-#         net_h1 = BatchNormLayer(net_h1, act=tf.nn.relu, is_train=is_train,
-#                 gamma_init=gamma_init, name='g/h1/batch_norm')
-#
-#         net_h2 = DeConv2d(net_h1, gf_dim*2, (5, 5), out_size=(s4, s4), strides=(2, 2),
-#                 padding='SAME', batch_size=batch_size, act=None, W_init=w_init, name='g/h2/decon2d')
-#         net_h2 = BatchNormLayer(net_h2, act=tf.nn.relu, is_train=is_train,
-#                 gamma_init=gamma_init, name='g/h2/batch_norm')
-#
-#         net_h3 = DeConv2d(net_h2, gf_dim, (5, 5), out_size=(s2, s2), strides=(2, 2),
-#                 padding='SAME', batch_size=batch_size, act=None, W_init=w_init, name='g/h3/decon2d')
-#         net_h3 = BatchNormLayer(net_h3, act=tf.nn.relu, is_train=is_train,
-#                 gamma_init=gamma_init, name='g/h3/batch_norm')
-#
-#         net_h4 = DeConv2d(net_h3, c_dim, (5, 5), out_size=(image_size, image_size), strides=(2, 2),
-#                 padding='SAME', batch_size=batch_size, act=None, W_init=w_init, name='g/h4/decon2d')
-#         logits = net_h4.outputs
-#         net_h4.outputs = tf.nn.tanh(net_h4.outputs)
-#         return net_h4, logits
-
-# def discriminator_simplified_api(inputs, net_rnn_embed=None, is_train=True, reuse=False):
-#     df_dim = 64        # dimension of discrim filters in first conv layer [64]
-#     c_dim = 3          # FLAGS.c_dim : n_color 3
-#     batch_size = 64    # FLAGS.batch_size : 64
-#
-#     w_init = tf.random_normal_initializer(stddev=0.02)
-#     gamma_init = tf.random_normal_initializer(1., 0.02)
-#
-#     with tf.variable_scope("discriminator", reuse=reuse):
-#         tl.layers.set_name_reuse(reuse)
-#
-#         net_in = InputLayer(inputs, name='d/in')
-#         net_h0 = Conv2d(net_in, df_dim, (5, 5), (2, 2), act=lambda x: tl.act.lrelu(x, 0.2),
-#                 padding='SAME', W_init=w_init, name='d/h0/conv2d')
-#
-#         net_h1 = Conv2d(net_h0, df_dim*2, (5, 5), (2, 2), act=None,
-#                 padding='SAME', W_init=w_init, name='d/h1/conv2d')
-#         net_h1 = BatchNormLayer(net_h1, act=lambda x: tl.act.lrelu(x, 0.2),
-#                 is_train=is_train, gamma_init=gamma_init, name='d/h1/batch_norm')
-#
-#         net_h2 = Conv2d(net_h1, df_dim*4, (5, 5), (2, 2), act=None,
-#                 padding='SAME', W_init=w_init, name='d/h2/conv2d')
-#         net_h2 = BatchNormLayer(net_h2, act=lambda x: tl.act.lrelu(x, 0.2),
-#                 is_train=is_train, gamma_init=gamma_init, name='d/h2/batch_norm')
-#
-#         net_h3 = Conv2d(net_h2, df_dim*8, (5, 5), (2, 2), act=None,
-#                 padding='SAME', W_init=w_init, name='d/h3/conv2d')
-#         net_h3 = BatchNormLayer(net_h3, act=lambda x: tl.act.lrelu(x, 0.2),
-#                 is_train=is_train, gamma_init=gamma_init, name='d/h3/batch_norm')
-#
-#         net_h4 = FlattenLayer(net_h3, name='d/h4/flatten')
-#         net_h4 = DenseLayer(net_h4, n_units=1, act=tf.identity,
-#                 W_init=w_init, name='d/h4/lin_sigmoid')
-#         logits = net_h4.outputs
-#         net_h4.outputs = tf.nn.sigmoid(net_h4.outputs)
-#         return net_h4, logits
-
-# with tf.device("/gpu:0"):
-##
-#  https://github.com/paarthneekhara/text-to-image/blob/master/train.py
-#  https://github.com/paarthneekhara/text-to-image/blob/master/model.py
-#  https://github.com/paarthneekhara/text-to-image/blob/master/Utils/ops.py
-## build_model
-
-t_real_image = tf.placeholder('float32', [batch_size, image_size, image_size, 3], name='real_image')
-t_wrong_image = tf.placeholder('float32', [batch_size, image_size, image_size, 3], name='wrong_image')    # remove if DCGAN only
-t_real_caption = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name='real_caption_input')      # remove if DCGAN only
-t_z = tf.placeholder(tf.float32, [batch_size, z_dim], name='z_noise')
-
-## training inference for DCGAN
-# from dcgan_model import *
-# net_fake_image, _ = generator_dcgan(t_z, is_train=True, reuse=False)
-# _, disc_fake_image_logits = discriminator_dcgan(net_fake_image.outputs, is_train=True, reuse=False)
-# _, disc_real_image_logits = discriminator_dcgan(t_real_image, is_train=True, reuse=True)
-## training inference for txt2img
-# net_rnn = rnn_embed(t_real_caption, is_train=True, reuse=False)    # remove if DCGAN only
-net_rnn = rnn_embed(t_real_caption, is_train=False, reuse=False)     # if pre-trained
-net_fake_image, _ = generator_txt2img(t_z,
-                net_rnn,    # remove if DCGAN only
-                is_train=True, reuse=False)
-net_d, disc_fake_image_logits = discriminator_txt2img(
-                net_fake_image.outputs,
-                net_rnn,    # remove if DCGAN only
-                is_train=True, reuse=False)
-_, disc_real_image_logits = discriminator_txt2img(
-                t_real_image,
-                net_rnn,    # remove if DCGAN only
-                is_train=True, reuse=True)
-_, disc_wrong_image_logits = discriminator_txt2img(    # CLS
-                t_wrong_image,    # remove if DCGAN only
-                net_rnn,          # remove if DCGAN only
-                is_train=True, reuse=True)    # remove if DCGAN only
-
-## testing inference for DCGAN
-# net_g, _ = generator_dcgan(t_z, is_train=False, reuse=True)
-## testing inference for txt2img
-net_g, _ = generator_txt2img(t_z,
-                rnn_embed(t_real_caption, is_train=False, reuse=True),    # remove if DCGAN only
-                is_train=False, reuse=True)
-
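The three discriminator branches above follow the GAN-CLS objective of Reed et al. (2016): with s_r = D(real image, matching text), s_w = D(real image, mismatching text) and s_f = D(G(z, h), matching text), the paper's discriminator loss is

    L_D = log s_r + ( log(1 - s_w) + log(1 - s_f) ) / 2

Note that the code below simply sums the two negative terms (d_loss2 + d_loss3) instead of averaging them with the paper's 1/2 factor.
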
-## loss for DCGAN
-# d_loss_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_real_image_logits, tf.ones_like(disc_real_image_logits)))   # real == 1
-# d_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_fake_image_logits, tf.zeros_like(disc_fake_image_logits)))  # fake == 0
-# d_loss = d_loss_real + d_loss_fake
-# g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_fake_image_logits, tf.ones_like(disc_fake_image_logits)))
-
-## loss for txt2img
-d_loss1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_real_image_logits, tf.ones_like(disc_real_image_logits)))      # real image + matching text == 1
-d_loss2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_wrong_image_logits, tf.zeros_like(disc_wrong_image_logits)))   # mismatched pair == 0 # remove if DCGAN only
-d_loss3 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_fake_image_logits, tf.zeros_like(disc_fake_image_logits)))     # fake image == 0
-
-d_loss = d_loss1 + d_loss2 + d_loss3
-
-g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_fake_image_logits, tf.ones_like(disc_fake_image_logits)))    # real == 1, fake == 0
-
-net_fake_image.print_params(False)
-net_fake_image.print_layers()
-# exit()
-
-####======================== DEFINE TRAIN OPTS ==========================###
-## cost: real == 1, fake == 0
-lr = 0.0002
-beta1 = 0.5
-n_g_batch = 2    # update G n times per batch
-e_vars = tl.layers.get_variables_with_name('rnn', True, True)    # remove if DCGAN only
-d_vars = tl.layers.get_variables_with_name('discriminator', True, True)
-g_vars = tl.layers.get_variables_with_name('generator', True, True)
-
-## results
-# update rnn in both D and G: 700 epochs lr=2e-4, d_loss: 0.09966065, g_loss: 3.55960941
-#     Hao: don't update the RNN and G together, or G cheats D
-# update rnn only in G:
-# update rnn only in D:
-#     1000 epochs lr=2e-4 beta1=0.5, d_loss: 0.00363965, g_loss: 10.59220123
-#     (works at the beginning, but g_loss finally increases; RNN overfitting?)
-#     500 epochs lr=1e-4 beta1=0.9 rnn 512, d_loss: 0.03315957, g_loss: 5.69525194
-#     (correct at 100 epochs, but sometimes turns to an incorrect color)
-#     rnn size 200, dp 0.9: doesn't work, all outputs look the same
-# pre-train the RNN with D for 100 epochs, then stop updating the RNN:
-#     200 epochs lr=2e-4 beta1=0.5 rnn 512 dp 0.5, d_loss: 0.40549859, g_loss: 6.31078005, g_loss still increases
-#     increase t_dim from 128 to 256, 250 epochs, d_loss: 0.01651493, g_loss: 11.65070343, g_loss still increases
-#     update G 3 times per batch, 300 epochs, d_loss: 0.35285434, g_loss: 4.30621910, g_loss still increases
-#     change lr to 2e-5: images under a caption look the same, g_loss still increases
-#     l2=1e-4, update G 5 times: images under a caption look the same
-#     set net_rnn_embed(is_train=False), no random flip, 100 epochs: good;
-#     200 epochs: images under a caption look the same, g_loss still increases
-
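tl.layers.get_variables_with_name(name, train_only, printable) selects trainable variables whose names contain the given substring, which is how the rnn/discriminator/generator parameter groups are separated above. Roughly equivalent, in plain TF 1.x (a hedged sketch, not the library's exact implementation):

    # e_vars_approx = [v for v in tf.trainable_variables() if 'rnn' in v.name]
    # d_vars_approx = [v for v in tf.trainable_variables() if 'discriminator' in v.name]
    # g_vars_approx = [v for v in tf.trainable_variables() if 'generator' in v.name]
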
-# grads = tf.gradients(d_loss, d_vars + e_vars)
-# grads, _ = tf.clip_by_global_norm(tf.gradients(d_loss, d_vars + e_vars), 30)
-# optimizer = tf.train.AdamOptimizer(1e-4, beta1=beta1)
-# d_optim = optimizer.apply_gradients(zip(grads, d_vars + e_vars))
-#
-# grads = tf.gradients(g_loss, g_vars)
-# grads, _ = tf.clip_by_global_norm(tf.gradients(g_loss, g_vars), 30)
-# optimizer = tf.train.AdamOptimizer(1e-4, beta1=beta1)
-# g_optim = optimizer.apply_gradients(zip(grads, g_vars))
-
-d_optim = tf.train.AdamOptimizer(lr, beta1=beta1).minimize(d_loss, var_list=d_vars)    # + e_vars
-g_optim = tf.train.AdamOptimizer(lr, beta1=beta1).minimize(g_loss, var_list=g_vars)
-
-###============================ TRAINING ====================================###
-sess = tf.InteractiveSession()
-sess.run(tf.initialize_all_variables())
-
-save_dir = "checkpoint"
-if not os.path.exists(save_dir):
-    print("[!] Folder %s does not exist, creating it." % save_dir)
-    os.mkdir(save_dir)
-# load the latest checkpoints
-net_e_name = os.path.join(save_dir, 'net_e.npz')
-net_g_name = os.path.join(save_dir, 'net_g.npz')
-net_d_name = os.path.join(save_dir, 'net_d.npz')
-if not os.path.exists(net_e_name):
-    print("[!] Loading RNN checkpoint failed!")
-else:
-    net_e_loaded_params = tl.files.load_npz(name=net_e_name)
-    tl.files.assign_params(sess, net_e_loaded_params, net_rnn)
-    print("[*] Loading RNN checkpoint SUCCESS!")
-
-# if not (os.path.exists(net_g_name) and os.path.exists(net_d_name)):
-#     print("[!] Loading G and D checkpoints failed!")
-# else:
-#     net_g_loaded_params = tl.files.load_npz(name=net_g_name)
-#     net_d_loaded_params = tl.files.load_npz(name=net_d_name)
-#     tl.files.assign_params(sess, net_g_loaded_params, net_g)
-#     tl.files.assign_params(sess, net_d_loaded_params, net_d)
-#     print("[*] Loading G and D checkpoints SUCCESS!")
-
-# sess = tf.Session()
-# tl.ops.set_gpu_fraction(sess=sess, gpu_fraction=0.998)
-# sess.run(tf.initialize_all_variables())
-
-## seed for generation: z and sentence ids
-sample_size = batch_size
-sample_seed = np.random.uniform(low=-1, high=1, size=(sample_size, z_dim)).astype(np.float32)    # paper said [0, 1]
-# sample_sentence = ["this white and yellow flower have thin white petals and a round yellow stamen", \
-#                    "the flower has petals that are bright pinkish purple with white stigma"] * 32
-# sample_sentence = ["these flowers have petals that start off white in color and end in a dark purple towards the tips"] * 32 + \
-#                   ["bright droopy yellow petals with burgundy streaks and a yellow stigma"] * 32
-sample_sentence = ["these white flowers have petals that start off white in color and end in a white towards the tips"] * 32 + \
-                  ["this yellow petals with burgundy streaks and a yellow stigma"] * 32
-# sample_sentence = captions_ids_test[0:sample_size]
-for i, sentence in enumerate(sample_sentence):
-    sample_sentence[i] = [vocab.word_to_id(word) for word in nltk.tokenize.word_tokenize(sentence)]
-    # sample_sentence[i] = [vocab.word_to_id(word) for word in sentence]
-    print("seed: %s" % sentence)
-    # print(sample_sentence[i])
-sample_sentence = tl.prepro.pad_sequences(sample_sentence, padding='post')
-
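tl.prepro.pad_sequences right-pads each id list (with zeros by default) to the longest sequence in the batch when padding='post', which is what both the seed sentences above and the training batches below rely on. For example:

    from tensorlayer.prepro import pad_sequences
    # [[1, 2, 3], [4, 5]] --> [[1, 2, 3], [4, 5, 0]]
    print(pad_sequences([[1, 2, 3], [4, 5]], padding='post'))
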
-n_epoch = 2000    # 600 when using a pre-trained rnn
-print_freq = 1
-n_batch_epoch = int(n_images / batch_size)
-for epoch in range(n_epoch):
-    start_time = time.time()
-    for step in range(n_batch_epoch):
-        step_time = time.time()
-        ## get matched text
-        idexs = get_random_int(min=0, max=n_captions_train-1, number=batch_size)
-        b_real_caption = captions_ids_train[idexs]    # remove if DCGAN only
-        b_real_caption = tl.prepro.pad_sequences(b_real_caption, padding='post')    # matched text (64, any) # remove if DCGAN only
-        ## get real images
-        b_real_images = images_train[np.floor(np.asarray(idexs).astype('float')/n_captions_per_image).astype('int')]    # (64, 64, 64, 3)
-        ## get wrong captions
-        # idexs = get_random_int(min=0, max=n_captions-1, number=batch_size)
-        # b_wrong_caption = captions_ids[idexs]
-        # b_wrong_caption = tl.prepro.pad_sequences(b_wrong_caption, padding='post')    # mismatched text
-        ## get wrong images
-        idexs2 = get_random_int(min=0, max=n_images_train-1, number=batch_size)    # remove if DCGAN only
-        b_wrong_images = images_train[idexs2]    # remove if DCGAN only
-        ## get noise
-        b_z = np.random.uniform(low=-1, high=1, size=[batch_size, z_dim]).astype(np.float32)    # paper said [0, 1], but [-1, 1] is better
-        ## check data
-        # print(np.min(b_real_images), np.max(b_real_images), b_real_images.shape)    # [0, 255] (64, 64, 64, 3)
-        # for i, seq in enumerate(b_real_caption):
-        #     # print(seq)
-        #     print(i, " ".join([vocab.id_to_word(id) for id in seq]))
-        # save_images(b_real_images, [8, 8], 'real_image.png')
-        # exit()
-
-        ## update D
-        b_real_images = threading_data(b_real_images, prepro_img, mode='train')    # [0, 255] --> [-1, 1]
-        b_wrong_images = threading_data(b_wrong_images, prepro_img, mode='train')
-        errD, _ = sess.run([d_loss, d_optim], feed_dict={
-                        t_real_image : b_real_images,
-                        t_wrong_image : b_wrong_images,    # remove if DCGAN only
-                        t_real_caption : b_real_caption,   # remove if DCGAN only
-                        t_z : b_z})
-        ## update G (n_g_batch times per D update)
-        for _ in range(n_g_batch):
-            errG, _ = sess.run([g_loss, g_optim], feed_dict={
-                            t_real_caption : b_real_caption,    # remove if DCGAN only
-                            t_z : b_z})
-
-        print("Epoch: [%2d/%2d] [%4d/%4d] time: %4.4fs, d_loss: %.8f, g_loss: %.8f" \
-                    % (epoch, n_epoch, step, n_batch_epoch, time.time() - step_time, errD, errG))
-
-        # if np.isnan(errD) or np.isnan(errG):
-        #     exit(" ** NaN error, stop training")
-
-    if (epoch + 1) % print_freq == 0:
-        print(" ** Epoch %d took %fs" % (epoch, time.time()-start_time))
-        img_gen, rnn_out = sess.run([net_g.outputs, net_rnn.outputs],
-        # img_gen = sess.run(net_g.outputs,
-                        feed_dict={
-                        t_real_caption : sample_sentence,    # remove if DCGAN only
-                        t_z : sample_seed})
-
-        # print(b_real_images[0])
-        print('rnn:', np.min(rnn_out[0]), np.max(rnn_out[0]))    # e.g. -1.4121389, 1.4108921
-        print('real:', b_real_images[0].shape, np.min(b_real_images[0]), np.max(b_real_images[0]))
-        print('wrong:', b_wrong_images[0].shape, np.min(b_wrong_images[0]), np.max(b_wrong_images[0]))
-        # print(img_gen[0])
-        print('generate:', img_gen[0].shape, np.min(img_gen[0]), np.max(img_gen[0]))
-        img_gen = threading_data(img_gen, prepro_img, mode='rescale')    # [-1, 1] --> [0, 1]
-        # tl.visualize.frame(img_gen[0], second=0, saveable=True, name='e_%d_%s' % (epoch, " ".join([vocab.id_to_word(id) for id in sample_sentence[0]])))
-        save_images(img_gen, [8, 8], '{}/train_{:02d}.png'.format('samples', epoch))
-        # for i, img in enumerate(img_gen):
-        #     tl.visualize.frame(img, second=0, saveable=True, name='epoch_%d_sample_%d_%s' % (epoch, i, [vocab.id_to_word(id) for id in sample_sentence[i]]))
-        # print(img_gen[:32])
-        # print(img_gen[32:])
-        # tl.visualize.images2d(images=img_gen, second=0.01, saveable=True, name='temp_generate', dtype=np.uint8)
-
-        # b_real_images = threading_data(b_real_images, prepro_img, mode='rescale')
-        # b_wrong_images = threading_data(b_wrong_images, prepro_img, mode='rescale')
-        # save_images(b_real_images, [8, 8], 'temp_real_image.png')
-        # save_images(b_wrong_images, [8, 8], 'temp_wrong_image.png')
-
-    if epoch % 50 == 0:
-        tl.files.save_npz(net_rnn.all_params, name=net_e_name, sess=sess)
-        tl.files.save_npz(net_g.all_params, name=net_g_name, sess=sess)
-        tl.files.save_npz(net_d.all_params, name=net_d_name, sess=sess)
-        net_e_name_e = os.path.join(save_dir, 'net_e_%d.npz' % epoch)
-        net_g_name_e = os.path.join(save_dir, 'net_g_%d.npz' % epoch)
-        net_d_name_e = os.path.join(save_dir, 'net_d_%d.npz' % epoch)
-        tl.files.save_npz(net_rnn.all_params, name=net_e_name_e, sess=sess)
-        tl.files.save_npz(net_g.all_params, name=net_g_name_e, sess=sess)
-        tl.files.save_npz(net_d.all_params, name=net_d_name_e, sess=sess)
-        print("[*] Saving checkpoints SUCCESS!")
-        # tl.visualize.images2d(images=img_gen, second=0.01, saveable=True, name='temp_generate_%d' % epoch)    # , dtype=np.uint8)
-
-#