# model.py

# coding=utf-8
# tensorflow model graph 


import tensorflow as tf
from utils import flatten,reconstruct,Dataset,exp_mask
import numpy as np
import random,sys

# Very large negative float constant.
# NOTE(review): not referenced in the visible part of this file (masking goes
# through exp_mask imported from utils) -- confirm it is still needed.
VERY_NEGATIVE_NUMBER = -1e30

def get_model(config):
	"""Construct and return the `Model` described by `config`.

	The model is built inside a name scope called `config.modelname`, pinned to
	device "/gpu:0", and receives "model_<modelname>" as its scope argument.
	"""
	model_scope = "model_%s"%config.modelname
	with tf.name_scope(config.modelname):
		with tf.device("/gpu:0"):
			return Model(config,model_scope)


from copy import deepcopy # for C[i].insert(Y[i])

# a flatten and reconstruct version of softmax
def softmax(logits,scope=None):
	"""Softmax for inputs of arbitrary rank.

	`logits` goes through `flatten(logits,1)`, then `tf.nn.softmax`, and the
	result is shaped back with `reconstruct(...,logits,1)` (both helpers come
	from utils; presumably they collapse / restore every axis but the last).
	"""
	# name_scope rather than variable_scope: nothing trainable is created here
	with tf.name_scope(scope or "softmax"):
		flat_probs = tf.nn.softmax(flatten(logits,1))
		return reconstruct(flat_probs,logits,1)

# softmax selection
# return target * softmax(logits)
# target: [ ..., J, d]
# logits: [ ..., J]

# so [N,M,dim] * [N,M] -> [N,dim], so [N,M] is the attention for each M

# return: [ ..., d] # so the target vector is attended with logits' softmax
# [N,M,JX,JQ,2d] * [N,M,JX,JQ] (each context to query's mapping) -> [N,M,JX,2d] # attended over the JQ dimension
def softsel(target,logits,hard=False,hardK=None,scope=None):
	"""Softmax-weighted sum of `target` along its second-to-last axis.

	target: [..., J, d]
	logits: [..., J]
	returns: [..., d], i.e. sum_j softmax(logits)[..., j] * target[..., j, :]

	`hard` and `hardK` are accepted but not used by this implementation.
	"""
	# variable_scope is kept for naming only; no variable is created in here
	with tf.variable_scope(scope or "softsel"):
		# [..., J] -> [..., J, 1] so the weights broadcast over d
		weights = tf.expand_dims(softmax(logits),-1)
		pooled_axis = len(target.get_shape().as_list())-2 # second last dim
		return tf.reduce_sum(weights*target,pooled_axis)


# x -> [Num,JX,W,embedding dim] # conv2d requires an input of 4d [batch, in_height, in_width, in_channels]
def conv1d(x,filter_size,height,keep_prob,is_train=None,wd=None,scope=None):
	"""Character CNN: dropout -> conv2d -> relu -> max over the width axis.

	Args:
		x: 4-D input [Num,JX,W,channels] (conv2d wants [batch,height,width,channels]).
		filter_size: number of output channels of the convolution.
		height: filter extent along the W axis (the filter is [1,height,channels,filter_size]).
		keep_prob: dropout keep probability used when `is_train` is true.
		is_train: scalar bool tensor selecting dropout; required by tf.cond
			even though the signature defaults it to None.
		wd: if not None, add_wd(wd) is called for the current scope.
		scope: variable scope name, "conv1d" by default.

	Returns:
		[Num,JX,filter_size]: the relu'd convolution max-pooled over axis 2.
	"""
	with tf.variable_scope(scope or "conv1d"):
		num_channels = x.get_shape()[-1] # embedding dim
		filter_var = tf.get_variable("filter",shape=[1,height,num_channels,filter_size],dtype="float")
		bias = tf.get_variable('bias',shape=[filter_size],dtype='float')
		strides = [1,1,1,1]

		# Build the dropout op inside the true branch: ops created outside the
		# tf.cond branch functions run regardless of the predicate, so the old
		# form computed dropout at inference time only to throw it away.
		outd = tf.cond(is_train,lambda:tf.nn.dropout(x,keep_prob=keep_prob),lambda:x)

		# conv + relu; VALID padding shrinks the W axis to W-height+1
		xc = tf.nn.relu(tf.nn.conv2d(outd,filter_var,strides,padding='VALID')+bias)
		# max pooling over the (shrunk) W axis
		out = tf.reduce_max(xc,2) # [Num,JX,filter_size]

		if wd is not None:
			add_wd(wd)

		return out

# fully-connected layer
# simple linear layer; no activation is applied unless add_tanh=True
# [N,M,JX,JQ,2d] => x[N*M*JX*JQ,2d] * W[2d,output_size] -> [N*M*JX*JQ,output_size], then reconstruct() restores the leading dims
def linear(x,output_size,scope,add_tanh=False,wd=None):
	"""Fully-connected layer over the last axis of `x` (any rank).

	Args:
		x: input tensor; it is flattened with flatten(x,1) before the matmul
			(e.g. [N,M,JX,JQ,2d] => [N*M*JX*JQ,2d]) and the output is shaped
			back with reconstruct(...,x,1).
		output_size: size of the new last axis. Either an integer or an object
			exposing the size as `.value` (e.g. an entry of get_shape()).
		scope: variable scope holding the variables "W" and "b".
		add_tanh: apply tanh to the output when true; otherwise no activation.
		wd: if not None, add_wd(wd) is called for this scope.

	Returns:
		x with its last axis replaced by `output_size`.
	"""
	with tf.variable_scope(scope):
		# the input is not rank 2, so flatten it while keeping the last dim
		keep = 1
		flat_x = flatten(x,keep)
		bias_start = 0.0

		# Unwrap Dimension-like sizes; anything without `.value` is used as is.
		# (The previous exact-type check sent every non-`int` integer type into
		# `.value` and failed with AttributeError.)
		output_size = getattr(output_size,"value",output_size)
		input_size = flat_x.get_shape()[-1].value

		W = tf.get_variable("W",dtype="float",initializer=tf.truncated_normal([input_size,output_size],stddev=0.1))
		bias = tf.get_variable("b",dtype="float",initializer=tf.constant(bias_start,shape=[output_size]))
		flat_out = tf.matmul(flat_x,W)+bias

		if add_tanh:
			flat_out = tf.tanh(flat_out,name="tanh")

		if wd is not None:
			add_wd(wd)

		return reconstruct(flat_out,x,keep)


def batch_norm(x,is_train=True,epsilon=1e-5,decay=0.9,scope=None):
	"""Thin wrapper around tf.contrib.layers.batch_norm.

	Always passes scale=True and updates_collections=None; `is_train` is
	forwarded as `is_training` and the scope defaults to "batch_norm".
	"""
	return tf.contrib.layers.batch_norm(
		x,
		decay=decay,
		updates_collections=None,
		epsilon=epsilon,
		scale=True,
		is_training=is_train,
		scope=scope or "batch_norm")


# hinfo * att(hinfo,hq) -> attended_hinfo / [hinfo;attended_hinfo]
# hq -> [N,JQ,w]
# hinfo -> [N,M,J*,w] / [N,M,J*,JX,w]
# h_info_mask -> [N,M,J*]
# hinfo[N,M,J1,w]/[N,M,J1,J2,w] -> h_a[N,w]
# input: hinfo [N, ..., 2d], hq [N,JQ,2d] -> output: [N,2d]
# simiMatrix: what type of similarity matrix we use for the linear transform
# 	1: A, B , A*B
# 	2:(A-B)^2, A*B
# 	3: A, B, (A-B)^2, A*B
def attention(hinfo,hq,hinfo_mask=None,hq_mask=None,simiMatrix=1,wd=None,bidirect=False,scope=None):
	"""Attend `hinfo` with the question states `hq` and pool it to one vector per example.

	Args:
		hinfo: [N, ..., w] with static N and w; it is reshaped to [N,V,w].
		hq: [N,JQ,w] question states.
		hinfo_mask: optional mask shaped like hinfo without its last axis.
		hq_mask: optional mask [N,JQ].
		simiMatrix: which features feed the linear similarity layer
			1: A, B, A*B
			2: (A-B)^2, A*B
			3: A, B, (A-B)^2, A*B
		wd: if not None, add_wd(wd) is called for this scope.
		bidirect: also attend the question for every hinfo position, average
			that over V and concatenate it to the result -> [N,2w].
		scope: variable scope name, "attention_2vector" by default.

	Returns:
		(h_a, a_logits): h_a is [N,w] ([N,2w] if bidirect); a_logits is the
		(masked, if masks were applied) similarity logits [N,V,JQ].

	Raises:
		ValueError: if simiMatrix is not 1, 2 or 3.

	NOTE(review): the logits are masked only when BOTH hinfo_mask and hq_mask
	are given. Callers in this file that pass a single mask therefore get no
	masking at all -- confirm this is intended.
	"""
	# Validate before any graph op is created. This used to print a message and
	# call sys.exit(), which ended the whole process with exit status 0.
	if simiMatrix not in (1,2,3):
		raise ValueError("similarity matrix not implemented: simiMatrix=%r"%(simiMatrix,))

	with tf.variable_scope(scope or "attention_2vector"):
		N = hinfo.get_shape().as_list()[0]
		w = hinfo.get_shape().as_list()[-1]

		JQ = tf.shape(hq)[1]

		hinfo = tf.reshape(hinfo,[N,-1,w]) # M*J / M*J*JX
		if hinfo_mask is not None:
			hinfo_mask = tf.reshape(hinfo_mask,[N,-1])
		# vector length for hinfo
		V = tf.shape(hinfo)[1]
		# so hinfo -> [N,V,w], hinfo_mask -> [N,V]

		# tile both sides to a common [N,V,JQ,w]
		h_aug = tf.tile(tf.expand_dims(hinfo,2),[1,1,JQ,1])
		q_aug = tf.tile(tf.expand_dims(hq,1),[1,V,1,1])

		use_mask = (hinfo_mask is not None) and (hq_mask is not None)
		if use_mask:
			# tile the masks the same way
			h_mask_aug = tf.tile(tf.expand_dims(hinfo_mask,2),[1,1,JQ])
			q_mask_aug = tf.tile(tf.expand_dims(hq_mask,1),[1,V,1])
			mask = h_mask_aug & q_mask_aug # [N,V,JQ]

		# TODO: Change this similarity function
		if simiMatrix == 1:
			features = [h_aug,q_aug,h_aug*q_aug]
		else:
			diff = h_aug-q_aug
			if simiMatrix == 2:
				features = [diff*diff,h_aug*q_aug]
			else:
				features = [h_aug,q_aug,diff*diff,h_aug*q_aug]
		a_logits = linear(tf.concat(features,3),output_size=1,scope="att_logits")

		# [N,V,JQ,1] -> [N,V,JQ]
		a_logits = tf.squeeze(a_logits,3)

		if use_mask:
			a_logits = exp_mask(a_logits,mask)

		# each hinfo position is scored by its best-matching question word:
		# max over JQ -> [N,V], then softmax over V weights hinfo -> [N,w]
		h_a = softsel(hinfo,tf.reduce_max(a_logits,2),hard=False)

		# add a reversed-directional vector here
		if bidirect:
			# q_aug [N,V,JQ,w] * [N,V,JQ] -> [N,V,w]: question attended per position
			q_a = softsel(q_aug,a_logits,hard=False)

			# here we simply average them
			q_a = tf.reduce_mean(q_a,1) # [N,w]

			# concat two direction attended vector
			h_a = tf.concat([h_a,q_a],1) # [N,2w]

		if wd is not None:
			add_wd(wd)

		return h_a,a_logits


# https://github.com/YunseokJANG/tgif-qa/blob/master/code/gifqa/models/frameqa_models.py
"""
			vid_att, alpha = self.attention(rnn_final_state, vid_states)
            final_embed = tf.add(tf.nn.tanh(linear(vid_att, 2*self.hidden_dim)),
                                 rnn_final_state)
    # prev_hidden [N,d]
    # vid_states [N,T,d]
    def attention(self, prev_hidden, vid_states):
        packed = tf.pack(vid_states)
        packed = tf.transpose(packed, [1,0,2])
        vid_2d = tf.reshape(packed, [-1, self.hidden_dim*2])
        sent_2d = tf.tile(prev_hidden, [1, self.lstm_steps])
        sent_2d = tf.reshape(sent_2d, [-1, self.hidden_dim*2])
        preact = tf.add(linear(sent_2d, self.hidden_dim, name="preatt_sent"),
                        linear(vid_2d, self.hidden_dim, name="preadd_vid"))
        score = linear(tf.nn.tanh(preact), 1, name="preatt")
        score_2d = tf.reshape(score, [-1, self.lstm_steps])
        alpha = tf.nn.softmax(score_2d)
        alpha_3d = tf.reshape(alpha, [-1, self.lstm_steps, 1])
        return tf.reduce_sum(packed * alpha_3d, 1), alpha
"""
def attention_tgif(hinfo,lq,hinfo_mask=None,wd=None,mlp_dim=512,scope=None):
	"""Attention of a single question vector `lq` over the positions of `hinfo`.

	Args:
		hinfo: [N, ..., w] with static N and w; it is reshaped to [N,V,w].
		lq: [N,d] question vector. It is added to the 2*mlp_dim wide output,
			so d has to equal 2*mlp_dim.
		hinfo_mask: optional mask shaped like hinfo without its last axis.
		wd: if not None, add_wd(wd) is called for this scope.
		mlp_dim: width of the two projections that are summed before scoring.
		scope: variable scope name, "attention_2vector" by default.

	Returns:
		(logits, att): logits = tanh(linear(attended)) + lq with shape
		[N,2*mlp_dim]; att is the softmax attention over V, [N,V].

	The mask is applied to the scores BEFORE the softmax, the same way
	attention() masks its logits. It used to be applied to the softmax output,
	which left the returned weights un-normalised over the valid positions (and,
	assuming exp_mask adds a very negative value at masked slots, hugely
	negative there), and it crashed for the default hinfo_mask=None.

	NOTE(review): the reference implementation quoted above this function puts
	a tanh on the summed projections before the scoring layer. Here the sum is
	scored by a plain linear layer, so the question term shifts every
	position's score by the same amount -- confirm the tanh was dropped on purpose.
	"""
	with tf.variable_scope(scope or "attention_2vector"):
		N = hinfo.get_shape().as_list()[0]
		w = hinfo.get_shape().as_list()[-1]

		hinfo = tf.reshape(hinfo,[N,-1,w]) # M*J / M*J*JX
		if hinfo_mask is not None:
			hinfo_mask = tf.reshape(hinfo_mask,[N,-1])
		# vector length for hinfo
		V = tf.shape(hinfo)[1]
		# so hinfo -> [N,V,w], hinfo_mask -> [N,V]
		# lq -> [N,d]

		# project both sides to mlp_dim and sum them
		q_in = linear(lq,output_size=mlp_dim,scope="mlp_q")
		h_in = linear(hinfo,output_size=mlp_dim,scope="mlp_h") # [N,V,mlp_dim]
		q_in_tile = tf.tile(tf.expand_dims(q_in,1),[1,V,1]) # [N,V,mlp_dim]
		preatt = tf.add(q_in_tile,h_in) # [N,V,mlp_dim]

		score = linear(preatt,output_size=1,scope="preatt")
		score = tf.squeeze(score,2) # [N,V]

		# mask the scores, not the probabilities
		if hinfo_mask is not None:
			score = exp_mask(score,hinfo_mask)
		att = tf.nn.softmax(score) # [N,V]

		# attention-weighted sum over V -> [N,w]
		attended = tf.reduce_sum(hinfo*tf.expand_dims(att,2),1)
		logits = tf.add(tf.nn.tanh(linear(attended,output_size=2*mlp_dim,scope="final")),lq) # [N,2*mlp_dim]

		if wd is not None:
			add_wd(wd)

		return logits,att

# keep the M dim as well
def attention_keeprank1(hinfo,hq,hinfo_mask=None,hq_mask=None,simiMatrix=1,wd=None,bidirect=False,scope=None):
	"""Variant of attention() that keeps axis 1 (M) of `hinfo` in the output.

	Args:
		hinfo: [N,M, ..., w] with static N and w; it is reshaped to [N,M,V,w].
		hq: [N,JQ,w] question states.
		hinfo_mask: optional mask shaped like hinfo without its last axis.
		hq_mask: optional mask [N,JQ].
		simiMatrix: which features feed the linear similarity layer
			1: A, B, A*B
			2: (A-B)^2, A*B
			3: A, B, (A-B)^2, A*B
		wd: if not None, add_wd(wd) is called for this scope.
		bidirect: also attend the question for every hinfo position, average
			that over V and concatenate it to the result -> [N,M,2w].
		scope: variable scope name, "attention_2vector" by default.

	Returns:
		h_a: [N,M,w] ([N,M,2w] if bidirect). Unlike attention(), the logits
		are not returned.

	Raises:
		ValueError: if simiMatrix is not 1, 2 or 3.

	NOTE(review): the logits are masked only when BOTH hinfo_mask and hq_mask
	are given; a single mask is ignored -- confirm this is intended.
	"""
	# Validate before any graph op is created. This used to print a message and
	# call sys.exit(), which ended the whole process with exit status 0.
	if simiMatrix not in (1,2,3):
		raise ValueError("similarity matrix not implemented: simiMatrix=%r"%(simiMatrix,))

	with tf.variable_scope(scope or "attention_2vector"):
		N = hinfo.get_shape().as_list()[0]
		w = hinfo.get_shape().as_list()[-1]

		M = tf.shape(hinfo)[1]
		JQ = tf.shape(hq)[1]

		hinfo = tf.reshape(hinfo,[N,M,-1,w]) # J / J*JX collapsed into V
		if hinfo_mask is not None:
			hinfo_mask = tf.reshape(hinfo_mask,[N,M,-1])
		# vector length for hinfo
		V = tf.shape(hinfo)[2]
		# so hinfo -> [N,M,V,w], hinfo_mask -> [N,M,V]

		# tile both sides to a common [N,M,V,JQ,w]
		h_aug = tf.tile(tf.expand_dims(hinfo,3),[1,1,1,JQ,1])
		q_aug = tf.tile(tf.expand_dims(tf.expand_dims(hq,1),1),[1,M,V,1,1])

		use_mask = (hinfo_mask is not None) and (hq_mask is not None)
		if use_mask:
			# tile the masks the same way
			h_mask_aug = tf.tile(tf.expand_dims(hinfo_mask,3),[1,1,1,JQ])
			q_mask_aug = tf.tile(tf.expand_dims(tf.expand_dims(hq_mask,1),1),[1,M,V,1])
			mask = h_mask_aug & q_mask_aug # [N,M,V,JQ]

		# TODO: Change this similarity function
		if simiMatrix == 1:
			features = [h_aug,q_aug,h_aug*q_aug]
		else:
			diff = h_aug-q_aug
			if simiMatrix == 2:
				features = [diff*diff,h_aug*q_aug]
			else:
				features = [h_aug,q_aug,diff*diff,h_aug*q_aug]
		a_logits = linear(tf.concat(features,4),output_size=1,scope="att_logits")

		# [N,M,V,JQ,1] -> [N,M,V,JQ]
		a_logits = tf.squeeze(a_logits,4)

		if use_mask:
			a_logits = exp_mask(a_logits,mask)

		# each hinfo position is scored by its best-matching question word:
		# max over JQ -> [N,M,V], then softmax over V weights hinfo -> [N,M,w]
		h_a = softsel(hinfo,tf.reduce_max(a_logits,3),hard=False)

		# add a reversed-directional vector here
		if bidirect:
			# q_aug [N,M,V,JQ,w] * [N,M,V,JQ] -> [N,M,V,w]
			q_a = softsel(q_aug,a_logits,hard=False)

			# here we simply average them
			q_a = tf.reduce_mean(q_a,2) # [N,M,w]

			# concat two direction attended vector
			h_a = tf.concat([h_a,q_a],2) # [N,M,2w]

		if wd is not None:
			add_wd(wd)

		return h_a

# add current scope's variable's l2 loss to loss collection
def add_wd(wd,scope=None):
	"""Add an L2 weight-decay term for every trainable variable under `scope`.

	For each variable in the TRAINABLE_VARIABLES collection filtered by `scope`
	(the current variable scope when `scope` is not given),
	wd * tf.nn.l2_loss(var) is appended to the "losses" collection.

	Does nothing when `wd` is None or 0. None used to slip past the `!= 0.0`
	check and fail inside tf.multiply; callers guard with `if wd is not None`,
	and the function now tolerates it as well.
	"""
	if wd is None or wd == 0.0:
		return

	scope = scope or tf.get_variable_scope().name
	vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
	with tf.variable_scope("weight_decay"):
		for var in vars_:
			weight_decay = tf.multiply(tf.nn.l2_loss(var),wd,name="%s/wd"%(var.op.name))
			tf.add_to_collection("losses",weight_decay)

def get_initializer(matrix):
	"""Wrap `matrix` in an initializer-style callable.

	The returned function accepts (shape, dtype, partition_info, **kwargs),
	ignores all of them and always returns `matrix` itself.
	"""
	def _constant_init(shape, dtype=None, partition_info=None, **kwargs):
		# requested shape/dtype are ignored; the caller-supplied matrix wins
		return matrix

	return _constant_init

class Model():
	def __init__(self,config,scope):
		"""Create all input placeholders, then build the forward graph and the loss.

		config: hyper-parameter object. Read here: batch_size, word_vocab_size,
			char_vocab_size, max_word_size, char_emb_size, word_emb_size,
			char_out_size and image_feat_dim.
		scope: stored as self.scope.

		Axis names used in the shape comments:
			N batch size, W max characters per word,
			M max albums, JI max photos per album,
			JXA / JXP album-title / photo-title max words,
			JD / JT / JG album description / when / where max words,
			JQ max question length, JA max answer (choice) length.
		"""
		self.scope = scope
		self.config = config
		# non-trainable scalar counter that keeps track of the training step
		self.global_step = tf.get_variable('global_step',shape=[],dtype='int32',initializer=tf.constant_initializer(0),trainable=False)

		# static sizes from the config
		self.N = config.batch_size
		self.VW = config.word_vocab_size
		self.VC = config.char_vocab_size
		self.W = config.max_word_size
		N,W = self.N,self.W

		# embedding dims: char, word, char-CNN output
		self.cd = config.char_emb_size
		self.wd = config.word_emb_size
		self.cwd = config.char_out_size

		# image feature dim
		self.idim = config.image_feat_dim

		self.num_choice = 4

		def unset():
			# scalar -1 stand-in for tensors that may be exposed for visualization
			return tf.constant(-1)

		self.C = unset() # the time correlation matrix
		self.C_win = unset()
		self.att_logits = unset() # the 3d attention logits
		self.q_att_logits = unset() # the question attention logits if there is
		self.JXP = unset()
		self.warp_h = unset()

		self.hat_len = unset()
		self.had_len = unset()
		self.hwhen_len = unset()
		self.hwhere_len = unset()
		self.hpis_len = unset()
		self.hpts_len = unset()
		self.hall = unset()

		def ids(shape,name):
			# int32 index input (word ids, char ids, photo ids)
			return tf.placeholder('int32',shape,name=name)

		def flags(shape,name):
			# bool input (masks, labels, switches)
			return tf.placeholder('bool',shape,name=name)

		album_words = [N,None,None] # [N,M,J*]
		album_chars = [N,None,None,W] # [N,M,J*,W]

		# album title [N,M,JXA]
		self.at = ids(album_words,"at")
		self.at_c = ids(album_chars,"at_c")
		self.at_mask = flags(album_words,"at_mask") # to get the sequence length

		# album description [N,M,JD]
		self.ad = ids(album_words,"ad")
		self.ad_c = ids(album_chars,"ad_c")
		self.ad_mask = flags(album_words,"ad_mask")

		# album when / where [N,M,JT] / [N,M,JG]
		self.when = ids(album_words,"when")
		self.when_c = ids(album_chars,"when_c")
		self.when_mask = flags(album_words,"when_mask")
		self.where = ids(album_words,"where")
		self.where_c = ids(album_chars,"where_c")
		self.where_mask = flags(album_words,"where_mask")

		# photo titles [N,M,JI,JXP]
		photo_words = [N,None,None,None]
		self.pts = ids(photo_words,"pts")
		self.pts_c = ids([N,None,None,None,W],"pts_c")
		self.pts_mask = flags(photo_words,"pts_mask")

		# for vis: replaces the -1 stand-in assigned above
		self.JXP = tf.shape(self.pts)[3]

		# photos [N,M,JI], each entry is a photo index
		self.pis = ids(album_words,"pis")
		self.pis_mask = flags(album_words,"pis_mask")

		# question [N,JQ]
		self.q = ids([N,None],"q")
		self.q_c = ids([N, None, W],'q_c')
		self.q_mask = flags([N,None],"q_mask")

		# answer + choice words [N,4,JA]
		choice_words = [N,self.num_choice,None]
		self.choices = ids(choice_words,"choices")
		self.choices_c = ids([N,self.num_choice,None,W],"choices_c")
		self.choices_mask = flags(choice_words,"choices_mask")

		# 4 choice classification target
		self.y = flags([N, self.num_choice],'y')

		# pretrained word vectors, fed in for all batches
		self.existing_emb_mat = tf.placeholder('float',[None,config.word_emb_size],name="pre_emb_mat")

		# image features for this batch [photoNumForThisBatch,image_dim]
		self.image_emb_mat = tf.placeholder("float",[None,config.image_feat_dim],name="image_emb_mat")

		# drop out switch
		self.is_train = flags([],'is_train')

		# outputs, to be filled in by build_forward() and build_loss()
		self.logits = None
		self.yp = None # prob
		self.loss = None

		self.build_forward()
		self.build_loss()

		# merged summaries; not used now
		self.summary = tf.summary.merge_all()


	def build_forward(self):
		config = self.config
		VW = self.VW
		VC = self.VC
		W = self.W
		N = self.N

		# dynamic decide some step, for sequence length
		M = tf.shape(self.pis)[1] # photo num
		JXA = tf.shape(self.at)[2] # for album title, photo title
		JD = tf.shape(self.ad)[2] # description length
		JT = tf.shape(self.when)[2]
		JG = tf.shape(self.where)[2]

		JI = tf.shape(self.pis)[2] # used for photo_title, photo
		JXP = tf.shape(self.pts)[3]

		JQ = tf.shape(self.q)[1]
		JA = tf.shape(self.choices)[2]

		# embeding size
		cdim,wdim,cwdim = self.cd,self.wd,self.cwd #cwd: char -> word output dimension
		# image feature dim
		idim = self.idim # image_feat dimension

		# all input:
		#	at, ad, when, where, 
		#	pts, pis
		#	q, choices

		# embedding
		with tf.variable_scope('emb'):
			# char stuff
			if config.use_char:
			#with tf.variable_scope("char"):
				# [char_vocab_size,char_emb_dim]
				with tf.variable_scope("var"): #, tf.device("/cpu:0"): # in cpu for faster in multi-gpu training
					char_emb = tf.get_variable("char_emb",shape=[VC,cdim],dtype="float")

				# the embedding for each of character 
				# [N,M,JXA,W]
				Aat_c = tf.nn.embedding_lookup(char_emb,self.at_c)
				# [N,M,JD,W]
				Aad_c = tf.nn.embedding_lookup(char_emb,self.ad_c)
				# [N,M,JT,W]
				Awhen_c = tf.nn.embedding_lookup(char_emb,self.when_c)
				# [N,M,JG,W]
				Awhere_c = tf.nn.embedding_lookup(char_emb,self.where_c)
				# [N,M,JI,JXP,W] -> [N,M,JI,JXP,W,cdim]
				Apts_c = tf.nn.embedding_lookup(char_emb,self.pts_c)

				# [N,JQ,W]
				Aq_c = tf.nn.embedding_lookup(char_emb,self.q_c)
				Achoices_c = tf.nn.embedding_lookup(char_emb,self.choices_c)

				# flatten for conv2d input like images
				Aat_c = tf.reshape(Aat_c,[-1,JXA,W,cdim])
				Aad_c = tf.reshape(Aad_c,[-1,JD,W,cdim])
				Awhen_c = tf.reshape(Awhen_c,[-1,JT,W,cdim])
				Awhere_c = tf.reshape(Awhere_c,[-1,JG,W,cdim])
				# [N*M*JI,JXP,W,cdim]
				Apts_c = tf.reshape(Apts_c,[-1,JXP,W,cdim])

				Aq_c = tf.reshape(Aq_c,[-1,JQ,W,cdim])
				# [N*4,]
				Achoices_c = tf.reshape(Achoices_c,[-1,JA,W,cdim])

				#char CNN
				filter_size = cwdim # output size for each word
				filter_height = 5
				with tf.variable_scope("conv"):
					xat = conv1d(Aat_c,filter_size,filter_height,config.keep_prob,self.is_train,wd=config.wd,scope="conv1d")
					tf.get_variable_scope().reuse_variables()
					xad = conv1d(Aad_c,filter_size,filter_height,config.keep_prob,self.is_train,wd=config.wd,scope="conv1d")
					xwhen = conv1d(Awhen_c,filter_size,filter_height,config.keep_prob,self.is_train,wd=config.wd,scope="conv1d")
					xwhere = conv1d(Awhere_c,filter_size,filter_height,config.keep_prob,self.is_train,wd=config.wd,scope="conv1d")
					xpts = conv1d(Apts_c,filter_size,filter_height,config.keep_prob,self.is_train,wd=config.wd,scope="conv1d")
					qq = conv1d(Aq_c,filter_size,filter_height,config.keep_prob,self.is_train,wd=config.wd,scope="conv1d")
					qchoices = conv1d(Achoices_c,filter_size,filter_height,config.keep_prob,self.is_train,wd=config.wd,scope="conv1d")

					# reshape them back
					xat = tf.reshape(xat,[-1,M,JXA,cwdim])
					xad = tf.reshape(xad,[-1,M,JD,cwdim])
					xwhen = tf.reshape(xwhen,[-1,M,JT,cwdim])
					xwhere = tf.reshape(xwhere,[-1,M,JG,cwdim])
					xpts = tf.reshape(xpts,[-1,M,JI,JXP,cwdim])

					qq = tf.reshape(qq,[-1,JQ,cwdim])
					# [N,num_choice,JA,cwdim]
					qchoices = tf.reshape(qchoices,[-1,self.num_choice,JA,cwdim])

			# word stuff
			with tf.variable_scope('word'):
				with tf.variable_scope("var"):
					# get the word embedding for new words
					if config.is_train:
						# for new word
						word_emb_mat = tf.get_variable("word_emb_mat",dtype="float",shape=[VW,wdim],initializer=get_initializer(config.emb_mat)) # it's just random initialized
					else: # save time for loading the emb during test
						word_emb_mat = tf.get_variable("word_emb_mat",dtype="float",shape=[VW,wdim])
					# concat with pretrain vector
					# so 0 - VW-1 index for new words, the rest for pretrain vector
					# and the pretrain vector is fixed
					word_emb_mat = tf.concat([word_emb_mat,self.existing_emb_mat],0)

				#[N,M,JXA] -> [N,M,JXA,wdim]
				Aat = tf.nn.embedding_lookup(word_emb_mat,self.at)
				Aad = tf.nn.embedding_lookup(word_emb_mat,self.ad)
				Awhen = tf.nn.embedding_lookup(word_emb_mat,self.when)
				Awhere = tf.nn.embedding_lookup(word_emb_mat,self.where)
				Apts = tf.nn.embedding_lookup(word_emb_mat,self.pts)

				Aq = tf.nn.embedding_lookup(word_emb_mat,self.q)
				Achoices = tf.nn.embedding_lookup(word_emb_mat,self.choices)

			# concat char and word
			if config.use_char:

				xat = tf.concat([xat,Aat],3)
				xad = tf.concat([xad,Aad],3)
				xwhen = tf.concat([xwhen,Awhen],3)
				xwhere = tf.concat([xwhere,Awhere],3)
				# [N,M,JI,JX,wdim+cwdim]
				xpts = tf.concat([xpts,Apts],4)

				# [N,JQ,wdim+cwdim]
				qq = tf.concat([qq,Aq],2)
				qchoices = tf.concat([qchoices,Achoices],3)

			else:
				xat = Aat
				xad = Aad
				xwhen = Awhen
				xwhere = Awhere
				xpts = Apts

				qq = Aq
				qchoices = Achoices
				# all the above last dim is the same [wdim+cwdim] or just [wdim]

			# get the image feature
			with tf.variable_scope("image"):

				# [N,M,JI] -> [N,M,JI,idim]
				xpis = tf.nn.embedding_lookup(self.image_emb_mat,self.pis)

				# use image trans, then linearly transform it to lower dim
				# TODO: CNN transform?
				if config.use_image_trans:
					
					with tf.variable_scope("image_transform"):
						#[N,M,JI,idim] -> [N,M,JI,newdim]
						xpis = linear(xpis,add_tanh=config.add_tanh,output_size=config.image_trans_dim,wd=config.wd,scope="image_trans_linear")
						#xpis = tf.nn.relu(xpis)


		d = config.hidden_size

		# LSTM / GRU?
		cell_text = tf.nn.rnn_cell.BasicLSTMCell(d,state_is_tuple=True)
		cell_img = tf.nn.rnn_cell.BasicLSTMCell(d,state_is_tuple=True)
		#cell = tf.nn.rnn_cell.GRUCell(d)
		# add dropout
		keep_prob = tf.cond(self.is_train,lambda:tf.constant(config.keep_prob),lambda:tf.constant(1.0))

		cell_text = tf.nn.rnn_cell.DropoutWrapper(cell_text,keep_prob)

		cell_img = tf.nn.rnn_cell.DropoutWrapper(cell_img,keep_prob)

		
		# it is important to think about which LSTM shared with which?

		# sequence length for each
		at_len = tf.reduce_sum(tf.cast(self.at_mask,"int32"),2) # [N,M] # each album's title length
		ad_len = tf.reduce_sum(tf.cast(self.ad_mask,"int32"),2)
		when_len = tf.reduce_sum(tf.cast(self.when_mask,"int32"),2)
		where_len = tf.reduce_sum(tf.cast(self.where_mask,"int32"),2) # [N,M]

		pis_len = tf.reduce_sum(tf.cast(self.pis_mask,"int32"),2) #[N,M,JI] #[N,M]

		pts_len = tf.reduce_sum(tf.cast(self.pts_mask,"int32"),3) # [N,M,JI,JXP] -> [N,M,JI]

		q_len = tf.reduce_sum(tf.cast(self.q_mask,"int32"),1) # [N] # each question 's length

		choices_len = tf.reduce_sum(tf.cast(self.choices_mask,"int32"),2) # [N,4]

		# xat -> [N,M,JXA,wdim+cwdim]
		# xad -> [N,M,JD,wdim+cwdim]
		# xwhen/xwhere -> [N,M,JT/JG,wdim+cwdim]
		# xpts -> [N,M,JI,JXP,wdim+cwdim]

		# xpis -> [N,M,JI,idim]

		# qq -> [N,JQ,wdim+cwdim]
		# qchoices -> [N,4,JA,wdim+cwdim]

		# roll the sentence into lstm for context and question
		# from [N,M,JI,JX] -> [N,M,2d]
		with tf.variable_scope("reader"):
			with tf.variable_scope("text"):
				(fw_hq,bw_hq),(fw_lq,bw_lq) = tf.nn.bidirectional_dynamic_rnn(cell_text,cell_text,qq,sequence_length=q_len,dtype="float",scope="utext")
				# concat the fw and backward lstm output
				hq = tf.concat([fw_hq,bw_hq],2)
				lq = tf.concat([fw_lq.h,bw_lq.h],1) #LSTM CELL
				#lq = tf.concat([fw_lq,bw_lq],1) # GRU

				tf.get_variable_scope().reuse_variables()

				# flat all
				# choices
				flat_qchoices = flatten(qchoices,2) # [N,4,JA,dim] -> [N*4,JA,dim]
				# album title
				flat_xat = flatten(xat,2) #[N,M,JXA,dim] -> [N*M,JXA,dim]
				flat_xad = flatten(xad,2)
				flat_xwhen = flatten(xwhen,2)
				flat_xwhere = flatten(xwhere,2)
				
				#print "flat_xpis shape:%s"%(flat_xpis.get_shape())

				# photo tiles
				flat_xpts = flatten(xpts,2) # [N,M,JI,JXP,dim] -> [N*M*JI,JXP,dim]
				#print "flat_xpts shape:%s"%(flat_xpts.get_shape())

				# get the sequence length, all one dim
				flat_qchoices_len = flatten(choices_len,0) # [N*4]
				flat_xat_len = flatten(at_len,0) # [N*M]
				flat_xad_len = flatten(ad_len,0) # [N*M]
				flat_xwhen_len = flatten(when_len,0) # [N*M]
				flat_xwhere_len = flatten(where_len,0) # [N*M]
				
				flat_xpts_len = flatten(pts_len,0) # [N*M*JI]

				
				# put all through LSTM
				# uncomment to use ALL LSTM output or LAST LSTM output

				# album title
				# [N*M,JXA,d]
				(fw_hat_flat,bw_hat_flat),(fw_lat_flat,bw_lat_flat) = tf.nn.bidirectional_dynamic_rnn(cell_text,cell_text,flat_xat,sequence_length=flat_xat_len,dtype="float",scope="utext")
				fw_hat = reconstruct(fw_hat_flat,xat,2) # 
				bw_hat = reconstruct(bw_hat_flat,xat,2)
				hat = tf.concat([fw_hat,bw_hat],3) # [N,M,JXA,2d]
				# lstm
				fw_lat = tf.reshape(fw_lat_flat.h,[N,M,d]) # [N*M,d] -> [N,M,d]
				bw_lat = tf.reshape(bw_lat_flat.h,[N,M,d])
				# GRU
				#fw_lat = tf.reshape(fw_lat_flat,[N,M,d]) # [N*M,d] -> [N,M,d]
				#bw_lat = tf.reshape(bw_lat_flat,[N,M,d])
				lat = tf.concat([fw_lat,bw_lat],2) # [N,M,2d]


				# album desciption
				# [N*M,JD,d]
				(fw_had_flat,bw_had_flat),(fw_lad_flat,bw_lad_flat) = tf.nn.bidirectional_dynamic_rnn(cell_text,cell_text,flat_xad,sequence_length=flat_xad_len,dtype="float",scope="utext")
				fw_had = reconstruct(fw_had_flat,xad,2) # 
				bw_had = reconstruct(bw_had_flat,xad,2)
				had = tf.concat([fw_had,bw_had],3) # [N,M,JD,2d]
				# LSTM
				fw_lad = tf.reshape(fw_lad_flat.h,[N,M,d]) # [N*M,d] -> [N,M,d]
				bw_lad = tf.reshape(bw_lad_flat.h,[N,M,d])
				# GRU
				#fw_lad = tf.reshape(fw_lad_flat,[N,M,d]) # [N*M,d] -> [N,M,d]
				#bw_lad = tf.reshape(bw_lad_flat,[N,M,d])
				lad = tf.concat([fw_lad,bw_lad],2) # [N,M,2d]

				# when
				(fw_hwhen_flat,bw_hwhen_flat),(fw_lwhen_flat,bw_lwhen_flat) = tf.nn.bidirectional_dynamic_rnn(cell_text,cell_text,flat_xwhen,sequence_length=flat_xwhen_len,dtype="float",scope="utext")
				fw_hwhen = reconstruct(fw_hwhen_flat,xwhen,2) # 
				bw_hwhen = reconstruct(bw_hwhen_flat,xwhen,2)
				hwhen = tf.concat([fw_hwhen,bw_hwhen],3) # [N,M,JT,2d]
				# LSTM
				fw_lwhen = tf.reshape(fw_lwhen_flat.h,[N,M,d]) # [N*M,d] -> [N,M,d]
				bw_lwhen = tf.reshape(bw_lwhen_flat.h,[N,M,d])
				# GRU
				#fw_lwhen = tf.reshape(fw_lwhen_flat,[N,M,d]) # [N*M,d] -> [N,M,d]
				#bw_lwhen = tf.reshape(bw_lwhen_flat,[N,M,d])
				lwhen = tf.concat([fw_lwhen,bw_lwhen],2) # [N,M,2d]


				# where
				(fw_hwhere_flat,bw_hwhere_flat),(fw_lwhere_flat,bw_lwhere_flat) = tf.nn.bidirectional_dynamic_rnn(cell_text,cell_text,flat_xwhere,sequence_length=flat_xwhere_len,dtype="float",scope="utext")
				fw_hwhere = reconstruct(fw_hwhere_flat,xwhere,2) # 
				bw_hwhere = reconstruct(bw_hwhere_flat,xwhere,2)
				hwhere = tf.concat([fw_hwhere,bw_hwhere],3) # [N,M,JG,2d]
				# LSTM
				fw_lwhere = tf.reshape(fw_lwhere_flat.h,[N,M,d]) # [N*M,d] -> [N,M,d]
				bw_lwhere = tf.reshape(bw_lwhere_flat.h,[N,M,d])
				# GRU
				#fw_lwhere = tf.reshape(fw_lwhere_flat,[N,M,d]) # [N*M,d] -> [N,M,d]
				#bw_lwhere = tf.reshape(bw_lwhere_flat,[N,M,d])
				lwhere = tf.concat([fw_lwhere,bw_lwhere],2) # [N,M,2d]


				# photo title
				# [N*M*JI,JXP,d]
				(fw_hpts_flat,bw_hpts_flat),(fw_lpts_flat,bw_lpts_flat) = tf.nn.bidirectional_dynamic_rnn(cell_text,cell_text,flat_xpts,sequence_length=flat_xpts_len,dtype="float",scope="utext")
				fw_hpts = reconstruct(fw_hpts_flat,xpts,2) # 
				bw_hpts = reconstruct(bw_hpts_flat,xpts,2) # [N,M,JI,JXP,d]
				hpts = tf.concat([fw_hpts,bw_hpts],4) # [N,M,JI,JXP,2d]
				# LSTM
				fw_lpts = tf.reshape(fw_lpts_flat.h,[N,M,JI,d]) # [N*M*JI,d] -> [N,M,JI,d]
				bw_lpts = tf.reshape(bw_lpts_flat.h,[N,M,JI,d])
				# GRU
				#fw_lpts = tf.reshape(fw_lpts_flat,[N,M,JI,d]) # [N*M*JI,d] -> [N,M,JI,d]
				#bw_lpts = tf.reshape(bw_lpts_flat,[N,M,JI,d])
				lpts = tf.concat([fw_lpts,bw_lpts],3) # [N,M,JI,2d]	

				# choices
				(fw_hchoices_flat,bw_hchoices_flat),(fw_lchoices_flat,bw_lchoices_flat) = tf.nn.bidirectional_dynamic_rnn(cell_text,cell_text,flat_qchoices,sequence_length=flat_qchoices_len,dtype="float",scope="utext")
				fw_hchoices = reconstruct(fw_hchoices_flat,qchoices,2) # 
				bw_hchoices = reconstruct(bw_hchoices_flat,qchoices,2)
				hchoices = tf.concat([fw_hchoices,bw_hchoices],3) # [N,4,JA,2d]
				# LSTM
				fw_lchoices = tf.reshape(fw_lchoices_flat.h,[N,-1,d]) # [N*4,d] -> [N,4,d]
				bw_lchoices = tf.reshape(bw_lchoices_flat.h,[N,-1,d])
				# GRU
				#fw_lchoices = tf.reshape(fw_lchoices_flat,[N,-1,d]) # [N*4,d] -> [N,4,d]
				#bw_lchoices = tf.reshape(bw_lchoices_flat,[N,-1,d])
				lchoices = tf.concat([fw_lchoices,bw_lchoices],2) # [N,4,2d]


			with tf.variable_scope("image"):
				# photos
				flat_xpis = flatten(xpis,2) # [N,M,JI,idim] -> [N*M,JI,idim]
				flat_xpis_len = flatten(pis_len,0) # [N*M]


				# photo # use different LSTM
				# [N*M,JXP,d]
				(fw_hpis_flat,bw_hpis_flat),(fw_lpis_flat,bw_lpis_flat) = tf.nn.bidirectional_dynamic_rnn(cell_img,cell_img,flat_xpis,sequence_length=flat_xpis_len,dtype="float",scope="uimage")
				fw_hpis = reconstruct(fw_hpis_flat,xpis,2) # 
				bw_hpis = reconstruct(bw_hpis_flat,xpis,2) # [N,M,JI,JXP,d]
				hpis = tf.concat([fw_hpis,bw_hpis],3) # [N,M,JI,2d]
				# LSTM
				fw_lpis = tf.reshape(fw_lpis_flat.h,[N,M,d]) # [N*M,d] -> [N,M,d]
				bw_lpis = tf.reshape(bw_lpis_flat.h,[N,M,d])
				# GRU
				#fw_lpis = tf.reshape(fw_lpis_flat,[N,M,d]) # [N*M,d] -> [N,M,d]
				#bw_lpis = tf.reshape(bw_lpis_flat,[N,M,d])
				lpis = tf.concat([fw_lpis,bw_lpis],2) # [N,M,2d]

			if config.wd is not None: # l2 weight decay for the reader
				add_wd(config.wd)

		# all rnn output
		# hq -> [N,JQ,2d]

		# hat -> [N,M,JXA,2d]
		# had -> [N,M,JD,2d]
		# hwhen -> [N,M,JT,2d]
		# hwhere -> [N,M,JG,2d]
		# hpts -> [N,M,JI,JXP,2d]
		# hpis -> [N,M,JI,2d]

		# hchoices -> [N,4,JA,2d]

		# last states:
		# lq -> [N,2d]

		# lat -> [N,M,2d]
		# lad -> [N,M,2d]
		# lwhen -> [N,M,2d]
		# lwhere -> [N,M,2d]
		# lpts -> [N,M,JI,2d]
		# lpis -> [N,M,2d]

		# lchoices -> [N,4,2d]


		# now from [N,M,2d] -> [N,2d]
		# attention layer
		with tf.variable_scope("attention"): # for baseline here has no attention

			# multi layer attention [album level <- photo level]
			if config.use_ml_att:
				with tf.variable_scope("multi_layer_attention"):
					# hat -> [N,M,JXA,2d] # hq -> [N,JQ,2d]
					# g1at -> [N,2d]
					g1at,self.ml_at_att_logits = attention(hat,hq,self.at_mask,self.q_mask,simiMatrix=config.simiMatrix,wd=config.wd,bidirect=config.use_bidirection,scope="at")
					g1ad,self.ml_ad_att_logits = attention(had,hq,self.ad_mask,self.q_mask,simiMatrix=config.simiMatrix,wd=config.wd,bidirect=config.use_bidirection,scope='ad')
					# when and where no need for attend? # need, 0.01 diff
					g1when,self.ml_when_att_logits = attention(hwhen,hq,self.when_mask,self.q_mask,simiMatrix=config.simiMatrix,wd=config.wd,bidirect=config.use_bidirection,scope='when')
					g1where,self.ml_where_att_logits = attention(hwhere,hq,self.where_mask,self.q_mask,simiMatrix=config.simiMatrix,wd=config.wd,bidirect=config.use_bidirection,scope='where')
					"""
					g0when = lwhen
					g0where = lwhere
					g1when = tf.reduce_mean(g0when,1)
					g1where = tf.reduce_mean(g0where,1)
					"""

					g1pts,self.ml_pts_att_logits = attention(hpts,hq,self.pts_mask,wd=config.wd,scope='pts')
					g1pis,self.ml_pis_att_logits = attention(hpis,hq,self.pis_mask,wd=config.wd,scope="pis")
			elif config.use_tgif_ml_att:
				with tf.variable_scope("multi_layer_attention"):
					g1at,self.ml_at_att_logits = attention_tgif(hat,lq,self.at_mask,mlp_dim=d,wd=config.wd,scope="at")
					g1ad,self.ml_ad_att_logits = attention_tgif(had,lq,self.ad_mask,mlp_dim=d,wd=config.wd,scope='ad')
					# when and where no need for attend? # need, 0.01 diff
					g1when,self.ml_when_att_logits = attention_tgif(hwhen,lq,self.when_mask,mlp_dim=d,wd=config.wd,scope='when')
					g1where,self.ml_where_att_logits = attention_tgif(hwhere,lq,self.where_mask,mlp_dim=d,wd=config.wd,scope='where')
					"""
					g0when = lwhen
					g0where = lwhere
					g1when = tf.reduce_mean(g0when,1)
					g1where = tf.reduce_mean(g0where,1)
					"""

					g1pts,self.ml_pts_att_logits = attention_tgif(hpts,lq,self.pts_mask,mlp_dim=d,wd=config.wd,scope='pts')
					g1pis,self.ml_pis_att_logits = attention_tgif(hpis,lq,self.pis_mask,mlp_dim=d,wd=config.wd,scope="pis")
					self.att_logits = self.ml_pis_att_logits
			else:
				# use last lstm output (last hidden state)
				# outputs_fw[k,X_len[k]-1] == states_fw.h[k]
				# at_len -> [N,M]
				g0at = lat #[N,M,2d]
				g0ad = lad # [N,M,2d]
				g0when = lwhen
				g0where = lwhere
				g0pts = tf.reduce_mean(lpts,2) #[N,M,JI,2d] -> [N,M,2d]
				g0pis = lpis
				# album level attention
				g1at = tf.reduce_mean(g0at,1) # [N,2d]
				g1ad = tf.reduce_mean(g0ad,1)
				g1when = tf.reduce_mean(g0when,1)
				g1where = tf.reduce_mean(g0where,1)
				g1pts = tf.reduce_mean(g0pts,1)
				g1pis = tf.reduce_mean(g0pis,1)

			K = 6
			if config.concat:
				g1_a = tf.concat([g1at,g1ad,g1when,g1where,g1pts,g1pis],axis=1) # [N,2d*6]
			else:
				# stack them
				
				g1 = tf.stack([g1at,g1ad,g1when,g1where,g1pts,g1pis],axis=1) # [N,K,2d]

				# need to squash since hq is 2d
				if config.use_bidirection:
					g1 = linear(g1,output_size=2*d,scope="bidrection_squash")
				

				# here we use the multi-modal attention
				if config.use_mm_att:
					with tf.variable_scope("multi_modal_attention"):
						
						g1_a,self.mm_att_logits = attention(g1,hq,hq_mask=self.q_mask,simiMatrix=config.simiMatrix,wd=config.wd,bidirect=config.use_bidirection,scope="mm_att")
						if config.use_bidirection:
							g1_a = linear(g1_a,output_size=2*d,scope="bidrection_squash")
				else:
					# average them simply
					g1_a = tf.reduce_mean(g1,1) # [N,K,2d] -> #[N,2d]
					# or concat them, # [N,K,2d] -> [N,K*2d]
					#g1_a = tf.reshape(g1_a,[N,-1])

			
			# direct links
			if config.use_direct_links:
				with tf.variable_scope("direct_links"):
					# hat -> [N,M,JXA,2d]
					# had -> [N,M,JD,2d]
					# hwhen -> [N,M,JT,2d]
					# hwhere -> [N,M,JG,2d]
					# hpts -> [N,M,JI,JXP,2d]
					# hpis -> [N,M,JI,2d]
					hat_re = tf.reshape(hat,[N,-1,hat.get_shape().as_list()[-1]])
					had_re = tf.reshape(had,[N,-1,had.get_shape().as_list()[-1]])
					hwhen_re = tf.reshape(hwhen,[N,-1,hwhen.get_shape().as_list()[-1]])
					hwhere_re = tf.reshape(hwhere,[N,-1,hwhere.get_shape().as_list()[-1]])
					hpts_re = tf.reshape(hpts,[N,-1,hpts.get_shape().as_list()[-1]])
					hpis_re = tf.reshape(hpis,[N,-1,hpis.get_shape().as_list()[-1]])
					full = tf.concat([
						hat_re,
						had_re,
						hwhen_re,
						hwhere_re,
						hpts_re,
						hpis_re,
					],axis=1) # [N,-1,2d]

					# remember this to visual attention weights
					self.hat_len = tf.shape(hat_re)[1]
					self.had_len = tf.shape(had_re)[1]
					self.hwhen_len = tf.shape(hwhen_re)[1]
					self.hwhere_len = tf.shape(hwhere_re)[1]
					self.hpis_len = tf.shape(hpis_re)[1]
					self.hpts_len = tf.shape(hpts_re)[1]

					# hq [N,JQ,2d]
					full_a,att_logits = attention(full, hq, simiMatrix=config.simiMatrix, wd=config.wd, scope="full_att") # [N,2d]
					self.att_logits = att_logits

				if config.direct_links_only:
					g1_all = full_a
				else:
					g1_all = full_a + g1_a
			else:
				g1_all = g1_a


		with tf.variable_scope("choices_emb"):
			# embed the choices
			# hchoices -> [N,4,JA,2d]
			# gchoices -> [N,4,2d]
			# maybe attend something in the future
			#gchoices = tf.reduce_mean(hchoices,2)# [N,4,2d]
			# hq -> [N,JQ,2d]
			if config.use_choices_att:
				gchoices = attention_keeprank1(hchoices, hq, self.choices_mask,self.q_mask, simiMatrix=config.simiMatrix, wd=config.wd,bidirect=config.use_bidirection, scope="choices_att")
				if config.use_bidirection:
					gchoices = linear(gchoices,output_size=2*d,scope="bidrection_squash")
			else:
				gchoices = lchoices #[N,4,2d] # last LSTM state for each choice

		with tf.variable_scope("question_emb"):
			# hq -> [N,JQ,2d]
			# g1 -> [N,K,2d]
			# gp -> [N,2d]
			if config.use_question_att:
				gq,self.q_att_logits = attention(hq, g1, self.q_mask, simiMatrix=config.simiMatrix, wd=config.wd,bidirect=config.use_bidirection, scope="question_att")
				if config.use_bidirection:
					gq = linear(gq,output_size=2*d,scope="bidrection_squash")
			else:
				gq = lq # this is the last hidden state of each question # [N,2d]

		# the modeling layer
		with tf.variable_scope("output"):

			if config.concat:
				# g1_all is [N,2d*K]
				# so change other vector dimension
				gchoices = linear(gchoices,output_size=2*d*K,scope="gchoice_trans_concat")
				gq = linear(gq,output_size=2*d*K,scope="gq_trans_concat")
			
			# g1_a [N,2d] # this could be viewed as an answer representation
			# together with the choices_emb and question_emb, 
			# we do a single layer multi class classification

			# tile g1_a [N,2d] -> [N,1,2d] to concat with gchoices
			# [N,4,2d]
			g1_a_t = tf.tile(tf.expand_dims(g1_all,1),[1,self.num_choice,1])

			# tile gq for all choices
			gq = tf.tile(tf.expand_dims(gq,1),[1,self.num_choice,1]) # [N,4,2d]


			# [N,4,2d] -> [N*4,1]
			# TODO: consider different similarity matrix
			# just g1 and gchoices
			#logits = linear(tf.concat([g1_a_t,gchoices,g1_a_t*gchoices],2),output_size=1,wd=config.wd,scope="choicelogits")
			if config.use_eu_output:
				# eu similarity
				logits = linear(tf.concat([gq,g1_a_t,gchoices,g1_a_t*gchoices,gq*gchoices,(g1_a_t-gchoices)*(g1_a_t - gchoices),(gq-gchoices)*(gq-gchoices)],2),output_size=1,scope="choicelogits")
			else:
				logits = linear(tf.concat([gq,g1_a_t,gchoices,g1_a_t*gchoices,gq*gchoices],2),output_size=1,scope="choicelogits")
			

			logits = tf.squeeze(logits,2) # [N,4,1] -> [N,4]
			yp = tf.nn.softmax(logits) # [N,4]

			# for loss and forward
			self.logits = logits
			self.yp = yp

	def build_loss(self):
		"""Create the training objective and store it in self.loss.

		The softmax cross entropy between self.logits and self.y (cast to float)
		is averaged over the batch, added to the "losses" collection, and
		self.loss is the sum of every tensor found in that collection.
		"""
		# shapes as noted by the original author: logits -> [N,4], y -> [N,4]
		targets = tf.cast(self.y,"float")
		per_example = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,labels=targets) # [N]
		batch_loss = tf.reduce_mean(per_example) # scalar, avg loss of the whole batch

		tf.add_to_collection("losses",batch_loss)

		# other layers may have put extra terms (e.g. l2 weight loss) into the same collection
		self.loss = tf.add_n(tf.get_collection("losses"),name="total_losses")
		# a scalar summary of self.loss is not recorded here


	# given a batch of data, construct the feed dict
	def get_feed_dict(self,batch,is_train=False):
		"""Turn one mini-batch into the feed dict for the model's placeholders.

		Every text field is converted into zero-padded int32 word-id / char-id
		arrays plus a bool mask marking the real (non padded) word positions;
		photo indices get an id array and a mask. Index 0 is the padding value
		(the -NULL- token) and 1 is used for unknown words / chars (-UNK-).

		Args:
			batch: a Dataset; batch.data holds the per-sample fields and
				batch.shared the vocabularies and embedding matrices.
			is_train: when True the correct answer (batch.data['y']) is inserted
				into each sample's choices at a random position and the one-hot
				label self.y is fed. When False the answer is inserted at
				batch.data['yidx'] if 'y', 'cy' and 'yidx' are all present, and
				self.y is not fed.

		Returns:
			dict mapping the model's placeholders to numpy arrays / values.

		Raises:
			AssertionError: if batch is not a Dataset, a sample does not end up
				with exactly self.num_choice choices, or a photo index is not an int.

		Differences from the previous implementation:
			* questions are truncated to the question axis instead of raising
			  IndexError when longer than it;
			* truncation is done against the actual array sizes, which may have
			  been shrunk to the batch maximum, rather than the config maxima;
			* shrinking no longer fails on a batch whose field is entirely empty;
			* no Python 2 only constructs (print statement, xrange, has_key).
		"""
		assert isinstance(batch,Dataset)
		config = self.config
		data = batch.data
		shared = batch.shared

		# caps for each axis
		N = config.batch_size
		if config.showspecs:
			N = 2
		M = config.max_num_albums
		JXA = config.max_sent_album_title_size
		JXP = config.max_sent_photo_title_size
		JD = config.max_sent_des_size
		JQ = config.max_question_size
		JI = config.max_num_photos
		JT = config.max_when_size
		JG = config.max_where_size
		JA = config.max_answer_size
		W = config.max_word_size

		def shrink(cap,lengths):
			# longest length seen in this batch, at least 1 so no axis is empty,
			# and never above the configured cap
			longest = max([0] + list(lengths))
			return min(cap,max(longest,1))

		# This could make training faster: each minibatch is padded only to its own max length.
		# NOTE(review): gated on config.is_train, not on the is_train argument -- confirm intended.
		if config.is_train:
			JXA = shrink(JXA,(len(title) for sample in data['album_title'] for title in sample))
			JXP = shrink(JXP,(len(title) for sample in data['photo_titles'] for album in sample for title in album))
			JD = shrink(JD,(len(des) for sample in data['album_description'] for des in sample))
			JG = shrink(JG,(len(text) for sample in data['where'] for text in sample))
			JT = shrink(JT,(len(text) for sample in data['when'] for text in sample))
			JI = shrink(JI,(len(album) for sample in data['photo_ids'] for album in sample))
			JQ = shrink(JQ,(len(ques) for ques in data['q']))
			M = shrink(M,(len(onesample) for onesample in data['album_title']))

		def text_buffers(*dims):
			# (word ids, char ids, word mask) for one text field, all zeros
			shape = list(dims)
			return (
				np.zeros(shape,dtype='int32'),
				np.zeros(shape+[W],dtype='int32'),
				np.zeros(shape,dtype='bool'),
			)

		at,at_c,at_mask = text_buffers(N,M,JXA)
		ad,ad_c,ad_mask = text_buffers(N,M,JD)
		when,when_c,when_mask = text_buffers(N,M,JT)
		where,where_c,where_mask = text_buffers(N,M,JG)
		pts,pts_c,pts_mask = text_buffers(N,M,JI,JXP)
		q,q_c,q_mask = text_buffers(N,JQ)
		choices,choices_c,choices_mask = text_buffers(N,self.num_choice,JA)

		pis = np.zeros([N,M,JI],dtype='int32')
		pis_mask = np.zeros([N,M,JI],dtype='bool')

		# the arrays are filled in place below, so they can be linked right away
		feed_dict = {
			self.at: at, self.at_c: at_c, self.at_mask: at_mask,
			self.ad: ad, self.ad_c: ad_c, self.ad_mask: ad_mask,
			self.when: when, self.when_c: when_c, self.when_mask: when_mask,
			self.where: where, self.where_c: where_c, self.where_mask: where_mask,
			self.pts: pts, self.pts_c: pts_c, self.pts_mask: pts_mask,
			self.pis: pis, self.pis_mask: pis_mask,
			self.q: q, self.q_c: q_c, self.q_mask: q_mask,
			self.choices: choices, self.choices_c: choices_c, self.choices_mask: choices_mask,
			self.is_train: is_train,
			# image feat mat and word mat
			self.image_emb_mat: data['pidx2feat'],
			self.existing_emb_mat: shared['existing_emb_mat'],
		}

		# the answer gets inserted into the choices, so work on copies
		# to keep the batch data untouched for later use
		C = deepcopy(data['cs'])
		C_c = deepcopy(data['ccs'])

		if is_train:
			# only training feeds the y
			Y = data['y']
			Y_c = data['cy']

			y = np.zeros([N,self.num_choice],dtype="bool")
			feed_dict[self.y] = y

			# randomly decide the position of the correct choice, array of size [N]
			correctIndex = np.random.choice(self.num_choice,N)

			# some batch will be smaller than N, so go by the actual sample count
			for i,answer in enumerate(Y):
				y[i,correctIndex[i]] = True
				assert len(C[i]) == (self.num_choice - 1)
				C[i].insert(correctIndex[i],answer)
				C_c[i].insert(correctIndex[i],Y_c[i])
				# the original choices must not have been modified
				assert len(data['cs'][i]) == (self.num_choice - 1)

			# for debug
			if config.showspecs:
				print("first two batch's answer:%s , char:%s, correctIdx:%s"%(Y[:2],Y_c[:2],y[:2]))
				print("first two batch's choices:%s , char:%s"%(C[:2],C_c[:2]))

		elif "y" in data and "cy" in data and 'yidx' in data:
			# for testing, put the answer into the original idx if there is any
			Y = data['y']
			Y_c = data['cy']
			Y_idx = data['yidx']
			for i,answer in enumerate(Y):
				assert len(C[i]) == (self.num_choice - 1), ("C[i] len:%s,%s,Y:%s"%(len(C[i]),C[i],answer))
				C[i].insert(Y_idx[i],answer)
				C_c[i].insert(Y_idx[i],Y_c[i])

		# every sample must now carry exactly num_choice choices
		for ci in C:
			assert len(ci) == self.num_choice
		for ci in C_c:
			assert len(ci) == self.num_choice, (len(ci))

		def get_word(word):
			variants = (word, word.lower(), word.capitalize(), word.upper())
			trainable = shared['word2idx'] # this is for the word not in glove
			for each in variants:
				if each in trainable:
					return trainable[each]
			# the word in glove
			pretrained = shared['existing_word2idx']
			for each in variants:
				if each in pretrained:
					return pretrained[each] + len(trainable) # all idx + len(the word to train)
			return 1 # 1 is the -UNK-

		def get_char(char):
			return shared['char2idx'].get(char,1)

		def photo_index(value):
			assert isinstance(value,int)
			return value

		def fill(nested,out,lookup,mask=None):
			# Copy lookup(leaf) for the leaves of `nested` into `out`, dropping
			# whatever does not fit along each axis. out[pos] / mask[pos] are
			# numpy views, so writes land in the full arrays.
			limit = out.shape[0]
			is_leaf_axis = out.ndim == 1
			for pos,item in enumerate(nested):
				if pos == limit:
					break
				if is_leaf_axis:
					out[pos] = lookup(item)
					if mask is not None:
						mask[pos] = True
				else:
					fill(item,out[pos],lookup,None if mask is None else mask[pos])

		def fill_batch(samples,out,lookup,mask=None):
			# The batch axis is not truncated on purpose: more samples than N
			# is an error and should surface as an IndexError.
			for i,sample in enumerate(samples):
				fill(sample,out[i],lookup,None if mask is None else mask[i])

		# photo indices
		fill_batch(data['photo_idxs'],pis,photo_index,pis_mask)

		# album level text
		fill_batch(data['album_title'],at,get_word,at_mask)
		fill_batch(data['album_title_c'],at_c,get_char)
		fill_batch(data['album_description'],ad,get_word,ad_mask)
		fill_batch(data['album_description_c'],ad_c,get_char)
		fill_batch(data['when'],when,get_word,when_mask)
		fill_batch(data['when_c'],when_c,get_char)
		fill_batch(data['where'],where,get_word,where_mask)
		fill_batch(data['where_c'],where_c,get_char)

		# photo titles
		fill_batch(data['photo_titles'],pts,get_word,pts_mask)
		fill_batch(data['photo_titles_c'],pts_c,get_char)

		# answer choices
		fill_batch(C,choices,get_word,choices_mask)
		fill_batch(C_c,choices_c,get_char)

		# question (now truncated to the question axis like every other field)
		fill_batch(data['q'],q,get_word,q_mask)
		fill_batch(data['cq'],q_c,get_char)

		return feed_dict