-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathdata_loader.py
131 lines (100 loc) · 5.06 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import numpy as np
import pandas as pd
from PIL import Image
from glob import glob
import tensorflow as tf
from IPython.core import debugger
debug = debugger.Pdb().set_trace
def logodds(p):
return np.log(p/(1.-p))
class DataLoader(object):
'''This loads the image and the labels through a tensorflow queue.
All of the labels are loaded regardless of what is specified in graph,
because this model is gpu throttled anyway so there shouldn't be any
overhead
For multiple gpu, the strategy here is to have 1 queue with 2xbatch_size
then use tf.split within trainer.train()
'''
def __init__(self,label_names,config):
self.label_names=label_names
self.config=config
self.scale_size=config.input_scale_size
#self.data_format=config.data_format
self.split=config.split
self.do_shuffle=config.do_shuffle
self.num_worker=config.num_worker
self.is_crop=config.is_crop
self.is_grayscale=config.grayscale
attr_file= glob("{}/*.{}".format(config.data_path, 'txt'))[0]
setattr(config,'attr_file',attr_file)
attributes = pd.read_csv(config.attr_file,delim_whitespace=True) #+-1
#Store all labels for reference
self.all_attr= 0.5*(attributes+1)# attributes is {0,1}
self.all_label_means=self.all_attr.mean()
#but only return desired labels in queues
self.attr=self.all_attr[label_names]
self.label_means=self.attr.mean()# attributes is 0,1
self.image_dir=os.path.join(config.data_path,'images')
self.filenames=[os.path.join(self.image_dir,j) for j in self.attr.index]
self.num_examples_per_epoch=len(self.filenames)
self.min_fraction_of_examples_in_queue=0.001#go faster during debug
#self.min_fraction_of_examples_in_queue=0.01
self.min_queue_examples=int(self.num_examples_per_epoch*self.min_fraction_of_examples_in_queue)
def get_label_queue(self,batch_size):
tf_labels = tf.convert_to_tensor(self.attr.values, dtype=tf.uint8)#0,1
with tf.name_scope('label_queue'):
uint_label=tf.train.slice_input_producer([tf_labels])[0]
label=tf.to_float(uint_label)
#All labels, not just those in causal_model
dict_data={sl:tl for sl,tl in
zip(self.label_names,tf.split(label,len(self.label_names)))}
num_preprocess_threads = max(self.num_worker-3,1)
data_batch = tf.train.shuffle_batch(
dict_data,
batch_size=batch_size,
num_threads=num_preprocess_threads,
capacity=self.min_queue_examples + 3 * batch_size,
min_after_dequeue=self.min_queue_examples,
)
return data_batch
def get_data_queue(self,batch_size):
image_files = tf.convert_to_tensor(self.filenames, dtype=tf.string)
tf_labels = tf.convert_to_tensor(self.attr.values, dtype=tf.uint8)
with tf.name_scope('filename_queue'):
#must be list
str_queue=tf.train.slice_input_producer([image_files,tf_labels])
img_filename, uint_label= str_queue
img_contents=tf.read_file(img_filename)
image = tf.image.decode_jpeg(img_contents, channels=3)
image=tf.cast(image,dtype=tf.float32)
if self.config.is_crop:#use dcgan cropping
#dcgan center-crops input to 108x108, outputs 64x64 #centrally crops it #We emulate that here
image=tf.image.resize_image_with_crop_or_pad(image,108,108)
#image=tf.image.resize_bilinear(image,[scale_size,scale_size])#must be 4D
resize_method=getattr(tf.image.ResizeMethod,self.config.resize_method)
image=tf.image.resize_images(image,[self.scale_size,self.scale_size],
method=resize_method)
#Some dataset enlargement. Might as well.
image=tf.image.random_flip_left_right(image)
##carpedm-began crops to 128x128 starting at (50,25), then resizes to 64x64
#image=tf.image.crop_to_bounding_box(image, 50, 25, 128, 128)
#image=tf.image.resize_nearest_neighbor(image, [scale_size, scale_size])
tf.summary.image('real_image',tf.expand_dims(image,0))
label=tf.to_float(uint_label)
#Creates a dictionary {'Male',male_tensor, 'Young',young_tensor} etc..
dict_data={sl:tl for sl,tl in
zip(self.label_names,tf.split(label,len(self.label_names)))}
assert not 'x' in dict_data.keys()#don't have a label named "x"
dict_data['x']=image
print ('Filling queue with %d Celeb images before starting to train. '
'I don\'t know how long this will take' % self.min_queue_examples)
num_preprocess_threads = max(self.num_worker,1)
data_batch = tf.train.shuffle_batch(
dict_data,
batch_size=batch_size,
num_threads=num_preprocess_threads,
capacity=self.min_queue_examples + 3 * batch_size,
min_after_dequeue=self.min_queue_examples,
)
return data_batch