diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d7110fa
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+/Include
+/Lib
+/Scripts
+/tcl
+/datasets
+/logdir/train/*
+pip-selfcheck.json
+__pycache__
\ No newline at end of file
diff --git a/samplernn/audio_reader.py b/samplernn/audio_reader.py
index 2f99aa7..3cc5d3b 100644
--- a/samplernn/audio_reader.py
+++ b/samplernn/audio_reader.py
@@ -11,10 +11,10 @@ import tensorflow as tf
 
 
 def randomize_files(files):
-    files_idx= [i for i in xrange(len(files))]
+    files_idx= [i for i in range(len(files))]
     random.shuffle(files_idx)
 
-    for idx in xrange(len(files)):
+    for idx in range(len(files)):
         yield files[files_idx[idx]]
 
 def find_files(directory, pattern='*.wav'):
@@ -97,7 +97,7 @@ def thread_main(self, sess):
                         np.full((pad_elements, 1), 0.0, dtype='float32')],
                         axis=0)
                 if self.sample_size:
-                    while len(audio) > self.sample_size:
+                    while len(audio) >= self.sample_size:
                         piece = audio[:self.sample_size, :]
                         sess.run(self.enqueue,
                                  feed_dict={self.sample_placeholder: piece})
diff --git a/samplernn/model.py b/samplernn/model.py
index d91fe5d..facb5a7 100644
--- a/samplernn/model.py
+++ b/samplernn/model.py
@@ -53,7 +53,7 @@ def _create_network_BigFrame(self,
         with tf.variable_scope('BigFrame_layer'):
             big_input_frames = tf.reshape(big_input_sequences,[
                                    tf.shape(big_input_sequences)[0],
-                                   tf.shape(big_input_sequences)[1] / self.big_frame_size,
+                                   tf.shape(big_input_sequences)[1] // self.big_frame_size,
                                    self.big_frame_size])
             big_input_frames = (big_input_frames / self.q_levels/2.0) - 1.0
             big_input_frames *= 2.0
@@ -71,7 +71,7 @@ def _create_network_BigFrame(self,
            big_frame_outputs = tf.transpose(big_frame_outputs, perm=[1, 0, 2])
            big_frame_outputs = tf.reshape(big_frame_outputs,
                                  [tf.shape(big_frame_outputs)[0],
-                                  tf.shape(big_frame_outputs)[1] * self.big_frame_size/self.frame_size,
+                                  tf.shape(big_frame_outputs)[1] * self.big_frame_size // self.frame_size,
                                   -1])
            return big_frame_outputs,final_big_frame_state
     def _create_network_Frame(self,
@@ -82,7 +82,7 @@
         with tf.variable_scope('Frame_layer'):
             input_frames = tf.reshape(input_sequences,[
                                tf.shape(input_sequences)[0],
-                               tf.shape(input_sequences)[1] / self.frame_size,
+                               tf.shape(input_sequences)[1] // self.frame_size,
                                self.frame_size])
             input_frames = (input_frames / self.q_levels/2.0) - 1.0
             input_frames *= 2.0
@@ -146,7 +146,7 @@ def _create_network_Sample(self,
            out = math_ops.matmul(out, sample_mlp2_weights)
            out = tf.nn.relu(out)
            out = math_ops.matmul(out, sample_mlp3_weights)
-           out = tf.reshape(out, [-1, sample_shap[1]/self.emb_size - 1, self.q_levels])
+           out = tf.reshape(out, [-1, sample_shap[1] // self.emb_size - 1, self.q_levels])
            return out
     def _create_network_SampleRnn(self,
                                   train_big_frame_state,
@@ -155,7 +155,7 @@
         #big frame
         big_input_sequences = tf.cast(self.encoded_input_rnn, tf.float32)\
             [:,:-self.big_frame_size,:]
-        big_frame_num_steps = (self.seq_len-self.big_frame_size)/self.big_frame_size
+        big_frame_num_steps = (self.seq_len-self.big_frame_size) // self.big_frame_size
         big_frame_outputs,\
         final_big_frame_state = \
             self._create_network_BigFrame(num_steps = big_frame_num_steps,
@@ -164,7 +164,7 @@
         #frame
         input_sequences = tf.cast(self.encoded_input_rnn, tf.float32)[:,
             self.big_frame_size-self.frame_size:-self.frame_size, :]
-        frame_num_steps = (self.seq_len-self.big_frame_size)/self.frame_size
+        frame_num_steps = (self.seq_len-self.big_frame_size) // self.frame_size
         frame_outputs, final_frame_state = \
             self._create_network_Frame(num_steps = frame_num_steps,
                                        big_frame_outputs = big_frame_outputs,
diff --git a/train.py b/train.py
index f82179e..671cf35 100644
--- a/train.py
+++ b/train.py
@@ -14,18 +14,19 @@
 
 DATA_DIRECTORY = './pinao-corpus'
 LOGDIR_ROOT = './logdir'
-CHECKPOINT_EVERY = 20
+CHECKPOINT_EVERY = 5
+GENERATE_EVERY = 10
 NUM_STEPS = int(1e5)
 LEARNING_RATE = 1e-3
 SAMPLE_SIZE = 100000
 L2_REGULARIZATION_STRENGTH = 0
-SILENCE_THRESHOLD = 0.3
+SILENCE_THRESHOLD = None
 MOMENTUM = 0.9
 MAX_TO_KEEP = 5
 
 N_SECS = 3
-SAMPLE_RATE = 16000
-LENGTH = N_SECS*SAMPLE_RATE
+SAMPLE_RATE = 22050
+LENGTH = N_SECS * SAMPLE_RATE
 
 BATCH_SIZE = 1
 NUM_GPU = 1
@@ -51,7 +52,7 @@ def get_arguments():
     parser.add_argument('--frame_size', type=int, required=True)
     parser.add_argument('--q_levels', type=int, required=True)
     parser.add_argument('--dim', type=int, required=True)
-    parser.add_argument('--n_rnn', type=int, choices=xrange(1,6), required=True)
+    parser.add_argument('--n_rnn', type=int, choices=list(range(1,6)), required=True)
     parser.add_argument('--emb_size', type=int, required=True)
     parser.add_argument('--rnn_type', choices=['LSTM', 'GRU'], required=True)
     parser.add_argument('--max_checkpoints', type=int, default=MAX_TO_KEEP)
@@ -212,7 +213,7 @@ def generate_and_save_samples(step, net, infe_para, sess):
     big_frame_out = None
     frame_out = None
     sample_out = None
-    for t in xrange(net.big_frame_size, LENGTH):
+    for t in range(net.big_frame_size, LENGTH):
         #big frame
         if t % net.big_frame_size == 0:
             big_frame_out = None
@@ -226,14 +227,18 @@ def generate_and_save_samples(step, net, infe_para, sess):
         #frame
         if t % net.frame_size == 0:
             frame_input_sequences = samples[:, t-net.frame_size:t,:].astype('float32')
-            big_frame_output_idx = (t/net.frame_size)%(net.big_frame_size/net.frame_size)
-            frame_out, final_s= \
-                sess.run([infe_para['infe_frame_outp'],
-                          infe_para['infe_final_frame_state']],
-                         feed_dict={
-                             infe_para['infe_big_frame_outp_slices'] : big_frame_out[:,[big_frame_output_idx],:],
-                             infe_para['infe_frame_inp'] : frame_input_sequences,
-                             infe_para['infe_frame_state'] : final_s})
+            big_frame_output_idx = (t // net.frame_size) % (net.big_frame_size // net.frame_size)
+            frame_out, final_s = sess.run(
+                [
+                    infe_para['infe_frame_outp'],
+                    infe_para['infe_final_frame_state']
+                ],
+                feed_dict={
+                    infe_para['infe_big_frame_outp_slices'] : big_frame_out[:,[big_frame_output_idx],:],
+                    infe_para['infe_frame_inp'] : frame_input_sequences,
+                    infe_para['infe_frame_state'] : final_s
+                }
+            )
         #sample
         sample_input_sequences = samples[:, t-net.frame_size:t,:]
         frame_output_idx = t%net.frame_size
@@ -248,11 +253,11 @@ def generate_and_save_samples(step, net, infe_para, sess):
                 np.arange(net.q_levels), p=row
             )
             sample_next_list.append(sample_next)
         samples[:, t] = np.array(sample_next_list).reshape([-1,1])
-    for i in range(0, net.batch_size):
+    for i in range(net.batch_size):
         inp = samples[i].reshape([-1,1]).tolist()
         out = sess.run(infe_para['infe_decode'],
                        feed_dict={infe_para['infe_sample_decode_inp']: inp})
-        write_wav(out, 16000, './test_'+str(step)+'_'+str(i)+'.wav')
+        write_wav(out, SAMPLE_RATE, './generated/test_'+str(step)+'_'+str(i)+'.wav')
         if i >= 10:
             break
@@ -284,7 +289,7 @@ def main():
     train_frame_state = []
     final_big_frame_state = []
     final_frame_state = []
-    for i in xrange(args.num_gpus):
+    for i in range(args.num_gpus):
        train_input_batch_rnn.append(tf.Variable(
           tf.zeros([net.batch_size, net.seq_len,1]), trainable=False ,name="input_batch_rnn", dtype=tf.float32))
        train_big_frame_state.append(net.big_cell.zero_state(net.batch_size, tf.float32))
@@ -292,7 +297,7 @@
        train_frame_state.append (net.cell.zero_state(net.batch_size, tf.float32))
        final_frame_state.append (net.cell.zero_state(net.batch_size, tf.float32))
     with tf.variable_scope(tf.get_variable_scope()):
-        for i in xrange(args.num_gpus):
+        for i in range(args.num_gpus):
             with tf.device('/gpu:%d' % i):
                 with tf.name_scope('TOWER_%d' % (i)) as scope:
                     # Create model.
@@ -308,16 +313,17 @@
                     losses.append(loss)
                     # Reuse variables for the next tower.
                     trainable = tf.trainable_variables()
-                    gradients = optim.compute_gradients(loss,trainable,\
-                        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
+                    gradients = optim.compute_gradients(loss,trainable, \
+                        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
                     tower_grads.append(gradients)
 
     grad_vars = average_gradients(tower_grads)
-    grads, vars = zip(*grad_vars)
+    grads, vars = list(zip(*grad_vars))
     grads_clipped, _ = tf.clip_by_global_norm(grads, 5.0)
-    grad_vars = zip(grads_clipped, vars)
+    grad_vars = list(zip(grads_clipped, vars))
     for name in grad_vars:
-        print(name)
+        print(name)
+
     apply_gradient_op = optim.apply_gradients(grad_vars, global_step=global_step)
     #################
     infe_para = create_gen_wav_para(net)
@@ -357,21 +363,21 @@ def main():
     last_saved_step = saved_global_step
     try:
         for step in range(saved_global_step + 1, args.num_steps):
-            if (step-1) % 20 == 0 and step>20:
+            if (step-1) % GENERATE_EVERY == 0 and step > GENERATE_EVERY:
                 generate_and_save_samples(step,net, infe_para, sess)
             final_big_s = []
             final_s = []
-            for g in xrange(args.num_gpus):
+            for g in range(args.num_gpus):
                 final_big_s.append(sess.run(net.big_initial_state))
                 final_s.append(sess.run(net.initial_state))
 
             start_time = time.time()
-            inputslist = [sess.run(audio_batch) for i in xrange(args.num_gpus)]
+            inputslist = [sess.run(audio_batch) for i in range(args.num_gpus)]
             loss_sum = 0;
             idx_begin=0
             audio_length = args.sample_size - args.big_frame_size
             bptt_length = args.seq_len - args.big_frame_size
-            stateful_rnn_length = audio_length/bptt_length
+            stateful_rnn_length = audio_length // bptt_length
             outp_list=[summaries,\
                        losses, \
                        apply_gradient_op, \
@@ -379,7 +385,7 @@ def main():
                        final_frame_state]
             for i in range(0, stateful_rnn_length):
                 inp_dict={}
-                for g in xrange(args.num_gpus):
+                for g in range(args.num_gpus):
                     inp_dict[train_input_batch_rnn[g]] = \
                         inputslist[g][:, idx_begin: idx_begin+args.seq_len,:]
                     inp_dict[train_big_frame_state[g]] = final_big_s[g]
@@ -389,7 +395,7 @@ def main():
                 summary, loss_gpus,_, final_big_s, final_s= \
                     sess.run(outp_list, feed_dict=inp_dict)
                 writer.add_summary(summary, step)
-                for g in xrange(args.num_gpus):
+                for g in range(args.num_gpus):
                     loss_gpu = loss_gpus[g]/stateful_rnn_length
                     loss_sum += loss_gpu/args.num_gpus
                     duration = time.time() - start_time
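
Note on the `audio_reader.py` change: in Python 3, `/` is true division, which is why the counting expressions throughout this patch move to `//`; independently, the enqueue loop's comparison moves from `>` to `>=` so that a clip whose padded length is an exact multiple of `sample_size` still yields its final piece. A minimal sketch of that boundary case, using a hypothetical `chunk_audio` stand-in for `thread_main`'s enqueue loop rather than the real TensorFlow queue:

```python
import numpy as np

def chunk_audio(audio, sample_size):
    # Hypothetical stand-in for the enqueue loop in thread_main:
    # collect consecutive sample_size-length pieces of the clip.
    pieces = []
    while len(audio) >= sample_size:   # the old `>` dropped the last exact-size piece
        pieces.append(audio[:sample_size, :])
        audio = audio[sample_size:, :]
    return pieces

audio = np.zeros((300000, 1), dtype='float32')  # exact multiple of sample_size
print(len(chunk_audio(audio, 100000)))          # 3 pieces; the old `>` gave only 2
```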
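Note on the `train.py` loop arithmetic: `stateful_rnn_length` is passed to `range()`, so under Python 3 it must stay an integer; with plain `/` it would become a float and `range()` would raise `TypeError`. A sketch of the bookkeeping with hypothetical `--seq_len` and `--big_frame_size` values (only `SAMPLE_SIZE` comes from the defaults in this patch):

```python
sample_size    = 100000  # SAMPLE_SIZE default above
seq_len        = 520     # hypothetical --seq_len
big_frame_size = 8       # hypothetical --big_frame_size

audio_length = sample_size - big_frame_size        # samples left after the warm-up frame
bptt_length  = seq_len - big_frame_size            # samples consumed per BPTT window
stateful_rnn_length = audio_length // bptt_length  # truncated-BPTT windows per clip

print(stateful_rnn_length)          # 195: an int, so range() accepts it
for i in range(stateful_rnn_length):
    pass                            # with `/` this line would raise TypeError
```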