forked from microsoft/DNS-Challenge
-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathsplit_dns_corpus.py
110 lines (93 loc) · 3.89 KB
/
split_dns_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This script contains a function to separate the DNS-Challenge dataset into a
training set and a validation set.
Author: Nils L. Westhausen ([email protected])
Version: 13.05.2020
This code is licensed under the terms of the MIT-license.
"""
import os
from random import seed, sample
from shutil import move
def create_corpus(path, path_new, percent_of_train_material=80, random_seed=42):
    '''
    Split a DNS-Challenge style dataset into a training and a validation set.

    Files are MOVED (not copied) from the three source sub-directories
    (noisy, noise, clean) under ``path`` into ``path_new/train/<name>`` and
    ``path_new/val/<name>``, after which the empty source directories are
    removed. Corresponding noisy/noise/clean files are assumed to share the
    same file name, so the three streams stay aligned after the split.

    Parameters
    ----------
    path : STRING
        Path to dataset directory with three sub directories (noisy,noise,clean).
    path_new : STRING
        Target directory for the divided data set.
    percent_of_train_material : INT
        Percentage of training material (Default = 80%).
    random_seed : INT
        Seed for the random sampling so the split is reproducible
        (Default = 42, matching the original behavior).

    Returns
    -------
    None.
    '''
    # Seed the RNG so the train/val split is reproducible.
    seed(random_seed)
    # Folder names. Change if your dataset uses other names than the default.
    # order: noisy (mix), noise, clean (speech)
    names = ["noisy", "noise", "clean"]
    # Source directories and their train/val destinations, index-aligned.
    src_dirs = [os.path.join(path, name) for name in names]
    train_dirs = [os.path.join(path_new, 'train', name) for name in names]
    val_dirs = [os.path.join(path_new, 'val', name) for name in names]
    # Create all destination directories (no-op if they already exist).
    for directory in train_dirs + val_dirs:
        os.makedirs(directory, exist_ok=True)
    # The noisy directory defines the set of file names; noise and clean are
    # assumed to contain identically named counterparts.
    file_names = os.listdir(src_dirs[0])
    # Randomly pick the training subset; everything else is validation.
    # A set lookup replaces the original repeated list.remove (O(n^2)).
    n_train = int(percent_of_train_material / 100 * len(file_names))
    train_names = sample(file_names, n_train)
    train_set = set(train_names)
    val_names = [name for name in file_names if name not in train_set]
    # Move each file's noisy/noise/clean triple into its destination split.
    for subset, dest_dirs in ((train_names, train_dirs), (val_names, val_dirs)):
        for file in subset:
            for src_dir, dest_dir in zip(src_dirs, dest_dirs):
                move(os.path.join(src_dir, file), os.path.join(dest_dir, file))
    # Remove the now-empty source directories.
    for src_dir in src_dirs:
        os.rmdir(src_dir)
    print('Data set divided successfully.')
def main():
    """Entry point: split ./training_set in place into train/ and val/."""
    create_corpus('./training_set', './training_set')


if __name__ == '__main__':
    main()