-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfile_handler.py
82 lines (63 loc) · 2.12 KB
/
file_handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import zipfile
import fnmatch
import csv
import pandas
def extract_file(path, zip_name):
    """Unpack every member of the zip archive ``zip_name`` into ``path``.

    Args:
        path: Destination directory handed to ``ZipFile.extractall``.
        zip_name: Location of the zip archive to open for reading.
    """
    with zipfile.ZipFile(zip_name, mode='r') as archive:
        archive.extractall(path=path)
def filter_files(path, pattern):
    """List the entries of ``path`` whose names contain ``pattern``.

    ``pattern`` is wrapped in ``*`` wildcards and matched with ``fnmatch``,
    so any glob metacharacters inside it keep their special meaning.

    Args:
        path: Directory whose entries are listed with ``os.listdir``.
        pattern: Fragment that must appear somewhere in the entry name.

    Returns:
        A list of the matching entry names (names only, not full paths).
    """
    entries = os.listdir(path)
    glob_expr = '*{}*'.format(pattern)
    return fnmatch.filter(entries, glob_expr)
def format_file_data(path, file_name):
    """Read ``file_name`` from ``path`` and return its formatted contents.

    The reader/formatter pair is picked from the file extension:
    ``.txt`` (comma-separated text), ``.xlsx`` (Excel) or ``.tsv``
    (tab-separated values).

    Args:
        path: Directory containing the file, with or without a trailing
            path separator.
        file_name: Name of the file inside ``path``. If it is an absolute
            path, ``os.path.join`` uses it as-is and ignores ``path``.

    Returns:
        Whatever the matching ``format_*_file_data`` function returns for
        the file's contents.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    # os.path.join adds the separator only when ``path`` lacks one, so both
    # 'data' and 'data/' resolve to the same file; plain string concatenation
    # silently produced 'datafile.txt' for the former.
    file_path = os.path.join(path, file_name)

    if file_path.endswith('.txt'):
        return format_text_file_data(read_text_file_data(file_path))
    if file_path.endswith('.xlsx'):
        return format_xlsx_file_data(read_xlsx_file_data(file_path))
    if file_path.endswith('.tsv'):
        return format_tsv_file_data(read_tsv_file_data(file_path))

    # ValueError is a subclass of Exception, so existing
    # ``except Exception`` handlers keep working.
    raise ValueError('Invalid file extension')
def read_text_file_data(file_path):
    """Return the whole content of the text file at ``file_path`` as one string.

    The file is opened in text read mode with the platform default encoding.
    """
    with open(file_path, mode='r') as source:
        return source.read()
def format_text_file_data(data):
    """Convert comma-separated text into a list of row dictionaries.

    Empty lines are skipped. The first remaining line is the header; its
    column names are stripped of surrounding whitespace. Every following
    line becomes a dict mapping each header name to the value at the same
    position. Values are kept exactly as they appear (not stripped).
    A line with fewer values than the header yields a dict with only the
    leading columns.

    Args:
        data: Full text content, with lines separated by newline characters.

    Returns:
        A list with one dict per data line; an empty list when ``data``
        contains no non-empty lines.

    Raises:
        IndexError: If a data line holds more values than the header has
            columns.
    """
    lines = [line for line in data.split('\n') if line]
    if not lines:
        # Nothing to parse: previously this crashed on ``pop`` from an
        # empty list.
        return []

    header = [name.strip() for name in lines[0].split(',')]
    return [
        {header[idx]: value for idx, value in enumerate(line.split(','))}
        for line in lines[1:]
    ]
def read_tsv_file_data(file_path):
    """Read a tab-separated file and return its rows.

    Fields may be quoted with double quotes.

    Args:
        file_path: Location of the TSV file.

    Returns:
        A list of rows, each row being a list of string fields.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are altered by universal-newline translation.
    with open(file_path, newline='') as file_ref:
        return list(csv.reader(file_ref, delimiter="\t", quotechar='"'))
def format_tsv_file_data(data):
    """Convert parsed TSV rows into a list of row dictionaries.

    Empty rows are skipped. The first remaining row is the header; every
    following row becomes a dict mapping each header name to the value at
    the same position. A row with fewer values than the header yields a
    dict with only the leading columns. The input is not modified.

    Args:
        data: Iterable of rows, each row a sequence of field values.

    Returns:
        A list with one dict per data row; an empty list when ``data``
        contains no non-empty rows.

    Raises:
        IndexError: If a data row holds more values than the header has
            columns.
    """
    rows = [row for row in data if row]
    if not rows:
        # Nothing to format: previously this crashed on ``pop`` from an
        # empty list.
        return []

    header = rows[0]
    return [
        {header[idx]: value for idx, value in enumerate(row)}
        for row in rows[1:]
    ]
def read_xlsx_file_data(file_path):
    """Load the Excel file at ``file_path`` with ``pandas.read_excel``.

    Returns:
        The object produced by ``pandas.read_excel`` called with default
        options.
    """
    return pandas.read_excel(file_path)
def format_xlsx_file_data(data):
    """Turn tabular ``data`` into a list of per-row dictionaries.

    Args:
        data: Object exposing ``to_dict`` (the result of
            ``read_xlsx_file_data``).

    Returns:
        The result of ``data.to_dict`` using the ``'records'`` orientation:
        one dict per row, keyed by column name.
    """
    return data.to_dict(orient='records')