-
Notifications
You must be signed in to change notification settings - Fork 3
/
csv_split.py
104 lines (82 loc) · 3.28 KB
/
csv_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import argparse
import csv
import os
import sys
def get_arguments():
    """Grab and validate user supplied arguments using the argparse library.

    Returns:
        A tuple ``(input_file, output_file, row_limit)``.

    Any invalid argument is reported through ``parser.error``, which prints
    the usage message and terminates the program with exit status 2.
    """
    # Use argparse to get command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", required=True,
                        help="csv input file (with extension)", type=str)
    parser.add_argument("-o", "--output_file", required=True,
                        help="csv output file (without extension)", type=str)
    parser.add_argument("-r", "--row_limit", required=True,
                        help="row limit to split csv at", type=int)
    args = parser.parse_args()

    # row_limit is used as the chunk size: zero cannot be used as a step
    # and a negative value would produce no chunks at all, so reject both
    # before touching the file system.
    if args.row_limit < 1:
        parser.error("The 'row_limit' must be a positive integer!")

    # Check if the input_file exists
    is_valid_file(parser, args.input_file)

    # Check if the input_file has enough rows for the requested row_limit
    is_valid_csv(parser, args.input_file, args.row_limit)

    return args.input_file, args.output_file, args.row_limit
def is_valid_file(parser, file_name):
    """Ensure that the input_file exists and is a regular file.

    Args:
        parser: the argparse parser used to report the problem.
        file_name: path of the input file to check.

    On failure ``parser.error`` prints the usage message and exits with
    status 2, so no explicit ``sys.exit`` is needed afterwards.
    """
    if not os.path.exists(file_name):
        parser.error("The file '{}' does not exist!".format(file_name))
    # A directory exists but cannot be opened as a CSV file; report it here
    # instead of letting open() fail later with a traceback.
    if not os.path.isfile(file_name):
        parser.error("'{}' is not a regular file!".format(file_name))
def is_valid_csv(parser, file_name, row_limit):
    """
    Ensure that the row_limit does not exceed the # of rows
    in the input_file.

    Every row yielded by csv.reader is counted, including the first
    (header) row. A row_limit equal to the row count is accepted.

    Args:
        parser: the argparse parser used to report the problem.
        file_name: path of the CSV file to inspect.
        row_limit: requested maximum number of rows per chunk.

    On failure ``parser.error`` prints the usage message and exits with
    status 2.
    """
    # newline='' lets the csv module do its own line-ending handling, and
    # the context manager closes the file as soon as counting is done.
    with open(file_name, newline='') as input_csv:
        row_count = sum(1 for _ in csv.reader(input_csv))

    if row_limit > row_count:
        parser.error(
            "The 'row_limit' of '{}' is > the number of rows in '{}'!"
            .format(row_limit, file_name)
        )
def parse_file(arguments):
    """
    Splits the CSV into multiple files or chunks based on the row_limit.
    Then create new CSV files.

    The first row of the input is treated as the header and is repeated at
    the top of every chunk. Chunks are written as "<output_file>-<n>.csv"
    (n starts at 1), joined onto the current directory, and a short summary
    is printed for each file written. An input file with no rows produces
    no output files.

    Args:
        arguments: sequence holding (input_file, output_file, row_limit).

    Raises:
        ValueError: if row_limit is less than 1.
    """
    input_file = arguments[0]
    output_file = arguments[1]
    row_limit = arguments[2]
    output_path = '.'  # Current directory

    # row_limit is the step of the chunking loop below; zero is not a valid
    # step and a negative one would silently yield no chunks.
    if row_limit < 1:
        raise ValueError(
            "row_limit must be a positive integer, got {}".format(row_limit)
        )

    # Read CSV, split into list of lists. newline='' leaves line-ending
    # handling to the csv module, as its documentation requires.
    with open(input_file, 'r', newline='') as input_csv:
        all_rows = list(csv.reader(input_csv))

    # Nothing to split (not even a header row)
    if not all_rows:
        return

    # Separate the header from the data rows
    header, data_rows = all_rows[0], all_rows[1:]

    # Split list of lists into chunks, numbering the chunks from 1
    starts = range(0, len(data_rows), row_limit)
    for current_chunk, start in enumerate(starts, 1):
        # Header first, then up to row_limit data rows
        chunk = [header] + data_rows[start:start + row_limit]

        current_output = os.path.join(  # Create new output file
            output_path,
            "{}-{}.csv".format(output_file, current_chunk)
        )

        # Write chunk to output file; newline='' prevents the extra blank
        # line between rows on platforms that translate '\n' on write.
        with open(current_output, 'w', newline='') as output_csv:
            csv.writer(output_csv).writerows(chunk)

        # Output info (the row count includes the header row)
        print("")
        print("Chunk # {}:".format(current_chunk))
        print("Filepath: {}".format(current_output))
        print("# of rows: {}".format(len(chunk)))
if __name__ == "__main__":
    # Script entry point: collect and validate the command line options,
    # then split the input file accordingly.
    parse_file(get_arguments())