-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmysqldump_to_csv.py
executable file
·125 lines (102 loc) · 4.03 KB
/
mysqldump_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
import fileinput
import csv
import sys
import re
from signal import signal, SIGPIPE, SIG_DFL
# Prevent prematurely closed pipes from raising an exception in Python
signal(SIGPIPE, SIG_DFL)
# Allow large content in the dump
csv.field_size_limit(sys.maxsize)
# List of required columns and their positions in the SQL data
REQUIRED_FIELDS = {
"LOGIN": 2,
"NAME": 6,
"LAST_NAME": 7,
"EMAIL": 8,
"PERSONAL_BIRTHDATE": 16,
"TIMESTAMP_X": 1,
"PERSONAL_PHONE": 18,
"PERSONAL_MOBILE": 20,
"PERSONAL_MAILBOX": 23,
"PERSONAL_CITY": 24,
"PERSONAL_STATE": 25,
"WORK_CITY": 37,
"PASSWORD": 3,
"CHECKWORD": 4,
}
LINES_PER_FILE = 10000 # Maximum lines per CSV file
def is_insert(line):
"""Checks if the line starts a SQL INSERT INTO statement."""
return line.strip().startswith('INSERT INTO')
def get_values(statement):
"""Extracts the values portion of an INSERT INTO statement."""
_, _, values = statement.partition(' VALUES ')
return values.strip().rstrip(';')
def parse_values(values, writer, line_counter):
"""Parses the values and writes them to the CSV writer."""
rows = re.split(r"\),\s*\(", values.strip("()"))
for row in rows:
# Extract values (handle quoted strings, NULLs, and numbers)
columns = re.findall(r"(?:'([^']*)'|NULL|\d+)", row)
# Map values to the required fields
result_row = [
columns[index] if index < len(columns) and columns[index] != "NULL" else ""
for field, index in REQUIRED_FIELDS.items()
]
writer.writerow(result_row)
line_counter[0] += 1
# Check if we need to switch to a new file
if line_counter[0] >= LINES_PER_FILE:
line_counter[0] = 0
return True # Signal to switch files
return False
def process_insert(statement, writer, line_counter):
"""Processes a single INSERT INTO statement."""
try:
values = get_values(statement)
return parse_values(values, writer, line_counter)
except Exception as e:
print(f"Error processing statement: {e}", file=sys.stderr)
return False
def main():
"""Main function to process SQL dump and split into multiple CSV files."""
buffer = ""
file_count = 0
line_counter = [0] # Track lines written to current file
try:
current_file = open(f"apteka_{file_count}.csv", "w", newline="", encoding="utf-8")
writer = csv.writer(current_file, quoting=csv.QUOTE_MINIMAL)
writer.writerow(REQUIRED_FIELDS.keys()) # Write header once per file
for line in fileinput.input():
line = line.strip()
if is_insert(line):
if buffer:
if process_insert(buffer, writer, line_counter):
current_file.close()
file_count += 1
current_file = open(f"apteka_{file_count}.csv", "w", newline="", encoding="utf-8")
writer = csv.writer(current_file, quoting=csv.QUOTE_MINIMAL)
writer.writerow(REQUIRED_FIELDS.keys()) # Write header for new file
buffer = ""
buffer = line
elif buffer:
buffer += " " + line
if line.endswith(";"):
if process_insert(buffer, writer, line_counter):
current_file.close()
file_count += 1
current_file = open(f"apteka_{file_count}.csv", "w", newline="", encoding="utf-8")
writer = csv.writer(current_file, quoting=csv.QUOTE_MINIMAL)
writer.writerow(REQUIRED_FIELDS.keys()) # Write header for new file
buffer = ""
if buffer:
process_insert(buffer, writer, line_counter)
current_file.close()
except KeyboardInterrupt:
sys.exit(0)
except Exception as e:
print(f"Unhandled exception: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == "__main__":
main()