forked from tiyd-python-2015-05/django-movies
-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_ml_1m_data.py
113 lines (100 loc) · 3.62 KB
/
convert_ml_1m_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import csv
import json
import datetime
# Convert data/ml-1m/users.dat into a fixture list of "pymdb.Rater" records
# and write it to movieratings/fixtures/users.json.
print("Converting users...")
with open("data/ml-1m/users.dat") as users_file:
    # The input separates fields with "::"; csv.reader only accepts a
    # one-character delimiter, so each line is rewritten to use ";" first.
    user_rows = csv.reader(
        (raw_line.replace("::", ";") for raw_line in users_file),
        delimiter=";",
    )
    # Columns 0-4 become the primary key, gender, age, job and zip code.
    users = [
        {
            "model": "pymdb.Rater",
            "pk": fields[0],
            "fields": {
                "gender": fields[1],
                "age": fields[2],
                "job": fields[3],
                "zip_code": fields[4],
            },
        }
        for fields in user_rows
    ]

users_json = json.dumps(users)
with open("movieratings/fixtures/users.json", "w") as fixture_file:
    fixture_file.write(users_json)
# Genre names in a fixed order; a genre's position in this list is the
# integer id used for it everywhere else in the script.
genres_list = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

# Lookup table: genre name -> zero-based position in genres_list.
genre_dict = {name: position for position, name in enumerate(genres_list)}
# Emit one "pymdb.Genre" fixture record per entry of genres_list and write
# the list to movieratings/fixtures/genres.json.
print("Converting genres...")
# The primary key comes from genre_dict; no separate "id" field is emitted.
genres = [
    {
        "model": "pymdb.Genre",
        "pk": genre_dict[genre_name],
        "fields": {
            "name": genre_name,
        },
    }
    for genre_name in genres_list
]

genres_json = json.dumps(genres)
with open("movieratings/fixtures/genres.json", "w") as fixture_file:
    fixture_file.write(genres_json)
# Convert data/ml-1m/movies.dat into a fixture list of "pymdb.Movie" records
# and write it to movieratings/fixtures/movies.json.
print("Converting movies...")
movies = []
with open("data/ml-1m/movies.dat", encoding="windows-1252") as infile:
    for line in infile:
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        # instead of failing on a missing column.
        if not line.strip():
            continue
        # Split on the literal "::" separator. The earlier approach rewrote
        # "::" to "_" and parsed with csv.reader, which broke any title that
        # contained an underscore (extra columns) or a double quote (csv
        # quote handling altered the text).
        row = line.rstrip("\n").split("::")
        # print('0:', row[0], '1:', row[1], '2:', row[2])
        movies.append({
            "model": "pymdb.Movie",
            "pk": row[0],
            "fields": {
                "title": row[1],
                # Column 2 is a "|"-separated list of genre names; each one
                # is translated to its integer id via genre_dict. An unknown
                # name raises KeyError.
                "genre": [genre_dict[name] for name in row[2].split('|')],
            },
        })

with open("movieratings/fixtures/movies.json", "w") as outfile:
    outfile.write(json.dumps(movies))
# Convert data/ml-1m/ratings.dat into a fixture list of "pymdb.Rating"
# records and write it to movieratings/fixtures/ratings.json.
print("Converting ratings...")
ratings = []
with open("data/ml-1m/ratings.dat") as ratings_file:
    # Rewrite the "::" separator to ";" so csv.reader can split the columns.
    rating_rows = csv.reader(
        (raw_line.replace("::", ";") for raw_line in ratings_file),
        delimiter=";",
    )
    # The input carries no id column, so primary keys are assigned
    # sequentially, starting at 1, in file order.
    for pk, fields in enumerate(rating_rows, start=1):
        # Column 3 is parsed as an integer POSIX timestamp and stored as the
        # string form of a timezone-aware UTC datetime.
        rated_at = datetime.datetime.fromtimestamp(
            int(fields[3]), tz=datetime.timezone.utc
        )
        record = {
            "model": "pymdb.Rating",
            "pk": pk,
            "fields": {
                "rater": fields[0],
                "movie": fields[1],
                "rating": fields[2],
                "time_added": str(rated_at),
            },
        }
        ratings.append(record)

ratings_json = json.dumps(ratings)
with open("movieratings/fixtures/ratings.json", "w") as fixture_file:
    fixture_file.write(ratings_json)
# Disabled alternative: export the raw .dat files as plain CSV instead of
# JSON fixtures.
#
# print("Converting movies...")
# with open("data/ml-1m/movies.dat", encoding="windows-1252") as infile:
#     reader = csv.reader((line.replace("::", ";") for line in infile),
#                         delimiter=";")
#     with open("data/ml-1m/movies.csv", "w", newline="") as outfile:
#         writer = csv.writer(outfile)
#         for row in reader:
#             writer.writerow(row[0:2])
#
# print("Converting ratings...")
# with open("data/ml-1m/ratings.dat") as infile:
#     reader = csv.reader((line.replace("::", ";") for line in infile),
#                         delimiter=";")
#     with open("data/ml-1m/ratings.csv", "w", newline="") as outfile:
#         writer = csv.writer(outfile)
#         for row in reader:
#             writer.writerow(row)