-
Notifications
You must be signed in to change notification settings - Fork 0
/
populate_pdf_video_paths.py
186 lines (162 loc) · 6.65 KB
/
populate_pdf_video_paths.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
ADASS authors can upload a PDF and/or an MP4 file in their contribution
directory (named after their contribution PID) in out FTP site.
We have added two new columns in the database to keep track of these files and
we want to periodically update these. Also, we publish a conference schedule
and HTML pages for all contributions. We want to add and update links to the
PFD and MP4 in there as well.
"""
import argparse
import logging
from pathlib import Path
from bs4 import BeautifulSoup, Tag
import psycopg2
# Configuration defaults; each can be overridden on the command line.
FTP_ROOT = '/var/www/html/static/ftp'
HTML_ROOT = '/var/www/schedule/adass2020/talk'
MEDIA_ROOT = '/var/www/schedule/media'
# NOTE: plain forward slashes. The previous value escaped them ('\/'), and
# since '\/' is not a recognized Python escape sequence the backslashes were
# kept verbatim, producing broken 'https:\/\/...' URLs.
MEDIA_URL_ROOT = 'https://adass2020.es/static/ftp'
# Confirmed contributions ordered by PID; pdf_path/video_path are the two
# columns this script keeps up to date.
SQL = '''\
SELECT
    code,
    paper_id,
    pdf_path,
    video_path
FROM
    submission_submission
WHERE
    state = 'confirmed'
ORDER BY
    paper_id
'''
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Sync uploaded PDF/MP4 files into the DB and HTML pages.')
    parser.add_argument('--ftp_root', type=str, default=FTP_ROOT)
    parser.add_argument('--html_root', type=str, default=HTML_ROOT)
    parser.add_argument('--media_root', type=str, default=MEDIA_ROOT)
    parser.add_argument('--media_url_root', type=str, default=MEDIA_URL_ROOT)
    args = parser.parse_args()

    ftp_root = Path(args.ftp_root)
    html_root = Path(args.html_root)
    media_root = Path(args.media_root)  # parsed for compatibility; not used below
    # Keep the URL root as a plain string: pathlib collapses the '//' after
    # the scheme ('https://x' -> 'https:/x'), silently mangling every URL.
    media_url_root = args.media_url_root.rstrip('/')

    # NOTE: logging.warning is used throughout (even for progress messages)
    # so output appears under the default logging configuration.
    with psycopg2.connect(database="pretalx", user="pretalx", password="",
                          host="localhost", port="5432") as conn:
        cur = conn.cursor()
        cur.execute(SQL)
        # fetchall() so we are not iterating the cursor while issuing
        # UPDATEs on the same connection inside the loop.
        for (code, paper_id, pdf_path, video_path) in cur.fetchall():
            if paper_id is None:
                logging.warning(f'Talk code {code} has no paper_id!')
                continue

            ftp_path = ftp_root / paper_id
            if not ftp_path.is_dir():
                logging.warning(f'Directory {ftp_path} MISSING')
                continue

            index_path = html_root / code / 'index.html'
            if not index_path.is_file():
                logging.warning(f'HTML file {index_path} MISSING')
                continue

            # Candidate uploads: PDF/MP4 files whose name contains the PID,
            # sorted oldest -> newest by modification time.
            files = sorted(
                [p for p in ftp_path.iterdir()
                 if p.suffix.lower() in ('.pdf', '.mp4')
                 and paper_id in p.name],
                key=lambda path: path.stat().st_mtime
            )
            if not files:
                # See if the PDF/MP4 was in the DB and just disappeared.
                if pdf_path or video_path:
                    logging.warning(f'!!!!! {code}: PDF/MP4 DISAPPEARED!!!!!!')
                continue

            # Find the newest PDF & MP4. Important that `files` is sorted
            # oldest -> newest since we pop elements from the back.
            newest_pdf = None
            newest_video = None
            while files:
                path = files.pop()
                ext = path.suffix.lower()
                if ext == '.pdf' and not newest_pdf:
                    newest_pdf = path
                elif ext == '.mp4' and not newest_video:
                    newest_video = path
                if newest_video and newest_pdf:
                    break

            # We store the file URL, not the absolute path. Also make the
            # files world-readable so they can actually be served over HTTP(S).
            if newest_pdf:
                newest_pdf.chmod(0o644)
            if newest_video:
                newest_video.chmod(0o644)
            urls = {
                'pdf_path': (f'{media_url_root}/{paper_id}/{newest_pdf.name}'
                             if newest_pdf else None),
                'video_path': (f'{media_url_root}/{paper_id}/{newest_video.name}'
                               if newest_video else None),
            }

            # Update the database with a parameterized query: the previous
            # f-string interpolation of values broke on quotes and was an
            # SQL injection hazard. Column names come from our own literal
            # dict keys, so interpolating them is safe.
            # FIXME: do all these updates in a single transaction!
            assignments = {k: v for k, v in urls.items() if v is not None}
            with conn.cursor() as update_cur:
                set_clause = ', '.join(f'{k} = %s' for k in assignments)
                update_cur.execute(
                    f'UPDATE submission_submission SET {set_clause} '
                    'WHERE code = %s',
                    [*assignments.values(), code]
                )

            # Patch the contribution's HTML page with a "From the FTP"
            # resources section linking the uploaded files.
            with open(index_path) as f:
                soup = BeautifulSoup(f.read(), 'html.parser')
            aside = soup.find('aside')
            if not aside or not isinstance(aside, Tag):
                logging.warning(f'!!!! {code}: malformed HTML!!!!')
                continue

            existing = aside.find('section', 'resources')
            start = '''\
<section class="resources">
  <div class="speaker-header">
    <strong>From the FTP</strong>
  </div>
  <div>
'''
            links = ' | '.join(
                f' <a href="{url}">{url.rsplit("/", 1)[-1]}</a>'
                for url in urls.values() if url is not None
            )
            end = '''
  </div>
</section>
'''
            new_tag = BeautifulSoup(f'{start}{links}{end}',
                                    'html.parser').section
            if existing:
                existing.replace_with(new_tag)
            else:
                # append() (not contents.append) keeps the parse-tree
                # parent/sibling links consistent.
                aside.append(new_tag)

            # FIXME: this is very hackish!
            # If the HTML is about a poster (meaning something "scheduled" in
            # the Posters room), remove the fake schedule info as well as the
            # iCal file.
            h3 = soup.find('h3', 'talk-title')
            # Guard against pages without the expected <h3><small> structure;
            # the previous code crashed with AttributeError here.
            if h3 is not None and h3.small is not None:
                # Two cases: either h3.small is a simple tag whose .string is
                # the scheduling info, or it is a compound tag whose first
                # child is the scheduling string followed by <i> icon tags
                # (do-not-record etc.).
                txt = h3.small.string
                if txt is None and h3.small.contents:
                    txt = h3.small.contents[0]
                if txt and txt.strip().endswith('Posters'):
                    h3.small.decompose()
                    if h3.div and h3.div.a:
                        h3.div.a.decompose()

            with open(index_path, 'w') as f:
                f.write(soup.prettify())
            logging.warning(f'{code} was updated')