import asyncio
from contextlib import closing
import concurrent.futures
import io
import json
import logging
import os
import re
import subprocess
from tempfile import NamedTemporaryFile
import time
from uuid import uuid4
from urllib.parse import urlparse

import aiobotocore
import boto3

from uriutils import uri_read, uri_exists, uri_dump

from pageutils import invoke_textract_ocr
from utils import get_subprocess_output

LAMBDA_TASK_ROOT = os.environ.get('LAMBDA_TASK_ROOT', os.path.dirname(os.path.abspath(__file__)))
LAMBDA_FUNCTION_NAME = os.environ['LAMBDA_FUNCTION_NAME']  # Lambda function invoked for the per-page OCR fan-out (see pdf_to_text_with_ocr)
BIN_DIR = os.path.join(LAMBDA_TASK_ROOT, 'bin')
LIB_DIR = os.path.join(LAMBDA_TASK_ROOT, 'lib')

MERGE_SEARCHABLE_PDF_DURATION = float(os.environ.get('MERGE_SEARCHABLE_PDF_DURATION', 90))  # seconds reserved for merging the per-page searchable PDFs
RETURN_RESULTS_DURATION = float(os.environ.get('RETURN_RESULTS_DURATION', 3.0))  # seconds reserved for returning results to the caller
TEXTRACT_OUTPUT_WAIT_BUFFER_TIME = float(os.environ.get('TEXTRACT_OUTPUT_WAIT_BUFFER_TIME', 5.0))  # safety buffer when waiting for per-page OCR output

lambda_client = boto3.client('lambda')

logging.basicConfig(format='%(asctime)-15s [%(name)s-%(process)d] %(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

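# Lambda entry point: downloads the document at event['document_uri'], picks an extraction
# function from PARSE_FUNCS based on the file extension, uploads the extracted text (and,
# optionally, a searchable PDF), and then writes out any registered 'textractor' callbacks.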
def handle(event, context):
    global logger

    document_uri = event['document_uri']
    temp_uri_prefix = event.get('temp_uri_prefix', event['document_uri'] + '-temp')
    text_uri = event.get('text_uri', document_uri + '.txt')
    searchable_pdf_uri = event.get('searchable_pdf_uri', document_uri + '.searchable.pdf')
    create_searchable_pdf = event.get('create_searchable_pdf', True)
    page = event.get('page')

    event['temp_uri_prefix'] = temp_uri_prefix

    # AWS Lambda automatically retries failed invocations up to 3 times. Recording the request ID should disable retrying... kinda. See https://stackoverflow.com/questions/32064038/aws-lambda-function-triggering-multiple-times-for-a-single-event
    aws_context_retry_uri = os.path.join(temp_uri_prefix, 'aws_lambda_request_ids', context.aws_request_id)
    if uri_exists(aws_context_retry_uri):
        return
    uri_dump(aws_context_retry_uri, '', mode='w')

    start_time = time.time()
    logger.info('{} invoked with event {}.'.format(os.environ['AWS_LAMBDA_FUNCTION_NAME'], json.dumps(event)))

    o = urlparse(document_uri)
    _, ext = os.path.splitext(o.path)  # get format from extension
    ext = ext.lower()
    extract_func = PARSE_FUNCS.get(ext)
    if extract_func is None:
        raise ValueError('<{}> has unsupported extension "{}".'.format(document_uri, ext))

    with NamedTemporaryFile(mode='wb', suffix=ext, delete=False) as f:
        document_path = f.name
        f.write(uri_read(document_uri, mode='rb'))
        logger.debug('Downloaded <{}> to <{}>.'.format(document_uri, document_path))
    #end with

    textractor_results = {}
    searchable_pdf_path = None
    try:
        textractor_results = extract_func(document_path, event, context, create_searchable_pdf=create_searchable_pdf)

        meta = textractor_results.pop('meta', {})
        meta['method'] = textractor_results['method']

        text = textractor_results.pop('text', '')
        textractor_results['size'] = len(text)
        uri_dump(text_uri, text, mode='w', textio_args={'errors': 'ignore'}, storage_args=dict(ContentType='text/plain', Metadata=meta))
        if len(text) == 0: logger.warning('<{}> does not contain any content.'.format(document_uri))

        searchable_pdf_path = textractor_results.pop('searchable_pdf_path', None)
        if searchable_pdf_path:
            assert os.path.isfile(searchable_pdf_path)

            with open(searchable_pdf_path, 'rb') as f:
                contents = f.read()
                uri_dump(searchable_pdf_uri, contents, mode='wb', storage_args=dict(ContentType='application/pdf', Metadata=meta))
                logger.debug('Searchable PDF version of <{}> saved to <{}>.'.format(document_uri, searchable_pdf_uri))
            #end with

            textractor_results['searchable_pdf_uri'] = searchable_pdf_uri
        #end if

        textractor_results['success'] = True
        textractor_results.update(meta)

        if page: logger.debug('Extracted page {} of <{}> to <{}> (took {:.3f} seconds).'.format(page, document_uri, text_uri, time.time() - start_time))
        else: logger.debug('Extracted pages of <{}> to <{}> (took {:.3f} seconds).'.format(document_uri, text_uri, time.time() - start_time))

    except Exception as e:
        logger.exception('Extraction exception for <{}>'.format(document_uri))
        textractor_results = dict(success=False, reason=str(e))
        uri_dump(text_uri, '', mode='w', textio_args={'errors': 'ignore'}, storage_args=dict(ContentType='text/plain', Metadata=dict(Exception=str(e))))

    finally:
        os.remove(document_path)
        if searchable_pdf_path: os.remove(searchable_pdf_path)
    #end try

    payload = event.copy()
    payload['text_uri'] = text_uri
    if create_searchable_pdf:
        payload['searchable_pdf_uri'] = textractor_results.get('searchable_pdf_uri')

    for cb in event.get('callbacks', []):
        if cb['step'] == 'textractor':
            try:
                uri_dump(cb['uri'], json.dumps(payload), mode='w')
                logger.info('Called callback {} with payload {}.'.format(json.dumps(cb), json.dumps(payload)))
            except Exception as e: logger.exception('Callback exception for {} with payload {}.'.format(json.dumps(cb), json.dumps(payload)))
        #end if
    #end for

    payload.setdefault('results', {})
    payload['results']['textractor'] = textractor_results

    logger.debug('Textraction complete.')

    return payload
#end def

def _pdf_to_text(document_path):
    text_path = document_path + '.txt'
    _get_subprocess_output([os.path.join(BIN_DIR, 'pdftotext'), '-layout', '-nopgbrk', '-eol', 'unix', document_path, text_path], shell=False, env=dict(LD_LIBRARY_PATH=os.path.join(LIB_DIR, 'pdftotext')))

    with io.open(text_path, mode='r', encoding='utf-8', errors='ignore') as f:
        text = f.read().strip()
    os.remove(text_path)

    return text
#end def

def _get_subprocess_output(*args, **kwargs):
    global logger
    return get_subprocess_output(*args, logger=logger, **kwargs)
#end def

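# OCRs a multi-page PDF by fanning out one Lambda invocation per page (via invoke_textract_ocr),
# waiting for as many per-page results as the remaining execution time allows, and then stitching
# the per-page text and searchable PDF pages back together.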
def pdf_to_text_with_ocr(document_path, event, context, create_searchable_pdf=True):
    global logger

    document_uri = event['document_uri']
    page = event.get('page')
    temp_uri_prefix = event['temp_uri_prefix']

    if page is not None:
        return pdf_to_text_with_ocr_single_page(document_path, event, context, create_searchable_pdf=create_searchable_pdf)

    # This is more reliable than using PyPDF2
    pdfinfo_output = _get_subprocess_output([os.path.join(BIN_DIR, 'pdfinfo'), document_path], shell=False, env=dict(LD_LIBRARY_PATH=os.path.join(LIB_DIR, 'pdftotext')))
    pdfinfo_output = pdfinfo_output.decode('ascii', errors='ignore')
    m = re.search(r'^Pages:(.+)$', pdfinfo_output, flags=re.M | re.U)
    if not m: raise Exception('Unable to get page count from pdfinfo:\n{}'.format(pdfinfo_output))
    num_pages = int(m.group(1))

    with closing(asyncio.new_event_loop()) as event_loop:
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)  # Fix runtime error with "Event loop is closed" (see https://stackoverflow.com/questions/32598231/asyncio-runtimeerror-event-loop-is-closed/32615276#32615276)
        event_loop.set_default_executor(executor)
        session = aiobotocore.get_session(loop=event_loop)

        async def _invoke_textract_ocr_tasks(_completed_text_contents, _completed_searchable_pdf_fnames, timeout):
            tasks = []
            cur_uuid = str(uuid4())
            for page in range(1, num_pages + 1):
                page_uuid = '{}_{:04d}'.format(cur_uuid, page)  # base name for each page's intermediate output
                page_text_uri = os.path.join(temp_uri_prefix, '{}.txt'.format(page_uuid))
                payload = dict(document_uri=document_uri, text_uri=page_text_uri, temp_uri_prefix=temp_uri_prefix, page=page)
                if create_searchable_pdf:
                    page_searchable_pdf_uri = os.path.join(temp_uri_prefix, '{}.pdf'.format(page_uuid))
                    payload['searchable_pdf_uri'] = page_searchable_pdf_uri
                #end if

                tasks.append(invoke_textract_ocr(LAMBDA_FUNCTION_NAME, payload, session, logger))
            #end for

            completed, pending = await asyncio.wait(tasks, timeout=timeout, loop=event_loop)  # this is where the magic happens
            for task in pending:
                task.cancel()

            for task in completed:
                try: (page, page_text, page_searchable_pdf_fname) = task.result()
                except TypeError: continue

                _completed_text_contents[page] = page_text
                _completed_searchable_pdf_fnames[page] = page_searchable_pdf_fname
            #end for
        #end def

        textract_output_wait_timeout = (context.get_remaining_time_in_millis() / 1000.0) - MERGE_SEARCHABLE_PDF_DURATION - RETURN_RESULTS_DURATION - TEXTRACT_OUTPUT_WAIT_BUFFER_TIME  # asyncio.wait seems to overshoot by around 7 seconds every time
        completed_text_contents, completed_searchable_pdf_fnames = {}, {}
        if textract_output_wait_timeout <= 0:
            logger.warning('Wait timeout for OCR output is <= 0!')
        else:
            event_loop.run_until_complete(_invoke_textract_ocr_tasks(completed_text_contents, completed_searchable_pdf_fnames, textract_output_wait_timeout))
        #end if

        executor.shutdown(wait=False)
    #end with

    missing_text_pages, empty_content_pages, page_contents = [], [], []
    for page in range(1, num_pages + 1):
        content = completed_text_contents.get(page)
        if content is None: missing_text_pages.append(page)
        elif content: page_contents.append(content)
        else: empty_content_pages.append(page)
    #end for
    text = '\n\n'.join(page_contents).strip()

    searchable_pdf_path = None
    missing_searchable_pdf_pages, pdf_pages_filenames = [], []
    if create_searchable_pdf:
        try:
            # Extract pages from the original file if we were unable to OCR / get searchable versions of the page
            for page in range(1, num_pages + 1):
                searchable_pdf_fname = completed_searchable_pdf_fnames.get(page)
                if searchable_pdf_fname is None:
                    missing_searchable_pdf_pages.append(page)

                    with NamedTemporaryFile(suffix='.pdf', delete=False) as f:
                        original_pdf_page_fname = f.name
                    _get_subprocess_output([os.path.join(BIN_DIR, 'pdfseparate'), '-f', str(page), '-l', str(page), document_path, original_pdf_page_fname], shell=False, env=dict(LD_LIBRARY_PATH=os.path.join(LIB_DIR, 'pdftotext')))
                    pdf_pages_filenames.append(original_pdf_page_fname)

                else:
                    pdf_pages_filenames.append(searchable_pdf_fname)
            #end for
            assert len(pdf_pages_filenames) == num_pages

            # Merge the individual pages of searchable PDFs together
            merge_searchable_pdf_timeout = (context.get_remaining_time_in_millis() / 1000.0) - RETURN_RESULTS_DURATION
            with NamedTemporaryFile(suffix='.pdf', delete=False) as f:
                searchable_pdf_path = f.name
            _get_subprocess_output([os.path.join(BIN_DIR, 'gs'), '-sDEVICE=pdfwrite', '-dBATCH', '-dNOPAUSE', '-q', '-dPDFSETTINGS=/ebook', '-sOutputFile={}'.format(searchable_pdf_path)] + pdf_pages_filenames, shell=False, timeout=merge_searchable_pdf_timeout)  # merge and compress pdf

        except subprocess.TimeoutExpired:
            searchable_pdf_path = None
            logger.warning('Timeout while merging searchable PDF for <{}>.'.format(document_uri))

        except Exception as e:
            logger.exception('Exception while merging searchable PDF for <{}>.'.format(document_uri))

        finally:
            for fname in pdf_pages_filenames:
                try: os.remove(fname)
                except Exception as e: logger.exception('Exception while removing searchable PDF page <{}> for <{}>.'.format(fname, document_uri))
            #end for
        #end try
    #end if

    meta = dict(num_pages=str(num_pages))
    if missing_text_pages:
        logger.info('Missing pages {} in <{}>.'.format(missing_text_pages, document_uri))
        meta['missing_text_pages'] = ','.join(str(p) for p in missing_text_pages)
    #end if
    if empty_content_pages:
        logger.info('Empty content pages {} in <{}>.'.format(empty_content_pages, document_uri))
        meta['empty_content_pages'] = ','.join(str(p) for p in empty_content_pages)
    #end if
    if missing_searchable_pdf_pages:
        logger.info('Missing searchable PDF pages {} in <{}>.'.format(missing_searchable_pdf_pages, document_uri))
        meta['missing_searchable_pdf_pages'] = ','.join(str(p) for p in missing_searchable_pdf_pages)
    #end if

    return dict(success=True, text=text, searchable_pdf_path=searchable_pdf_path, method='pdf_to_text_with_ocr', meta=meta)
#end def

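# Renders a single PDF page to a 300 dpi PNG with Ghostscript and OCRs it with image_to_text().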
def pdf_to_text_with_ocr_single_page(document_path, event, context, create_searchable_pdf=True):
    page = event['page']

    with NamedTemporaryFile(suffix='.png', delete=False) as f:
        image_page_path = f.name

    try:
        cmdline = [os.path.join(BIN_DIR, 'gs'), '-sDEVICE=png16m', '-dFirstPage={}'.format(page), '-dLastPage={}'.format(page), '-dINTERPOLATE', '-r300', '-o', image_page_path, '-dNOPAUSE', '-dSAFER', '-c', '67108864', 'setvmthreshold', '-dGraphicsAlphaBits=4', '-dTextAlphaBits=4', '-f', document_path]  # extract the page as an image
        output = _get_subprocess_output(cmdline, shell=False)
        output = output.decode('ascii', errors='ignore')
        if os.path.getsize(image_page_path) == 0:
            raise Exception('Ghostscript image extraction failed with output:\n{}'.format(output))

        results = image_to_text(image_page_path, event, context, create_searchable_pdf=create_searchable_pdf)

    finally:
        os.remove(image_page_path)

    results['page'] = page

    return results
#end def

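# OCRs an image with Tesseract; when create_searchable_pdf is set, the text is recovered from the
# generated searchable PDF via pdftotext, otherwise Tesseract's plain text output is read directly.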
def image_to_text(document_path, event, context, create_searchable_pdf=True):
    _, ext = os.path.splitext(document_path)
    ext = ext.lower()

    cmdline = [os.path.join(BIN_DIR, 'tesseract'), document_path, document_path, '-l', 'eng', '-psm', '1', '--tessdata-dir', os.path.join(LIB_DIR, 'tesseract')]
    if create_searchable_pdf:
        cmdline += ['pdf']
    _get_subprocess_output(cmdline, shell=False, env=dict(LD_LIBRARY_PATH=os.path.join(LIB_DIR, 'tesseract')))

    if create_searchable_pdf:
        searchable_pdf_path = document_path + '.pdf'
        text = _pdf_to_text(searchable_pdf_path)
    else:
        searchable_pdf_path = None
        with io.open(document_path + '.txt', mode='r', encoding='utf-8', errors='ignore') as f:
            text = f.read().strip()
    #end if

    return dict(success=True, text=text, method='image_to_text', searchable_pdf_path=searchable_pdf_path)
#end def

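# Dispatch table mapping lowercase file extensions to extraction functions; handle() uses this to
# select extract_func for the incoming document.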
PARSE_FUNCS = {
    '.pdf': pdf_to_text_with_ocr,
    '.png': image_to_text,
    '.tiff': image_to_text,
    '.tif': image_to_text,
    '.jpg': image_to_text,
    '.jpeg': image_to_text,
}
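

# A minimal local-invocation sketch for exercising handle() outside of Lambda. The S3 URI and the
# _FakeContext class below are hypothetical stand-ins; it assumes the bin/ and lib/ tooling is in
# place and that the LAMBDA_FUNCTION_NAME / AWS_LAMBDA_FUNCTION_NAME environment variables are set.
if __name__ == '__main__':
    class _FakeContext(object):
        aws_request_id = 'local-test-request'

        def get_remaining_time_in_millis(self):
            return 300000  # pretend there are 5 minutes of execution time remaining
    #end class

    sample_event = dict(document_uri='s3://example-bucket/sample.pdf', create_searchable_pdf=True)
    print(json.dumps(handle(sample_event, _FakeContext()), indent=2))
#end if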