-
Notifications
You must be signed in to change notification settings - Fork 1
/
ocr.py
72 lines (59 loc) · 2.04 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
Image to Text Conversion Service
"""
# No longer used, as Chegg has started returning a transcript of the image in its response.
import json
from concurrent.futures import ThreadPoolExecutor
import requests
from utils import log
# Endpoint and API key for the image-to-text service; both are left unset here.
IMAGE_TO_TEXT_API_URL = None
IMAGE_TO_TEXT_API_KEY = None

# Template of the form fields posted with every image-to-text request.
# The 'url' entry is a placeholder that Ocr.image_to_text sets per request.
IMAGE_TO_TEXT_API_PAYLOAD = dict(
    language='eng',
    isOverlayRequired='false',
    url='',
    iscreatesearchablepdf='false',
    issearchablepdfhidetextlayer='false',
)

# Template of the request headers; 'apikey' is set per request from
# IMAGE_TO_TEXT_API_KEY by Ocr.image_to_text.
IMAGE_TO_TEXT_API_HEADERS = dict(apikey='')
class Ocr:
    """
    Convert the images at the given links into text.

    Each URL is posted to the image-to-text API configured through the
    module-level ``IMAGE_TO_TEXT_API_*`` settings, and the recognised text
    is read from the JSON reply.
    """

    # Passed as ``timeout`` to requests, which treats a 2-tuple as
    # (connect timeout, read timeout) in seconds.
    TIMEOUT_FOR_REQUESTS = (10, 20)

    def __init__(self, urls=None):
        """
        Initialize the Ocr class with a list of URLs.

        :param urls: iterable of image URLs to convert; ``None`` means
            "no images" and is stored as an empty list.
        """
        self.urls = [] if urls is None else urls

    @staticmethod
    def parse_image_to_text_response(res_data):
        """
        Parse the response data from the image to text API.

        :param res_data: JSON document (as text) returned by the API.
        :return: the ``ParsedText`` value of the first ``ParsedResults`` entry.
        :raises json.JSONDecodeError: if ``res_data`` is not valid JSON.
        :raises KeyError, IndexError: if the reply has no ``ParsedResults``
            entry or the entry has no ``ParsedText``.
        """
        log('Parsing image to text api response.')
        parsed = json.loads(res_data)
        return parsed["ParsedResults"][0]["ParsedText"]

    @staticmethod
    def image_to_text(url):
        """
        Convert the image at the given URL to text.

        :param url: link of the image to send to the image-to-text API.
        :return: the text extracted from the API reply.
        """
        log(f'Converting image to text with url -> {url}')
        # Build per-call copies instead of mutating the shared module-level
        # templates: analyze_images() calls this function from several
        # threads at once, and writing 'url' into one shared dict lets a
        # thread post another thread's URL.
        payload = {**IMAGE_TO_TEXT_API_PAYLOAD, 'url': url}
        headers = {**IMAGE_TO_TEXT_API_HEADERS, 'apikey': IMAGE_TO_TEXT_API_KEY}
        response = requests.request(
            "POST",
            IMAGE_TO_TEXT_API_URL,
            headers=headers,
            data=payload,
            timeout=Ocr.TIMEOUT_FOR_REQUESTS,
        )
        return Ocr.parse_image_to_text_response(response.text)

    def analyze_images(self):
        """
        Analyze the images and return the combined text.

        All URLs in ``self.urls`` are converted concurrently on a thread
        pool; the results are kept in input order.

        :return: the texts of all images joined by single spaces and
            lower-cased; an empty string when there are no URLs.
        """
        with ThreadPoolExecutor() as exe:
            # list() drains the lazy map iterator, so any exception raised
            # by a worker surfaces here.
            texts = list(exe.map(self.image_to_text, self.urls))
        return ' '.join(texts).lower()