-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathM_ImageCapScrape.py
61 lines (48 loc) · 2.01 KB
/
M_ImageCapScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import fitz # PyMuPDF
from PyPDF2 import PdfReader
import re
import PIL.Image
from PIL import Image
import io
import base64
# Caption detector, applied to the plain text of each page. It matches either
# a "Fig ... <digits> ..." run or a "Figure <n> ...: <text>" run; the whole
# matched text is used as the caption.
_FIGURE_CAPTION_RE = re.compile(r'\bFig\b.*\d+.+|\bFigure\s+\d+.*\d*:\s+(.*)')

# Pillow image modes that cannot be written as PNG directly and therefore
# have to be converted to RGB before saving.
_PNG_UNSUPPORTED_MODES = ("CMYK", "YCbCr", "LAB", "HSV")


def _caption_for_image(captions, img_index):
    """Return the caption paired with the image at ``img_index`` on a page.

    Images are paired with captions by position. The image that comes
    immediately after the last caption reuses that last caption; any image
    further past the end (or any image on a page without captions) gets
    ``None``.
    """
    if img_index < len(captions):
        return captions[img_index]
    if captions and img_index == len(captions):
        return captions[-1]
    return None


def _png_base64(image_bytes):
    """Re-encode raw image bytes as PNG and return them as a base64 string."""
    with Image.open(io.BytesIO(image_bytes)) as pil_image:
        if pil_image.mode in _PNG_UNSUPPORTED_MODES:
            # Saving such an image as PNG raises OSError; RGB is always writable.
            pil_image = pil_image.convert("RGB")
        buffered = io.BytesIO()
        pil_image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()


def extract_images_and_captions(pdf_path):
    """Extract the embedded images of a PDF, keyed by their figure caption.

    Every page is scanned for caption-like text, and the images listed on
    that page are paired with the captions by position (first image with
    first caption, and so on).

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A dict mapping caption text (or ``None`` when no caption could be
        paired) to the image re-encoded as PNG and base64-encoded into a
        ``str``.

    NOTE: the result is keyed by caption, so images that share a caption
    (including several uncaptioned images, which all map to ``None``)
    overwrite each other and only the last one is kept.
    """
    print(pdf_path)
    images_cap = {}
    pdf_document = fitz.open(pdf_path)
    try:
        for page in pdf_document:
            figure_captions = [
                match.group()
                for match in _FIGURE_CAPTION_RE.finditer(page.get_text())
            ]
            for img_index, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]  # first field of the image entry is its xref
                image_bytes = pdf_document.extract_image(xref)["image"]
                caption = _caption_for_image(figure_captions, img_index)
                images_cap[caption] = _png_base64(image_bytes)
    finally:
        # Release the document even if a page or image fails to process.
        pdf_document.close()
    return images_cap