Merge pull request #540 from GooeyAI/pptx-gdocs
fix: pptx uploaded on google docs
milovate authored Dec 20, 2024
2 parents 6573a92 + cc67f67 commit f2662b7
Showing 7 changed files with 108 additions and 94 deletions.
44 changes: 25 additions & 19 deletions daras_ai_v2/gdrive_downloader.py
@@ -1,9 +1,11 @@
 import io
 
 from furl import furl
+import requests
 
 from daras_ai_v2.exceptions import UserError
 from daras_ai_v2.functional import flatmap_parallel
+from daras_ai_v2.exceptions import raise_for_status
 
 
 def is_gdrive_url(f: furl) -> bool:
@@ -60,36 +62,38 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str]:
     return filter(None, urls)
 
 
-def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
+def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, str]:
     from googleapiclient import discovery
     from googleapiclient.http import MediaIoBaseDownload
 
     # get drive file id
     file_id = url_to_gdrive_file_id(f)
     # get metadata
     service = discovery.build("drive", "v3")
-    # get files in drive directly
-    if f.host == "drive.google.com":
-        request = service.files().get_media(
-            fileId=file_id,
-            supportsAllDrives=True,
-        )
-    # export google docs to appropriate type
-    else:
-        mime_type, _ = docs_export_mimetype(f)
-        request = service.files().export_media(
-            fileId=file_id,
-            mimeType=mime_type,
-        )
+
+    if f.host != "drive.google.com":
+        # export google docs to appropriate type
+        export_mime_type, _ = docs_export_mimetype(f)
+        if f_url_export := export_links.get(export_mime_type, None):
+            r = requests.get(f_url_export)
+            file_bytes = r.content
+            raise_for_status(r, is_user_url=True)
+            return file_bytes, export_mime_type
+
+    request = service.files().get_media(
+        fileId=file_id,
+        supportsAllDrives=True,
+    )
     # download
     file = io.BytesIO()
     downloader = MediaIoBaseDownload(file, request)
     done = False
     while done is False:
         _, done = downloader.next_chunk()
         # print(f"Download {int(status.progress() * 100)}%")
-    f_bytes = file.getvalue()
-    return f_bytes, mime_type
+    file_bytes = file.getvalue()
+
+    return file_bytes, mime_type
 
 
 def docs_export_mimetype(f: furl) -> tuple[str, str]:
@@ -109,8 +113,10 @@ def docs_export_mimetype(f: furl) -> tuple[str, str]:
         mime_type = "text/csv"
         ext = ".csv"
     elif "presentation" in f.path.segments:
-        mime_type = "application/pdf"
-        ext = ".pdf"
+        mime_type = (
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+        )
+        ext = ".pptx"
     elif "drawings" in f.path.segments:
         mime_type = "application/pdf"
         ext = ".pdf"
@@ -128,7 +134,7 @@ def gdrive_metadata(file_id: str) -> dict:
         .get(
             supportsAllDrives=True,
             fileId=file_id,
-            fields="name,md5Checksum,modifiedTime,mimeType,size",
+            fields="name,md5Checksum,modifiedTime,mimeType,size,exportLinks",
         )
         .execute()
     )
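Taken together: gdrive_metadata() now surfaces the file's exportLinks, and gdrive_download() prefers those direct export URLs over files().export_media() (which caps exports at 10 MB). A minimal sketch of the round trip, assuming the helpers above behave as shown in the diff; the document URL is hypothetical:

# sketch: download a Google Slides deck as .pptx via its export link
f = furl("https://docs.google.com/presentation/d/FILE_ID/edit")  # hypothetical URL
meta = gdrive_metadata(url_to_gdrive_file_id(f))
# exportLinks maps export mime types to direct download URLs
f_bytes, mime_type = gdrive_download(f, meta["mimeType"], meta.get("exportLinks", {}))
# assuming the export link is present, mime_type is now the .pptx mime type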
2 changes: 1 addition & 1 deletion daras_ai_v2/glossary.py
@@ -15,7 +15,7 @@ def validate_glossary_document(document: str):
 
     metadata = doc_url_to_file_metadata(document)
     f_bytes, mime_type = download_content_bytes(
-        f_url=document, mime_type=metadata.mime_type
+        f_url=document, mime_type=metadata.mime_type, export_links=metadata.export_links
     )
     df = tabular_bytes_to_str_df(
         f_name=metadata.name, f_bytes=f_bytes, mime_type=mime_type
125 changes: 57 additions & 68 deletions daras_ai_v2/office_utils_pptx.py
@@ -34,7 +34,11 @@ def pptx_to_text_pages(f: typing.BinaryIO, use_form_reco: bool = False) -> list[str]:
             except Exception as e:
                 slide_content.append(f" Error processing shape: {e}")
 
+        if slide.has_notes_slide:
+            slide_content.extend(handle_author_notes(slide))
+
         slides_text.append("\n".join(slide_content) + "\n")
+
     return slides_text
 
 
@@ -43,81 +47,55 @@ def handle_text_elements(shape) -> list[str]:
     """
     Handles text elements within a shape, including lists.
     """
     text_elements = []
-    is_a_list = False
-    is_list_group_created = False
-    enum_list_item_value = 0
-    bullet_type = "None"
-    list_label = "LIST"
     namespaces = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
 
-    # Identify if shape contains lists
+    current_list_type = None
+    list_item_index = 0
+
     for paragraph in shape.text_frame.paragraphs:
         p = paragraph._element
+        paragraph_text = ""
+        is_list_item = False
+
+        # Determine list type
         if p.find(".//a:buChar", namespaces=namespaces) is not None:
-            bullet_type = "Bullet"
-            is_a_list = True
+            current_list_type = "Bullet"
+            is_list_item = True
         elif p.find(".//a:buAutoNum", namespaces=namespaces) is not None:
-            bullet_type = "Numbered"
-            is_a_list = True
+            current_list_type = "Numbered"
+            is_list_item = True
+        elif paragraph.level > 0:  # Indented text is also treated as a list
+            current_list_type = "Bullet"
+            is_list_item = True
         else:
-            is_a_list = False
-
-        if paragraph.level > 0:
-            is_a_list = True
-
-    if is_a_list:
-        if bullet_type == "Numbered":
-            list_label = "ORDERED_LIST"
-
-    # Iterate through paragraphs to build up text
-    for paragraph in shape.text_frame.paragraphs:
-        p = paragraph._element
-        enum_list_item_value += 1
-        inline_paragraph_text = ""
-        inline_list_item_text = ""
-        doc_label = "PARAGRAPH"
-
-        for e in p.iterfind(".//a:r", namespaces=namespaces):
-            if len(e.text.strip()) > 0:
-                e_is_a_list_item = False
-                is_numbered = False
-                if p.find(".//a:buChar", namespaces=namespaces) is not None:
-                    bullet_type = "Bullet"
-                    e_is_a_list_item = True
-                elif p.find(".//a:buAutoNum", namespaces=namespaces) is not None:
-                    bullet_type = "Numbered"
-                    is_numbered = True
-                    e_is_a_list_item = True
-                else:
-                    e_is_a_list_item = False
-
-                if e_is_a_list_item:
-                    if len(inline_paragraph_text) > 0:
-                        text_elements.append(inline_paragraph_text)
-                    inline_list_item_text += e.text
-                else:
-                    if shape.is_placeholder:
-                        placeholder_type = shape.placeholder_format.type
-                        if placeholder_type in [
-                            PP_PLACEHOLDER.CENTER_TITLE,
-                            PP_PLACEHOLDER.TITLE,
-                        ]:
-                            doc_label = "TITLE"
-                        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
-                            doc_label = "SECTION_HEADER"
-                    enum_list_item_value = 0
-                    inline_paragraph_text += e.text
-
-        if len(inline_paragraph_text) > 0:
-            text_elements.append(inline_paragraph_text)
-
-        if len(inline_list_item_text) > 0:
-            enum_marker = ""
-            if is_numbered:
-                enum_marker = str(enum_list_item_value) + "."
-            if not is_list_group_created:
-                is_list_group_created = True
-            text_elements.append(f"{enum_marker} {inline_list_item_text}")
+            current_list_type = None
+            list_item_index = 0  # Reset numbering if no list
+
+        # Process paragraph text
+        for run in p.iterfind(".//a:r", namespaces=namespaces):
+            run_text = run.text.strip() if run.text else ""
+            if run_text:
+                paragraph_text += run_text
+
+        if is_list_item:
+            if current_list_type == "Numbered":
+                list_item_index += 1
+                list_prefix = f"{list_item_index}."
+            else:
+                list_prefix = "•"  # Default bullet symbol
+            text_elements.append(f"{list_prefix} {paragraph_text}")
+        else:
+            # Handle placeholders for titles or subtitles
+            if shape.is_placeholder:
+                placeholder_type = shape.placeholder_format.type
+                if placeholder_type == PP_PLACEHOLDER.TITLE:
+                    text_elements.append(f"TITLE: {paragraph_text}")
+                elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                    text_elements.append(f"SECTION_HEADER: {paragraph_text}")
+                else:
+                    text_elements.append(paragraph_text)
+            else:
+                text_elements.append(paragraph_text)
 
     return text_elements

@@ -171,7 +149,7 @@ def handle_tables(shape) -> list[str]:
     for row in grid[1:]:
         line = "|" + "|".join(row) + "|"
         table_text.append(line)
-        print(line)
+        # print(line)
 
     return table_text
 
@@ -207,6 +185,17 @@ def handle_charts(shape) -> list[str]:
     return chart_text
 
 
+def handle_author_notes(slide) -> list[str]:
+
+    notes = []
+    if slide.notes_slide.notes_text_frame:
+        notes_text = slide.notes_slide.notes_text_frame.text.strip()
+        if notes_text:
+            notes.append("Speaker Notes:")
+            notes.append(notes_text)
+    return notes
+
+
 # TODO :azure form reco to extract text from images
 def handle_pictures(shape):
     pass
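handle_author_notes() is invoked once per slide from pptx_to_text_pages() (first hunk above) whenever slide.has_notes_slide is true. A minimal usage sketch, assuming a local .pptx file; the filename is hypothetical:

# sketch: extract slide text, now including speaker notes
with open("deck.pptx", "rb") as f:  # hypothetical file
    pages = pptx_to_text_pages(f)
# one string per slide; notes appear after a "Speaker Notes:" line
print(pages[0])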
21 changes: 17 additions & 4 deletions daras_ai_v2/vector_search.py
@@ -310,6 +310,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
         etag = meta.get("md5Checksum") or meta.get("modifiedTime")
         mime_type = meta["mimeType"]
         total_bytes = int(meta.get("size") or 0)
+        export_links = meta.get("exportLinks", {})
     else:
         try:
             if is_user_uploaded_url(f_url):
@@ -327,6 +328,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
                 mime_type = None
                 etag = None
                 total_bytes = 0
+                export_links = {}
             else:
                 name = (
                     r.headers.get("content-disposition", "")
@@ -338,6 +340,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
                     etag = etag.strip('"')
                 mime_type = get_mimetype_from_response(r)
                 total_bytes = int(r.headers.get("content-length") or 0)
+                export_links = {}
     # extract filename from url as a fallback
     if not name:
         if is_user_uploaded_url(f_url):
@@ -347,9 +350,12 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
     # guess mimetype from name as a fallback
     if not mime_type:
         mime_type = mimetypes.guess_type(name)[0]
-    return FileMetadata(
+
+    file_metadata = FileMetadata(
         name=name, etag=etag, mime_type=mime_type or "", total_bytes=total_bytes
     )
+    file_metadata.export_links = export_links or {}
+    return file_metadata
 
 
 def yt_dlp_get_video_entries(url: str) -> list[dict]:
def yt_dlp_get_video_entries(url: str) -> list[dict]:
Expand Down Expand Up @@ -650,7 +656,10 @@ def doc_url_to_text_pages(
Download document from url and convert to text pages.
"""
f_bytes, mime_type = download_content_bytes(
f_url=f_url, mime_type=file_meta.mime_type, is_user_url=is_user_url
f_url=f_url,
mime_type=file_meta.mime_type,
is_user_url=is_user_url,
export_links=file_meta.export_links,
)
if not f_bytes:
return []
@@ -664,14 +673,18 @@
 
 
 def download_content_bytes(
-    *, f_url: str, mime_type: str, is_user_url: bool = True
+    *,
+    f_url: str,
+    mime_type: str,
+    is_user_url: bool = True,
+    export_links: dict[str, str] = {},
 ) -> tuple[bytes, str]:
     if is_yt_dlp_able_url(f_url):
         return download_youtube_to_wav(f_url), "audio/wav"
     f = furl(f_url)
     if is_gdrive_url(f):
         # download from google drive
-        return gdrive_download(f, mime_type)
+        return gdrive_download(f, mime_type, export_links)
     try:
         # download from url
         if is_user_uploaded_url(f_url):
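Every updated call site follows the same round trip: fetch metadata once, then pass its export_links through to the downloader. A minimal sketch, assuming the two functions above; the URL is hypothetical:

# sketch: metadata -> download round trip used by the call sites in this commit
f_url = "https://docs.google.com/presentation/d/FILE_ID/edit"  # hypothetical URL
file_meta = doc_url_to_file_metadata(f_url)  # populates export_links for gdrive files
f_bytes, mime_type = download_content_bytes(
    f_url=f_url,
    mime_type=file_meta.mime_type,
    export_links=file_meta.export_links,
)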
4 changes: 4 additions & 0 deletions files/models.py
@@ -8,6 +8,10 @@ class FileMetadata(models.Model):
     mime_type = models.CharField(max_length=255, default="", blank=True)
     total_bytes = models.PositiveIntegerField(default=0, blank=True)
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.export_links = {}
+
     def __str__(self):
         ret = f"{self.name or 'Unnamed'} - {self.mime_type}"
         if self.total_bytes:
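Design note: export_links is assigned in __init__ rather than declared as a model field, so it lives only on the in-memory instance and is never persisted or migrated. A minimal sketch of the intended behavior; the mapping below is hypothetical:

# sketch: export_links is a plain instance attribute, not a database column
meta = FileMetadata(name="deck", mime_type="application/vnd.google-apps.presentation")
assert meta.export_links == {}  # default set by __init__
meta.export_links = {  # hypothetical mapping, normally set by doc_url_to_file_metadata()
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "https://..."
}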
2 changes: 1 addition & 1 deletion recipes/BulkRunner.py
@@ -612,7 +612,7 @@ def get_columns(files: list[str]) -> list[str]:
 def read_df_any(f_url: str) -> "pd.DataFrame":
     file_meta = doc_url_to_file_metadata(f_url)
     f_bytes, mime_type = download_content_bytes(
-        f_url=f_url, mime_type=file_meta.mime_type
+        f_url=f_url, mime_type=file_meta.mime_type, export_links=file_meta.export_links
     )
     df = tabular_bytes_to_any_df(
         f_name=file_meta.name, f_bytes=f_bytes, mime_type=mime_type
4 changes: 3 additions & 1 deletion recipes/DocExtract.py
@@ -475,7 +475,9 @@ def process_source(
     elif is_video:
         f = furl(webpage_url)
         if is_gdrive_url(f):
-            f_bytes, _ = gdrive_download(f, doc_meta.mime_type)
+            f_bytes, _ = gdrive_download(
+                f, doc_meta.mime_type, doc_meta.export_links
+            )
             webpage_url = upload_file_from_bytes(
                 doc_meta.name, f_bytes, content_type=doc_meta.mime_type
             )
