From 2555cb4f69c4c106a9cd78431fbc196dc823a324 Mon Sep 17 00:00:00 2001
From: milovate
Date: Sat, 30 Nov 2024 06:52:20 +0530
Subject: [PATCH 1/7] add error handling for Google Drive downloads and update
 MIME type for presentations

---
 daras_ai_v2/gdrive_downloader.py | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py
index 2720b19a9..016dbfa76 100644
--- a/daras_ai_v2/gdrive_downloader.py
+++ b/daras_ai_v2/gdrive_downloader.py
@@ -63,6 +63,7 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str
 def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
     from googleapiclient import discovery
     from googleapiclient.http import MediaIoBaseDownload
+    from googleapiclient.errors import HttpError
 
     # get drive file id
     file_id = url_to_gdrive_file_id(f)
@@ -84,10 +85,28 @@ def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
     # download
     file = io.BytesIO()
     downloader = MediaIoBaseDownload(file, request)
+
     done = False
-    while done is False:
-        _, done = downloader.next_chunk()
-        # print(f"Download {int(status.progress() * 100)}%")
+    try:
+        while done is False:
+            _, done = downloader.next_chunk()
+            # print(f"Download {int(status.progress() * 100)}%")
+    except HttpError as error:
+        if (
+            mime_type
+            == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+        ):
+            # print(f"Error downloading file: {error}. Retrying...")
+            request = service.files().get_media(
+                fileId=file_id,
+                supportsAllDrives=True,
+            )
+            downloader = MediaIoBaseDownload(file, request)
+            done = False
+
+            while done is False:
+                _, done = downloader.next_chunk()
+
     f_bytes = file.getvalue()
     return f_bytes, mime_type
 
@@ -109,8 +128,10 @@ def docs_export_mimetype(f: furl) -> tuple[str, str]:
         mime_type = "text/csv"
         ext = ".csv"
     elif "presentation" in f.path.segments:
-        mime_type = "application/pdf"
-        ext = ".pdf"
+        mime_type = (
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+        )
+        ext = ".pptx"
     elif "drawings" in f.path.segments:
         mime_type = "application/pdf"
         ext = ".pdf"
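Context for the retry above: Drive's files().export_media call converts the file server-side and is subject to Google's export size cap (documented at roughly 10 MB), so exporting a large deck raises an HttpError, while files().get_media streams the file's stored bytes with no such cap — which only helps when the docs.google.com URL points at an uploaded .pptx rather than a native Slides file. A minimal sketch of the failure and fallback, assuming configured credentials; FILE_ID is a placeholder:

    from googleapiclient import discovery
    from googleapiclient.errors import HttpError

    service = discovery.build("drive", "v3")
    try:
        # export is converted server-side and is subject to the export size cap
        data = service.files().export_media(
            fileId="FILE_ID", mimeType="application/pdf"
        ).execute()
    except HttpError:
        # stored bytes of the uploaded file; no conversion, no export cap
        data = service.files().get_media(
            fileId="FILE_ID", supportsAllDrives=True
        ).execute()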
From 9909115ed0df6cd81762d421804aaa3c53357458 Mon Sep 17 00:00:00 2001
From: milovate
Date: Sat, 30 Nov 2024 08:53:55 +0530
Subject: [PATCH 2/7] fix: pptx export limit

---
 daras_ai_v2/gdrive_downloader.py | 46 +++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py
index 016dbfa76..9edc31f3b 100644
--- a/daras_ai_v2/gdrive_downloader.py
+++ b/daras_ai_v2/gdrive_downloader.py
@@ -62,15 +62,25 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str
 
 def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
     from googleapiclient import discovery
-    from googleapiclient.http import MediaIoBaseDownload
-    from googleapiclient.errors import HttpError
 
     # get drive file id
     file_id = url_to_gdrive_file_id(f)
     # get metadata
     service = discovery.build("drive", "v3")
+
+    request, mime_type = service_request(service, file_id, f, mime_type)
+    file_bytes, mime_type = download_blob_file_content(
+        service, request, file_id, f, mime_type
+    )
+
+    return file_bytes, mime_type
+
+
+def service_request(
+    service, file_id: str, f: furl, mime_type: str, retried_request=False
+) -> tuple[any, str]:
     # get files in drive directly
-    if f.host == "drive.google.com":
+    if f.host == "drive.google.com" or retried_request:
         request = service.files().get_media(
             fileId=file_id,
             supportsAllDrives=True,
@@ -82,6 +92,15 @@ def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
             fileId=file_id,
             mimeType=mime_type,
         )
+    return request, mime_type
+
+
+def download_blob_file_content(
+    service, request, file_id: str, f: furl, mime_type: str
+) -> tuple[bytes, str]:
+    from googleapiclient.http import MediaIoBaseDownload
+    from googleapiclient.errors import HttpError
+
     # download
     file = io.BytesIO()
     downloader = MediaIoBaseDownload(file, request)
@@ -92,18 +111,15 @@ def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
             _, done = downloader.next_chunk()
             # print(f"Download {int(status.progress() * 100)}%")
     except HttpError as error:
-        if (
-            mime_type
-            == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
-        ):
-            # print(f"Error downloading file: {error}. Retrying...")
-            request = service.files().get_media(
-                fileId=file_id,
-                supportsAllDrives=True,
+        # retry if error exporting google docs format files e.g .pptx files uploaded to docs.google.com
+        if "presentation" in f.path.segments:
+            # update mime_type to download the file directly
+            mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+            request, _ = service_request(
+                service, file_id, f, mime_type, retried_request=True
             )
             downloader = MediaIoBaseDownload(file, request)
             done = False
-
             while done is False:
                 _, done = downloader.next_chunk()
 
@@ -128,10 +144,8 @@ def docs_export_mimetype(f: furl) -> tuple[str, str]:
         mime_type = "text/csv"
         ext = ".csv"
     elif "presentation" in f.path.segments:
-        mime_type = (
-            "application/vnd.openxmlformats-officedocument.presentationml.presentation"
-        )
-        ext = ".pptx"
+        mime_type = "application/pdf"
+        ext = ".pdf"
     elif "drawings" in f.path.segments:
         mime_type = "application/pdf"
         ext = ".pdf"

From 855adcef5cdaa23630f877f82dd3f53cdc98e246 Mon Sep 17 00:00:00 2001
From: milovate
Date: Mon, 2 Dec 2024 11:38:10 +0530
Subject: [PATCH 3/7] fix: gdocs-docx upload

---
 daras_ai_v2/gdrive_downloader.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py
index 9edc31f3b..398172d97 100644
--- a/daras_ai_v2/gdrive_downloader.py
+++ b/daras_ai_v2/gdrive_downloader.py
@@ -111,7 +111,7 @@ def download_blob_file_content(
             _, done = downloader.next_chunk()
             # print(f"Download {int(status.progress() * 100)}%")
     except HttpError as error:
-        # retry if error exporting google docs format files e.g .pptx files uploaded to docs.google.com
+        # retry if error exporting google docs format files e.g .pptx/.docx files uploaded to docs.google.com
         if "presentation" in f.path.segments:
             # update mime_type to download the file directly
             mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
@@ -123,6 +123,19 @@ def download_blob_file_content(
             done = False
             while done is False:
                 _, done = downloader.next_chunk()
+
+        elif "document" in f.path.segments:
+            mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+            request, _ = service_request(
+                service, file_id, f, mime_type, retried_request=True
+            )
+            downloader = MediaIoBaseDownload(file, request)
+            done = False
+            while done is False:
+                _, done = downloader.next_chunk()
+
+        else:
+            raise error
 
     f_bytes = file.getvalue()
     return f_bytes, mime_type
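After patches 2 and 3, the retry keys off the URL path ("presentation" or "document" segments) rather than the requested MIME type; a sketch of the intended flow for an uploaded .docx served from a docs.google.com URL (FILE_ID is a placeholder):

    from furl import furl
    from daras_ai_v2.gdrive_downloader import docs_export_mimetype, gdrive_download

    f = furl("https://docs.google.com/document/d/FILE_ID/edit")
    mime_type, ext = docs_export_mimetype(f)  # export MIME type chosen from the URL path
    f_bytes, mime_type = gdrive_download(f, mime_type)
    # on an export failure, download_blob_file_content retries via
    # service_request(..., retried_request=True), which switches to
    # files().get_media and the Office MIME type for the stored bytes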
From a2226b879b698004b8a14796d8693d45dfeb5105 Mon Sep 17 00:00:00 2001
From: milovate
Date: Tue, 17 Dec 2024 17:54:47 +0530
Subject: [PATCH 4/7] add export links to download pptx files

---
 daras_ai_v2/gdrive_downloader.py | 72 ++++++++++++++++++--------------
 daras_ai_v2/vector_search.py     | 21 ++++++++--
 files/models.py                  |  4 ++
 3 files changed, 62 insertions(+), 35 deletions(-)

diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py
index 398172d97..1c8fae060 100644
--- a/daras_ai_v2/gdrive_downloader.py
+++ b/daras_ai_v2/gdrive_downloader.py
@@ -1,9 +1,11 @@
 import io
 
 from furl import furl
-
+import requests
+from loguru import logger
 from daras_ai_v2.exceptions import UserError
 from daras_ai_v2.functional import flatmap_parallel
+from daras_ai_v2.exceptions import raise_for_status
 
 
 def is_gdrive_url(f: furl) -> bool:
@@ -62,7 +64,7 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str
     return filter(None, urls)
 
 
-def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
+def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, str]:
     from googleapiclient import discovery
 
     # get drive file id
@@ -70,7 +72,7 @@ def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
 
     request, mime_type = service_request(service, file_id, f, mime_type)
     file_bytes, mime_type = download_blob_file_content(
-        service, request, file_id, f, mime_type
+        service, request, file_id, f, mime_type, export_links
     )
 
     return file_bytes, mime_type
@@ -96,7 +98,7 @@ def service_request(
 
 
 def download_blob_file_content(
-    service, request, file_id: str, f: furl, mime_type: str
+    service, request, file_id: str, f: furl, mime_type: str, export_links: dict
 ) -> tuple[bytes, str]:
     from googleapiclient.http import MediaIoBaseDownload
     from googleapiclient.errors import HttpError
@@ -105,41 +107,47 @@ def download_blob_file_content(
     # download
     file = io.BytesIO()
     downloader = MediaIoBaseDownload(file, request)
 
-    done = False
-    try:
-        while done is False:
-            _, done = downloader.next_chunk()
-            # print(f"Download {int(status.progress() * 100)}%")
-    except HttpError as error:
-        # retry if error exporting google docs format files e.g .pptx/.docx files uploaded to docs.google.com
-        if "presentation" in f.path.segments:
-            # update mime_type to download the file directly
-            mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
-            request, _ = service_request(
-                service, file_id, f, mime_type, retried_request=True
+    if (
+        mime_type
+        == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+    ):
+        # logger.debug(f"Downloading {str(f)!r} using export links")
+        f_url_export = export_links.get(mime_type, None)
+        if f_url_export:
+
+            f_bytes = download_from_exportlinks(f_url_export)
+        else:
+            request = service.files().get_media(
+                fileId=file_id,
+                supportsAllDrives=True,
             )
             downloader = MediaIoBaseDownload(file, request)
+
             done = False
             while done is False:
                 _, done = downloader.next_chunk()
+                # print(f"Download {int(status.progress() * 100)}%")
+            f_bytes = file.getvalue()
 
-        elif "document" in f.path.segments:
-            mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-            request, _ = service_request(
-                service, file_id, f, mime_type, retried_request=True
-            )
-            downloader = MediaIoBaseDownload(file, request)
-            done = False
-            while done is False:
-                _, done = downloader.next_chunk()
+    else:
+        done = False
+        while done is False:
+            _, done = downloader.next_chunk()
+            # print(f"Download {int(status.progress() * 100)}%")
+        f_bytes = file.getvalue()
 
-        else:
-            raise error
-
-    f_bytes = file.getvalue()
     return f_bytes, mime_type
 
 
+def download_from_exportlinks(f: furl) -> bytes:
+    try:
+        r = requests.get(f)
+        f_bytes = r.content
+    except requests.exceptions.RequestException as e:
+        raise_for_status(e)
+    return f_bytes
+
+
 def docs_export_mimetype(f: furl) -> tuple[str, str]:
     """
     return the mimetype to export google docs - https://developers.google.com/drive/api/guides/ref-export-formats
@@ -157,8 +165,10 @@ def docs_export_mimetype(f: furl) -> tuple[str, str]:
         mime_type = "text/csv"
         ext = ".csv"
     elif "presentation" in f.path.segments:
-        mime_type = "application/pdf"
-        ext = ".pdf"
+        mime_type = (
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+        )
+        ext = ".pptx"
     elif "drawings" in f.path.segments:
         mime_type = "application/pdf"
         ext = ".pdf"
@@ -176,7 +186,7 @@ def gdrive_metadata(file_id: str) -> dict:
         .get(
             supportsAllDrives=True,
             fileId=file_id,
-            fields="name,md5Checksum,modifiedTime,mimeType,size",
+            fields="name,md5Checksum,modifiedTime,mimeType,size,exportLinks",
         )
         .execute()
     )
diff --git a/daras_ai_v2/vector_search.py b/daras_ai_v2/vector_search.py
index f78c39260..3d54d5383 100644
--- a/daras_ai_v2/vector_search.py
+++ b/daras_ai_v2/vector_search.py
@@ -310,6 +310,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
         etag = meta.get("md5Checksum") or meta.get("modifiedTime")
         mime_type = meta["mimeType"]
         total_bytes = int(meta.get("size") or 0)
+        export_links = meta.get("exportLinks", {})
     else:
         try:
             if is_user_uploaded_url(f_url):
@@ -327,6 +328,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
             mime_type = None
             etag = None
             total_bytes = 0
+            export_links = {}
         else:
             name = (
                 r.headers.get("content-disposition", "")
@@ -338,6 +340,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
                 etag = etag.strip('"')
             mime_type = get_mimetype_from_response(r)
             total_bytes = int(r.headers.get("content-length") or 0)
+            export_links = {}
     # extract filename from url as a fallback
     if not name:
         if is_user_uploaded_url(f_url):
@@ -347,9 +350,12 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
     # guess mimetype from name as a fallback
     if not mime_type:
         mime_type = mimetypes.guess_type(name)[0]
-    return FileMetadata(
+
+    file_metadata = FileMetadata(
         name=name, etag=etag, mime_type=mime_type or "", total_bytes=total_bytes
     )
+    file_metadata.export_links = export_links or {}
+    return file_metadata
 
 
 def yt_dlp_get_video_entries(url: str) -> list[dict]:
@@ -650,7 +656,10 @@ def doc_url_to_text_pages(
     Download document from url and convert to text pages.
     """
     f_bytes, mime_type = download_content_bytes(
-        f_url=f_url, mime_type=file_meta.mime_type, is_user_url=is_user_url
+        f_url=f_url,
+        mime_type=file_meta.mime_type,
+        is_user_url=is_user_url,
+        export_links=file_meta.export_links,
     )
     if not f_bytes:
         return []
@@ -664,14 +673,18 @@ def doc_url_to_text_pages(
 
 
 def download_content_bytes(
-    *, f_url: str, mime_type: str, is_user_url: bool = True
+    *,
+    f_url: str,
+    mime_type: str,
+    is_user_url: bool = True,
+    export_links: dict[str, str] = {},
 ) -> tuple[bytes, str]:
     if is_yt_dlp_able_url(f_url):
         return download_youtube_to_wav(f_url), "audio/wav"
     f = furl(f_url)
     if is_gdrive_url(f):
         # download from google drive
-        return gdrive_download(f, mime_type)
+        return gdrive_download(f, mime_type, export_links)
     try:
         # download from url
         if is_user_uploaded_url(f_url):
diff --git a/files/models.py b/files/models.py
index 12af91a7a..afb6504cc 100644
--- a/files/models.py
+++ b/files/models.py
@@ -8,6 +8,10 @@ class FileMetadata(models.Model):
     mime_type = models.CharField(max_length=255, default="", blank=True)
     total_bytes = models.PositiveIntegerField(default=0, blank=True)
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.export_links = {}
+
     def __str__(self):
         ret = f"{self.name or 'Unnamed'} - {self.mime_type}"
         if self.total_bytes:
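The exportLinks field requested from gdrive_metadata above is a map from export MIME type to a ready-made export URL; for a Slides file it is roughly shaped like this (illustrative values only, URLs abbreviated):

    export_links = {
        "application/pdf": "https://docs.google.com/feeds/download/presentations/Export?id=...&exportFormat=pdf",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": "https://docs.google.com/feeds/download/presentations/Export?id=...&exportFormat=pptx",
    }

Fetching one of these links directly is the behavior this patch relies on to sidestep the files().export size cap that motivated patch 2.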
""" f_bytes, mime_type = download_content_bytes( - f_url=f_url, mime_type=file_meta.mime_type, is_user_url=is_user_url + f_url=f_url, + mime_type=file_meta.mime_type, + is_user_url=is_user_url, + export_links=file_meta.export_links, ) if not f_bytes: return [] @@ -664,14 +673,18 @@ def doc_url_to_text_pages( def download_content_bytes( - *, f_url: str, mime_type: str, is_user_url: bool = True + *, + f_url: str, + mime_type: str, + is_user_url: bool = True, + export_links: dict[str, str] = {}, ) -> tuple[bytes, str]: if is_yt_dlp_able_url(f_url): return download_youtube_to_wav(f_url), "audio/wav" f = furl(f_url) if is_gdrive_url(f): # download from google drive - return gdrive_download(f, mime_type) + return gdrive_download(f, mime_type, export_links) try: # download from url if is_user_uploaded_url(f_url): diff --git a/files/models.py b/files/models.py index 12af91a7a..afb6504cc 100644 --- a/files/models.py +++ b/files/models.py @@ -8,6 +8,10 @@ class FileMetadata(models.Model): mime_type = models.CharField(max_length=255, default="", blank=True) total_bytes = models.PositiveIntegerField(default=0, blank=True) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.export_links = {} + def __str__(self): ret = f"{self.name or 'Unnamed'} - {self.mime_type}" if self.total_bytes: From a2b4f1848e87cc3103820e6aee36094a89864446 Mon Sep 17 00:00:00 2001 From: milovate Date: Wed, 18 Dec 2024 13:05:42 +0530 Subject: [PATCH 5/7] refactor: export links to handle all google docs --- daras_ai_v2/gdrive_downloader.py | 92 +++++++------------------------- daras_ai_v2/glossary.py | 2 +- recipes/BulkRunner.py | 2 +- recipes/DocExtract.py | 4 +- 4 files changed, 25 insertions(+), 75 deletions(-) diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py index 1c8fae060..3cc41c68f 100644 --- a/daras_ai_v2/gdrive_downloader.py +++ b/daras_ai_v2/gdrive_downloader.py @@ -2,7 +2,7 @@ from furl import furl import requests -from loguru import logger + from daras_ai_v2.exceptions import UserError from daras_ai_v2.functional import flatmap_parallel from daras_ai_v2.exceptions import raise_for_status @@ -64,88 +64,36 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, str]: from googleapiclient import discovery + from googleapiclient.http import MediaIoBaseDownload # get drive file id file_id = url_to_gdrive_file_id(f) # get metadata service = discovery.build("drive", "v3") - request, mime_type = service_request(service, file_id, f, mime_type) - file_bytes, mime_type = download_blob_file_content( - service, request, file_id, f, mime_type, export_links + if f.host != "drive.google.com": + # export google docs to appropriate type + export_mime_type, _ = docs_export_mimetype(f) + if f_url_export := export_links.get(export_mime_type, None): + r = requests.get(f_url_export) + file_bytes = r.content + raise_for_status(r) + return file_bytes, export_mime_type + + request = service.files().get_media( + fileId=file_id, + supportsAllDrives=True, ) - - return file_bytes, mime_type - - -def service_request( - service, file_id: str, f: furl, mime_type: str, retried_request=False -) -> tuple[any, str]: - # get files in drive directly - if f.host == "drive.google.com" or retried_request: - request = service.files().get_media( - fileId=file_id, - supportsAllDrives=True, - ) - # export google docs to appropriate type - else: - mime_type, _ = docs_export_mimetype(f) - request = 
From 7032cb221fdfe01eec0acc460e55eae94fbfd445 Mon Sep 17 00:00:00 2001
From: milovate
Date: Wed, 18 Dec 2024 19:35:54 +0530
Subject: [PATCH 6/7] feat: add author notes

---
 daras_ai_v2/office_utils_pptx.py | 125 ++++++++++++++-----------------
 1 file changed, 57 insertions(+), 68 deletions(-)

diff --git a/daras_ai_v2/office_utils_pptx.py b/daras_ai_v2/office_utils_pptx.py
index e45843e9a..780841424 100644
--- a/daras_ai_v2/office_utils_pptx.py
+++ b/daras_ai_v2/office_utils_pptx.py
@@ -34,7 +34,11 @@ def pptx_to_text_pages(f: typing.BinaryIO, use_form_reco: bool = False) -> list[
             except Exception as e:
                 slide_content.append(f" Error processing shape: {e}")
 
+        if slide.has_notes_slide:
+            slide_content.extend(handle_author_notes(slide))
+
         slides_text.append("\n".join(slide_content) + "\n")
+
     return slides_text
 
 
@@ -43,81 +47,55 @@ def handle_text_elements(shape) -> list[str]:
     Handles text elements within a shape, including lists.
     """
     text_elements = []
-    is_a_list = False
-    is_list_group_created = False
-    enum_list_item_value = 0
-    bullet_type = "None"
-    list_label = "LIST"
     namespaces = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
 
-    # Identify if shape contains lists
+    current_list_type = None
+    list_item_index = 0
+
     for paragraph in shape.text_frame.paragraphs:
         p = paragraph._element
+        paragraph_text = ""
+        is_list_item = False
+
+        # Determine list type
         if p.find(".//a:buChar", namespaces=namespaces) is not None:
-            bullet_type = "Bullet"
-            is_a_list = True
+            current_list_type = "Bullet"
+            is_list_item = True
         elif p.find(".//a:buAutoNum", namespaces=namespaces) is not None:
-            bullet_type = "Numbered"
-            is_a_list = True
+            current_list_type = "Numbered"
+            is_list_item = True
+        elif paragraph.level > 0:  # Indented text is also treated as a list
+            current_list_type = "Bullet"
+            is_list_item = True
         else:
-            is_a_list = False
-
-        if paragraph.level > 0:
-            is_a_list = True
-
-        if is_a_list:
-            if bullet_type == "Numbered":
-                list_label = "ORDERED_LIST"
-
-    # Iterate through paragraphs to build up text
-    for paragraph in shape.text_frame.paragraphs:
-        p = paragraph._element
-        enum_list_item_value += 1
-        inline_paragraph_text = ""
-        inline_list_item_text = ""
-        doc_label = "PARAGRAPH"
-
-        for e in p.iterfind(".//a:r", namespaces=namespaces):
-            if len(e.text.strip()) > 0:
-                e_is_a_list_item = False
-                is_numbered = False
-                if p.find(".//a:buChar", namespaces=namespaces) is not None:
-                    bullet_type = "Bullet"
-                    e_is_a_list_item = True
-                elif p.find(".//a:buAutoNum", namespaces=namespaces) is not None:
-                    bullet_type = "Numbered"
-                    is_numbered = True
-                    e_is_a_list_item = True
-                else:
-                    e_is_a_list_item = False
-
-                if e_is_a_list_item:
-                    if len(inline_paragraph_text) > 0:
-                        text_elements.append(inline_paragraph_text)
-                    inline_list_item_text += e.text
+            current_list_type = None
+            list_item_index = 0  # Reset numbering if no list
+
+        # Process paragraph text
+        for run in p.iterfind(".//a:r", namespaces=namespaces):
+            run_text = run.text.strip() if run.text else ""
+            if run_text:
+                paragraph_text += run_text
+
+        if is_list_item:
+            if current_list_type == "Numbered":
+                list_item_index += 1
+                list_prefix = f"{list_item_index}."
+            else:
+                list_prefix = "•"  # Default bullet symbol
+            text_elements.append(f"{list_prefix} {paragraph_text}")
+        else:
+            # Handle placeholders for titles or subtitles
+            if shape.is_placeholder:
+                placeholder_type = shape.placeholder_format.type
+                if placeholder_type == PP_PLACEHOLDER.TITLE:
+                    text_elements.append(f"TITLE: {paragraph_text}")
+                elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                    text_elements.append(f"SECTION_HEADER: {paragraph_text}")
                 else:
-                    if shape.is_placeholder:
-                        placeholder_type = shape.placeholder_format.type
-                        if placeholder_type in [
-                            PP_PLACEHOLDER.CENTER_TITLE,
-                            PP_PLACEHOLDER.TITLE,
-                        ]:
-                            doc_label = "TITLE"
-                        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
-                            doc_label = "SECTION_HEADER"
-                    enum_list_item_value = 0
-                    inline_paragraph_text += e.text
-
-        if len(inline_paragraph_text) > 0:
-            text_elements.append(inline_paragraph_text)
-
-        if len(inline_list_item_text) > 0:
-            enum_marker = ""
-            if is_numbered:
-                enum_marker = str(enum_list_item_value) + "."
-            if not is_list_group_created:
-                is_list_group_created = True
-            text_elements.append(f"{enum_marker} {inline_list_item_text}")
+                    text_elements.append(paragraph_text)
+            else:
+                text_elements.append(paragraph_text)
 
     return text_elements
 
@@ -171,7 +149,7 @@ def handle_tables(shape) -> list[str]:
     for row in grid[1:]:
         line = "|" + "|".join(row) + "|"
         table_text.append(line)
-        print(line)
+        # print(line)
 
     return table_text
 
@@ -207,6 +185,17 @@ def handle_charts(shape) -> list[str]:
     return chart_text
 
 
+def handle_author_notes(slide) -> list[str]:
+
+    notes = []
+    if slide.notes_slide.notes_text_frame:
+        notes_text = slide.notes_slide.notes_text_frame.text.strip()
+        if notes_text:
+            notes.append("Speaker Notes:")
+            notes.append(notes_text)
+    return notes
+
+
 # TODO :azure form reco to extract text from images
 def handle_pictures(shape):
     pass
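Illustrative output of pptx_to_text_pages after this patch, assuming a one-slide deck with a title placeholder, two bullets, and a speaker note (deck.pptx is a placeholder path, and the slide text is invented for the example):

    with open("deck.pptx", "rb") as f:
        pages = pptx_to_text_pages(f)
    print(pages[0])
    # TITLE: Quarterly Review
    # • Revenue up
    # • Costs down
    # Speaker Notes:
    # Remember to mention hiring.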
From cc67f670e80d52d731920e32e2d84a390549ae37 Mon Sep 17 00:00:00 2001
From: milovate
Date: Fri, 20 Dec 2024 17:42:57 +0530
Subject: [PATCH 7/7] fix: update error handling

---
 daras_ai_v2/gdrive_downloader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py
index 3cc41c68f..3f18f7e1d 100644
--- a/daras_ai_v2/gdrive_downloader.py
+++ b/daras_ai_v2/gdrive_downloader.py
@@ -77,7 +77,7 @@ def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes,
         if f_url_export := export_links.get(export_mime_type, None):
             r = requests.get(f_url_export)
             file_bytes = r.content
-            raise_for_status(r)
+            raise_for_status(r, is_user_url=True)
             return file_bytes, export_mime_type
 
     request = service.files().get_media(
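With the full series applied, a document download flows through the export links end to end; a sketch of the call path, using the function names from the diffs above and a placeholder URL:

    from daras_ai_v2.vector_search import doc_url_to_file_metadata, download_content_bytes

    url = "https://docs.google.com/presentation/d/FILE_ID/edit"
    meta = doc_url_to_file_metadata(url)  # fills meta.export_links via gdrive_metadata
    f_bytes, mime_type = download_content_bytes(
        f_url=url, mime_type=meta.mime_type, export_links=meta.export_links
    )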