From 47ff419775ceb267ea5a44d5eaf21889dbae587c Mon Sep 17 00:00:00 2001 From: rjzhb Date: Mon, 22 Apr 2024 09:51:50 +0000 Subject: [PATCH 1/5] add [Feature Request]: Read binary data (bytes) #12878 --- .../llama_index/core/readers/file/base.py | 81 +++++++++++++++++++ .../tests/readers/test_load_reader.py | 29 ++++++- 2 files changed, 109 insertions(+), 1 deletion(-) diff --git a/llama-index-core/llama_index/core/readers/file/base.py b/llama-index-core/llama_index/core/readers/file/base.py index 4da0885bf756e..b49a4926a4412 100644 --- a/llama-index-core/llama_index/core/readers/file/base.py +++ b/llama-index-core/llama_index/core/readers/file/base.py @@ -19,6 +19,30 @@ from llama_index.core.schema import Document from tqdm import tqdm +import magic +from io import BytesIO + +mime_to_extension = { + 'application/pdf': '.pdf', + 'image/jpeg': '.jpg', + 'image/png': '.png', + 'text/plain': '.txt', + 'text/csv': '.csv', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx', + 'application/vnd.ms-powerpoint': '.ppt', + 'application/vnd.ms-powerpoint.presentation.macroenabled.12': '.pptm', + 'application/vnd.hwp': '.hwp', + 'application/epub+zip': '.epub', + 'text/markdown': '.md', + 'application/mbox': '.mbox', + 'application/x-ipynb+json': '.ipynb', + 'audio/mpeg': '.mp3', + 'video/mp4': '.mp4', + 'image/jpeg': '.jpeg' +} + + def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]: try: @@ -642,3 +666,60 @@ def iter_data( if len(documents) > 0: yield documents + + + @staticmethod + def load_file_from_binary( + binary_data, + encoding: str = "utf-8", + errors: str = "ignore", + raise_on_error: bool = False, + ): + default_file_reader_cls = SimpleDirectoryReader.supported_suffix_fn() + default_file_reader_suffix = list(default_file_reader_cls.keys()) + documents: List[Document] = [] + metadata: Optional[dict] = None + + # use magic to get MIME type from binary data + mime_type = magic.from_buffer(binary_data, mime=True) + file_suffix = mime_to_extension.get(mime_type, '.bin') + + # use BytesIO to simulate file operation + fake_file = BytesIO(binary_data) + + if file_suffix in default_file_reader_suffix: + # use file readers + # instantiate file reader if not already + reader_cls = default_file_reader_cls[file_suffix] + reader = reader_cls() + + # load data -- catch all errors except for ImportError + try: + # use file readers + documents = reader.load_data(fake_file) + except ImportError as e: + # ensure that ImportError is raised so user knows + # about missing dependencies + raise ImportError(str(e)) + except Exception as e: + if raise_on_error: + raise Exception("Error loading file") from e + # otherwise, just skip the file and report the error + print( + f"Failed to load binary data {binary_data} with error: {e}. Skipping...", + flush=True, + ) + return [] + + else: + # do standard read + try: + fake_file.seek(0) + data = fake_file.read().decode(encoding=encoding, errors=errors) + doc = Document(text=data, metadata=metadata or {}) + documents.append(doc) + except Exception as e: + print(f"Failed to read or decode file: {e}") + return [] + + return documents diff --git a/llama-index-core/tests/readers/test_load_reader.py b/llama-index-core/tests/readers/test_load_reader.py index be0f1cdcd02cf..0ddfe885dc1b6 100644 --- a/llama-index-core/tests/readers/test_load_reader.py +++ b/llama-index-core/tests/readers/test_load_reader.py @@ -1,8 +1,9 @@ from typing import cast +from llama_index.core import SimpleDirectoryReader from llama_index.core.readers.loading import load_reader from llama_index.core.readers.string_iterable import StringIterableReader - +import magic def test_loading_readers() -> None: string_iterable = StringIterableReader() @@ -14,3 +15,29 @@ def test_loading_readers() -> None: ) assert loaded_string_iterable.is_remote == string_iterable.is_remote + +def test_load_text_file(): + # Create a sample text binary data + text_data = "Hello, this is a test.".encode('utf-8') + # Mock the MIME type identification to return 'text/plain' + magic.from_buffer = lambda x, mime: 'text/plain' + + # Call your function + documents = SimpleDirectoryReader.load_file_from_binary(text_data) + + # Assert that the document contains correct text + assert documents[0].text == "Hello, this is a test." + assert len(documents) == 1 + +def test_load_unsupported_file_type(): + # Create binary data for a non-text type that is not supported + binary_data = b'\x00\x01\x02\x03\x04' + # Mock the MIME type identification to return an unsupported type + magic.from_buffer = lambda x, mime: 'application/octet-stream' + + # Call your function, which should try to decode as text + documents = SimpleDirectoryReader.load_file_from_binary(binary_data) + + # Assert documents are attempted to be created as text (may result in gibberish or empty) + assert len(documents) == 1 + assert len(documents[0].text) >= 0 From bf662db08ca9772e3ce1a615ebe5ad8e3d56cf09 Mon Sep 17 00:00:00 2001 From: rjzhb Date: Mon, 22 Apr 2024 09:59:36 +0000 Subject: [PATCH 2/5] add [Feature Request]: Read binary data (bytes) #12878 --- llama-index-core/tests/readers/test_load_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama-index-core/tests/readers/test_load_reader.py b/llama-index-core/tests/readers/test_load_reader.py index 0ddfe885dc1b6..229ba9b797550 100644 --- a/llama-index-core/tests/readers/test_load_reader.py +++ b/llama-index-core/tests/readers/test_load_reader.py @@ -16,7 +16,7 @@ def test_loading_readers() -> None: assert loaded_string_iterable.is_remote == string_iterable.is_remote -def test_load_text_file(): +def test_load_binary_data_file(): # Create a sample text binary data text_data = "Hello, this is a test.".encode('utf-8') # Mock the MIME type identification to return 'text/plain' @@ -29,7 +29,7 @@ def test_load_text_file(): assert documents[0].text == "Hello, this is a test." assert len(documents) == 1 -def test_load_unsupported_file_type(): +def test_load_unsupported_binary_data_file_type(): # Create binary data for a non-text type that is not supported binary_data = b'\x00\x01\x02\x03\x04' # Mock the MIME type identification to return an unsupported type From 6b4f10134f414609ec9c7d44e0f5677ae083e002 Mon Sep 17 00:00:00 2001 From: rjzhb Date: Mon, 22 Apr 2024 10:05:50 +0000 Subject: [PATCH 3/5] add [Feature Request]: Read binary data (bytes) #12878 --- llama-index-core/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/llama-index-core/pyproject.toml b/llama-index-core/pyproject.toml index 78da78d128728..e8de7d96a6c1c 100644 --- a/llama-index-core/pyproject.toml +++ b/llama-index-core/pyproject.toml @@ -53,6 +53,7 @@ deprecated = ">=1.2.9.3" fsspec = ">=2023.5.0" httpx = "*" langchain = {optional = true, version = ">=0.0.303"} +magic = "^0.4.22" # Added Python magic package nest-asyncio = "^1.5.8" nltk = "^3.8.1" numpy = "*" From 9167d19808d4694d5e436295b419d6efe2293f4a Mon Sep 17 00:00:00 2001 From: rjzhb Date: Mon, 22 Apr 2024 12:06:43 +0000 Subject: [PATCH 4/5] add [Feature Request]: Read binary data (bytes) #12878 --- .../llama_index/core/readers/file/base.py | 104 ++++++++---------- llama-index-core/poetry.lock | 50 ++++++++- llama-index-core/pyproject.toml | 3 +- .../tests/readers/test_load_reader.py | 35 ++++-- 4 files changed, 123 insertions(+), 69 deletions(-) diff --git a/llama-index-core/llama_index/core/readers/file/base.py b/llama-index-core/llama_index/core/readers/file/base.py index b49a4926a4412..a0b7591a58859 100644 --- a/llama-index-core/llama_index/core/readers/file/base.py +++ b/llama-index-core/llama_index/core/readers/file/base.py @@ -4,6 +4,7 @@ import logging import mimetypes import multiprocessing +import tempfile import warnings from datetime import datetime from functools import reduce @@ -22,26 +23,36 @@ import magic from io import BytesIO -mime_to_extension = { - 'application/pdf': '.pdf', - 'image/jpeg': '.jpg', - 'image/png': '.png', - 'text/plain': '.txt', - 'text/csv': '.csv', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx', - 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx', - 'application/vnd.ms-powerpoint': '.ppt', - 'application/vnd.ms-powerpoint.presentation.macroenabled.12': '.pptm', - 'application/vnd.hwp': '.hwp', - 'application/epub+zip': '.epub', - 'text/markdown': '.md', - 'application/mbox': '.mbox', - 'application/x-ipynb+json': '.ipynb', - 'audio/mpeg': '.mp3', - 'video/mp4': '.mp4', - 'image/jpeg': '.jpeg' -} - +def _try_loading_file_extension_by_mime_type() -> Dict[str, str]: + """ + Returns a dictionary mapping MIME types to their corresponding file extensions. + Attempts to import the 'magic' module, which is used for file type identification. + """ + try: + import magic + except ImportError: + raise ImportError("The 'magic' module is not installed. Please install it to enable MIME type detection.") + + mime_to_extension = { + 'application/pdf': '.pdf', + 'image/jpeg': '.jpg', + 'image/png': '.png', + 'text/plain': '.txt', + 'text/csv': '.csv', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx', + 'application/vnd.ms-powerpoint': '.ppt', + 'application/vnd.ms-powerpoint.presentation.macroenabled.12': '.pptm', + 'application/vnd.hwp': '.hwp', + 'application/epub+zip': '.epub', + 'text/markdown': '.md', + 'application/mbox': '.mbox', + 'application/x-ipynb+json': '.ipynb', + 'audio/mpeg': '.mp3', + 'video/mp4': '.mp4', + 'image/jpeg': '.jpeg' # This entry will take precedence over the previous '.jpg' entry for 'image/jpeg' + } + return mime_to_extension def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]: @@ -196,6 +207,7 @@ class SimpleDirectoryReader(BaseReader): """ supported_suffix_fn: Callable = _try_loading_included_file_formats + mime_types_fn: Callable = _try_loading_file_extension_by_mime_type def __init__( self, @@ -675,51 +687,25 @@ def load_file_from_binary( errors: str = "ignore", raise_on_error: bool = False, ): - default_file_reader_cls = SimpleDirectoryReader.supported_suffix_fn() - default_file_reader_suffix = list(default_file_reader_cls.keys()) + default_mime_types_map = SimpleDirectoryReader.mime_types_fn() documents: List[Document] = [] - metadata: Optional[dict] = None # use magic to get MIME type from binary data mime_type = magic.from_buffer(binary_data, mime=True) - file_suffix = mime_to_extension.get(mime_type, '.bin') - - # use BytesIO to simulate file operation - fake_file = BytesIO(binary_data) + file_suffix = default_mime_types_map.get(mime_type, '.bin') - if file_suffix in default_file_reader_suffix: - # use file readers - # instantiate file reader if not already - reader_cls = default_file_reader_cls[file_suffix] - reader = reader_cls() + try: + # save a tempfile + with tempfile.NamedTemporaryFile(suffix=file_suffix, delete=False) as temp_file: + temp_file.write(binary_data) + temp_file.flush() + temp_filename = temp_file.name - # load data -- catch all errors except for ImportError - try: - # use file readers - documents = reader.load_data(fake_file) - except ImportError as e: - # ensure that ImportError is raised so user knows - # about missing dependencies - raise ImportError(str(e)) - except Exception as e: - if raise_on_error: - raise Exception("Error loading file") from e - # otherwise, just skip the file and report the error - print( - f"Failed to load binary data {binary_data} with error: {e}. Skipping...", - flush=True, - ) - return [] + documents = SimpleDirectoryReader.load_file(Path(temp_filename), None, {}) - else: - # do standard read - try: - fake_file.seek(0) - data = fake_file.read().decode(encoding=encoding, errors=errors) - doc = Document(text=data, metadata=metadata or {}) - documents.append(doc) - except Exception as e: - print(f"Failed to read or decode file: {e}") - return [] + finally: + # Ensure the temporary file is deleted + if os.path.exists(temp_filename): + os.remove(temp_filename) return documents diff --git a/llama-index-core/poetry.lock b/llama-index-core/poetry.lock index 78559fb3f7fd0..46ba72a351d4a 100644 --- a/llama-index-core/poetry.lock +++ b/llama-index-core/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "accelerate" @@ -744,6 +744,17 @@ files = [ {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, ] +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + [[package]] name = "charset-normalizer" version = "3.3.2" @@ -4640,6 +4651,7 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -4648,6 +4660,8 @@ files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -5193,6 +5207,17 @@ files = [ {file = "python_json_logger-2.0.7-py3-none-any.whl", hash = "sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd"}, ] +[[package]] +name = "python-magic" +version = "0.4.27" +description = "File type identification using libmagic" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "python-magic-0.4.27.tar.gz", hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b"}, + {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, +] + [[package]] name = "pytz" version = "2024.1" @@ -5267,6 +5292,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -5591,6 +5617,26 @@ files = [ {file = "regex-2024.4.16.tar.gz", hash = "sha256:fa454d26f2e87ad661c4f0c5a5fe4cf6aab1e307d1b94f16ffdfcb089ba685c0"}, ] +[[package]] +name = "reportlab" +version = "4.2.0" +description = "The Reportlab Toolkit" +optional = false +python-versions = "<4,>=3.7" +files = [ + {file = "reportlab-4.2.0-py3-none-any.whl", hash = "sha256:53630f9d25a7938def3e6a93d723b72a7a5921d34d23cf7a0930adeb2cb0e6c1"}, + {file = "reportlab-4.2.0.tar.gz", hash = "sha256:474fb28d63431a5d47d75c90d580393050df7d491a09c7877df3291a2e9f6d0a"}, +] + +[package.dependencies] +chardet = "*" +pillow = ">=9.0.0" + +[package.extras] +accel = ["rl-accel (>=0.9.0,<1.1)"] +pycairo = ["freetype-py (>=2.3.0,<2.4)", "rlPyCairo (>=0.2.0,<1)"] +renderpm = ["rl-renderPM (>=4.0.3,<4.1)"] + [[package]] name = "requests" version = "2.31.0" @@ -8114,4 +8160,4 @@ query-tools = ["guidance", "jsonpath-ng", "lm-format-enforcer", "rank-bm25", "sc [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "df2c4bac840de5482a6b2304c5db1c3110914a53c24af7107747972d641f9b85" +content-hash = "65ffffdf23a6fb8f81de1f04d29acadee749210aff40fdab35270fe5f7cf6556" diff --git a/llama-index-core/pyproject.toml b/llama-index-core/pyproject.toml index e8de7d96a6c1c..0ecaa74427d7e 100644 --- a/llama-index-core/pyproject.toml +++ b/llama-index-core/pyproject.toml @@ -53,7 +53,7 @@ deprecated = ">=1.2.9.3" fsspec = ">=2023.5.0" httpx = "*" langchain = {optional = true, version = ">=0.0.303"} -magic = "^0.4.22" # Added Python magic package +python-magic = "^0.4.22" # Added Python magic package nest-asyncio = "^1.5.8" nltk = "^3.8.1" numpy = "*" @@ -86,6 +86,7 @@ pillow = ">=9.0.0" PyYAML = ">=6.0.1" llamaindex-py-client = "^0.1.18" wrapt = "*" +reportlab = "*" [tool.poetry.extras] gradientai = [ diff --git a/llama-index-core/tests/readers/test_load_reader.py b/llama-index-core/tests/readers/test_load_reader.py index 229ba9b797550..3994ddaaa3f51 100644 --- a/llama-index-core/tests/readers/test_load_reader.py +++ b/llama-index-core/tests/readers/test_load_reader.py @@ -1,3 +1,7 @@ +import io + +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas from typing import cast from llama_index.core import SimpleDirectoryReader @@ -17,16 +21,33 @@ def test_loading_readers() -> None: assert loaded_string_iterable.is_remote == string_iterable.is_remote def test_load_binary_data_file(): - # Create a sample text binary data - text_data = "Hello, this is a test.".encode('utf-8') - # Mock the MIME type identification to return 'text/plain' - magic.from_buffer = lambda x, mime: 'text/plain' + # Create a BytesIO object to store the PDF data in memory + pdf_bytes = io.BytesIO() + + # Create a PDF canvas + c = canvas.Canvas(pdf_bytes, pagesize=letter) + + # Add text content to the PDF + c.drawString(100, 750, "Hello, this is a PDF file.") + + # Close the PDF canvas + c.save() + + # Reset the position pointer of the BytesIO object to the beginning + pdf_bytes.seek(0) + + # Read the binary data of the PDF + pdf_data = pdf_bytes.read() + + + # Mock the MIME type identification to return 'application/pdf' + magic.from_buffer = lambda x, mime: 'application/pdf' - # Call your function - documents = SimpleDirectoryReader.load_file_from_binary(text_data) + # Call the function under test + documents = SimpleDirectoryReader.load_file_from_binary(pdf_data) # Assert that the document contains correct text - assert documents[0].text == "Hello, this is a test." + assert documents[0].text == "Hello, this is a PDF file.\n" assert len(documents) == 1 def test_load_unsupported_binary_data_file_type(): From 4fd0f4ac72a563cb1cf51555c6349b8fff445b80 Mon Sep 17 00:00:00 2001 From: rjzhb Date: Mon, 22 Apr 2024 12:08:27 +0000 Subject: [PATCH 5/5] update --- llama-index-core/llama_index/core/readers/file/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama-index-core/llama_index/core/readers/file/base.py b/llama-index-core/llama_index/core/readers/file/base.py index a0b7591a58859..919b82d6732fd 100644 --- a/llama-index-core/llama_index/core/readers/file/base.py +++ b/llama-index-core/llama_index/core/readers/file/base.py @@ -5,6 +5,7 @@ import mimetypes import multiprocessing import tempfile +import magic import warnings from datetime import datetime from functools import reduce @@ -20,8 +21,6 @@ from llama_index.core.schema import Document from tqdm import tqdm -import magic -from io import BytesIO def _try_loading_file_extension_by_mime_type() -> Dict[str, str]: """