diff --git a/llama-index-core/llama_index/core/readers/file/base.py b/llama-index-core/llama_index/core/readers/file/base.py index 4da0885bf756e..919b82d6732fd 100644 --- a/llama-index-core/llama_index/core/readers/file/base.py +++ b/llama-index-core/llama_index/core/readers/file/base.py @@ -4,6 +4,8 @@ import logging import mimetypes import multiprocessing +import tempfile +import magic import warnings from datetime import datetime from functools import reduce @@ -20,6 +22,38 @@ from tqdm import tqdm +def _try_loading_file_extension_by_mime_type() -> Dict[str, str]: + """ + Returns a dictionary mapping MIME types to their corresponding file extensions. + Attempts to import the 'magic' module, which is used for file type identification. + """ + try: + import magic + except ImportError: + raise ImportError("The 'magic' module is not installed. Please install it to enable MIME type detection.") + + mime_to_extension = { + 'application/pdf': '.pdf', + 'image/jpeg': '.jpg', + 'image/png': '.png', + 'text/plain': '.txt', + 'text/csv': '.csv', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx', + 'application/vnd.ms-powerpoint': '.ppt', + 'application/vnd.ms-powerpoint.presentation.macroenabled.12': '.pptm', + 'application/vnd.hwp': '.hwp', + 'application/epub+zip': '.epub', + 'text/markdown': '.md', + 'application/mbox': '.mbox', + 'application/x-ipynb+json': '.ipynb', + 'audio/mpeg': '.mp3', + 'video/mp4': '.mp4', + 'image/jpeg': '.jpeg' # This entry will take precedence over the previous '.jpg' entry for 'image/jpeg' + } + return mime_to_extension + + def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]: try: from llama_index.readers.file import ( @@ -172,6 +206,7 @@ class SimpleDirectoryReader(BaseReader): """ supported_suffix_fn: Callable = _try_loading_included_file_formats + mime_types_fn: Callable = 
@staticmethod
def load_file_from_binary(
    binary_data: bytes,
    encoding: str = "utf-8",
    errors: str = "ignore",
    raise_on_error: bool = False,
) -> List[Document]:
    """
    Load documents from raw bytes by sniffing their MIME type.

    The bytes are written to a temporary file whose suffix is derived from
    the detected MIME type, the file is handed to
    ``SimpleDirectoryReader.load_file``, and the temporary file is removed
    again whether or not loading succeeds.

    Args:
        binary_data (bytes): Raw content of the file to load.
        encoding (str): Forwarded to ``load_file``. Defaults to "utf-8".
        errors (str): Forwarded to ``load_file``. Defaults to "ignore".
        raise_on_error (bool): Forwarded to ``load_file``. Defaults to False.

    Returns:
        List[Document]: Whatever ``load_file`` produced for the temporary file.
    """
    mime_to_suffix = SimpleDirectoryReader.mime_types_fn()

    # Sniff the MIME type from the content; unknown types fall back to
    # '.bin' and reader selection is left to load_file.
    mime_type = magic.from_buffer(binary_data, mime=True)
    file_suffix = mime_to_suffix.get(mime_type, ".bin")

    # Capture the path *before* anything can fail so the cleanup below never
    # hits an unbound name and the delete=False file is never leaked.
    temp_file = tempfile.NamedTemporaryFile(suffix=file_suffix, delete=False)
    temp_path = Path(temp_file.name)
    try:
        # Leaving the `with` block flushes and closes the handle, so the
        # reader opens a fully written file.
        with temp_file:
            temp_file.write(binary_data)

        return SimpleDirectoryReader.load_file(
            temp_path,
            None,
            {},
            encoding=encoding,
            errors=errors,
            raise_on_error=raise_on_error,
        )
    finally:
        temp_path.unlink(missing_ok=True)
[[package]] name = "accelerate" @@ -744,6 +744,17 @@ files = [ {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, ] +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + [[package]] name = "charset-normalizer" version = "3.3.2" @@ -4640,6 +4651,7 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -4648,6 +4660,8 @@ files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = 
"psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -5193,6 +5207,17 @@ files = [ {file = "python_json_logger-2.0.7-py3-none-any.whl", hash = "sha256:f380b826a991ebbe3de4d897aeec42760035ac760345e57b812938dc8b35e2bd"}, ] +[[package]] +name = "python-magic" +version = "0.4.27" +description = "File type identification using libmagic" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "python-magic-0.4.27.tar.gz", hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b"}, + {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, +] + [[package]] name = "pytz" version = "2024.1" @@ -5267,6 +5292,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", 
hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -5591,6 +5617,26 @@ files = [ {file = "regex-2024.4.16.tar.gz", hash = "sha256:fa454d26f2e87ad661c4f0c5a5fe4cf6aab1e307d1b94f16ffdfcb089ba685c0"}, ] +[[package]] +name = "reportlab" +version = "4.2.0" +description = "The Reportlab Toolkit" +optional = false +python-versions = "<4,>=3.7" +files = [ + {file = "reportlab-4.2.0-py3-none-any.whl", hash = "sha256:53630f9d25a7938def3e6a93d723b72a7a5921d34d23cf7a0930adeb2cb0e6c1"}, + {file = "reportlab-4.2.0.tar.gz", hash = "sha256:474fb28d63431a5d47d75c90d580393050df7d491a09c7877df3291a2e9f6d0a"}, +] + +[package.dependencies] +chardet = "*" +pillow = ">=9.0.0" + +[package.extras] +accel = ["rl-accel (>=0.9.0,<1.1)"] +pycairo = ["freetype-py (>=2.3.0,<2.4)", "rlPyCairo (>=0.2.0,<1)"] +renderpm = ["rl-renderPM (>=4.0.3,<4.1)"] + [[package]] name = "requests" version = "2.31.0" @@ -8114,4 +8160,4 @@ query-tools = ["guidance", "jsonpath-ng", "lm-format-enforcer", "rank-bm25", "sc [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "df2c4bac840de5482a6b2304c5db1c3110914a53c24af7107747972d641f9b85" +content-hash = "65ffffdf23a6fb8f81de1f04d29acadee749210aff40fdab35270fe5f7cf6556" 
def test_load_binary_data_file(monkeypatch) -> None:
    """Load an in-memory PDF via ``load_file_from_binary`` and check its text."""
    # Render a one-page PDF straight into memory.
    pdf_buffer = io.BytesIO()
    pdf_canvas = canvas.Canvas(pdf_buffer, pagesize=letter)
    pdf_canvas.drawString(100, 750, "Hello, this is a PDF file.")
    pdf_canvas.save()
    pdf_data = pdf_buffer.getvalue()

    # Pin MIME detection to 'application/pdf'. Using the monkeypatch fixture
    # (instead of assigning to magic.from_buffer directly) restores the real
    # function after the test, so the stub cannot leak into other tests.
    monkeypatch.setattr(magic, "from_buffer", lambda x, mime: "application/pdf")

    documents = SimpleDirectoryReader.load_file_from_binary(pdf_data)

    # Check the count first so a wrong length fails with a clear assertion
    # rather than an IndexError.
    assert len(documents) == 1
    assert documents[0].text == "Hello, this is a PDF file.\n"


def test_load_unsupported_binary_data_file_type(monkeypatch) -> None:
    """Bytes with an unmapped MIME type still yield exactly one document."""
    binary_data = b"\x00\x01\x02\x03\x04"

    # Pin MIME detection to a type that has no entry in the suffix mapping.
    monkeypatch.setattr(
        magic, "from_buffer", lambda x, mime: "application/octet-stream"
    )

    documents = SimpleDirectoryReader.load_file_from_binary(binary_data)

    # The content may decode to gibberish or an empty string; only the shape
    # of the result is asserted.
    assert len(documents) == 1
    assert isinstance(documents[0].text, str)