Skip to content

Commit

Permalink
feat: improve type
Browse files Browse the repository at this point in the history
  • Loading branch information
himself65 committed Nov 7, 2024
1 parent 960a06a commit 0de0b68
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 27 deletions.
20 changes: 19 additions & 1 deletion llama-index-core/llama_index/core/readers/file/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,25 @@ class DirectoryReaderArgs(BaseDirectoryReaderArgs):
num_files_limit: NotRequired[Optional[int]]


class SimpleDirectoryReader(BaseReader, ResourcesReaderMixin, FileSystemReaderMixin):
class DirectoryReaderData:
"""
Base data for directory readers.

Declares the directory-reader settings as class-level attributes so that
reader classes can inherit them as a mixin. Every attribute is annotated
``Optional[...]`` and defaults to ``None``.
"""

# NOTE(review): this is a plain class with annotated class attributes, not a
# pydantic model. Confirm that these annotations are actually picked up as
# model fields when the class is mixed into pydantic-based readers.
# NOTE(review): the attribute names presumably mirror the keys of
# ``DirectoryReaderArgs`` (e.g. ``num_files_limit``); verify against that
# TypedDict and keep the two in sync.

# File-selection settings (meanings inferred from the names only; TODO confirm).
exclude: Optional[List] = None
exclude_hidden: Optional[bool] = None
# Text-decoding settings (presumably forwarded to file opening; TODO confirm).
encoding: Optional[str] = None
errors: Optional[str] = None
# Traversal / identification settings (TODO confirm semantics against the reader).
recursive: Optional[bool] = None
filename_as_id: Optional[bool] = None
required_exts: Optional[List[str]] = None
# Error-handling and limit settings (TODO confirm semantics against the reader).
raise_on_error: Optional[bool] = None
num_files_limit: Optional[int] = None


class SimpleDirectoryReader(
BaseReader, ResourcesReaderMixin, FileSystemReaderMixin, DirectoryReaderData
):
"""
Simple directory reader.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
import logging
import tempfile
from typing import List, Optional, Dict, Any, Union
from typing import List, Optional, Dict, Any
from pathlib import Path
from abc import abstractmethod

from llama_index.core.readers import SimpleDirectoryReader, FileSystemReaderMixin
from typing_extensions import Unpack

from llama_index.core.readers import (
SimpleDirectoryReader,
FileSystemReaderMixin,
DirectoryReaderArgs,
)
from llama_index.core.readers.base import (
BaseReader,
ResourcesReaderMixin,
)
from llama_index.core.schema import Document
from llama_index.core.bridge.pydantic import Field

from llama_index.readers.box.BoxAPI.box_api import (
add_extra_header_to_box_client,
Expand Down Expand Up @@ -47,7 +52,9 @@ def class_name(cls) -> str:
def __init__(
self,
box_client: BoxClient,
**kwargs: Unpack[DirectoryReaderArgs],
):
"""
Initialize the reader with a Box client and directory-reader options.

Args:
    box_client (BoxClient): Client object; it is passed through
        ``add_extra_header_to_box_client`` and the result is stored on
        ``self._box_client``.
    **kwargs: Keyword arguments typed as ``DirectoryReaderArgs``; they
        are forwarded unchanged to the parent class initializer.
"""
# Parent initializer runs first, before the client attribute is assigned.
super().__init__(**kwargs)
# Store the client returned by the header helper, not the raw argument.
self._box_client = add_extra_header_to_box_client(box_client)

@abstractmethod
Expand Down Expand Up @@ -266,23 +273,15 @@ class BoxReader(BoxReaderBase):
Attributes:
_box_client (BoxClient): An authenticated Box client object used
for interacting with the Box API.
file_extractor (Optional[Dict[str, Union[str, BaseReader]]], optional):
A dictionary mapping file extensions or mimetypes to either a string
specifying a custom extractor function or another BaseReader subclass
for handling specific file formats. Defaults to None.
**kwargs: Additional keyword arguments passed to SimpleDirectoryReader.
"""

file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = Field(
default=None, exclude=True
)

def __init__(
self,
box_client: BoxClient,
file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,
**kwargs: Unpack[DirectoryReaderArgs],
):
super().__init__(box_client=box_client)
self.file_extractor = file_extractor
super().__init__(box_client=box_client, **kwargs)

def load_data(
self,
Expand Down Expand Up @@ -345,7 +344,7 @@ def get_metadata(filename: str) -> Any:
simple_loader = SimpleDirectoryReader(
input_dir=temp_dir,
file_metadata=get_metadata,
file_extractor=self.file_extractor,
**self.model_dump(),
)
return simple_loader.load_data()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from llama_index.core.readers import SimpleDirectoryReader, DirectoryReaderArgs
from llama_index.core.readers.base import BasePydanticReader
from llama_index.core.readers.file.base import DirectoryReaderData
from llama_index.core.schema import Document
from llama_index.core.bridge.pydantic import PrivateAttr, BaseModel
from llama_index.core.readers import FileSystemReaderMixin
Expand All @@ -32,7 +33,9 @@ class _OneDriveResourcePayload(BaseModel):
downloaded_file_path: Optional[str]


class OneDriveReader(BasePydanticReader, ResourcesReaderMixin, FileSystemReaderMixin):
class OneDriveReader(
BasePydanticReader, ResourcesReaderMixin, FileSystemReaderMixin, DirectoryReaderData
):
"""
Microsoft OneDrive reader.
Expand All @@ -54,7 +57,7 @@ class OneDriveReader(BasePydanticReader, ResourcesReaderMixin, FileSystemReaderM
:param file_paths (List[str], optional): List of specific file paths to download. Will be used if the parameter is not provided when calling load_data().
:param file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file extension to a BaseReader class that specifies how to convert that file to text.
See `SimpleDirectoryReader` for more details.
:param required_exts (Optional[List[str]]): List of required extensions. Default is None.
:param **kwargs (Unpack[DirectoryReaderArgs]): Additional arguments to pass to the directory reader.
For interactive authentication to work, a browser is used to authenticate, hence the registered application should have a redirect URI set to 'https://localhost'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union

from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.readers.base import BaseReader
from typing_extensions import Unpack

from llama_index.core.readers import SimpleDirectoryReader, DirectoryReaderArgs
from llama_index.core.readers.base import BaseReader, BasePydanticReader
from llama_index.core.schema import Document


class MinioReader(BaseReader):
class MinioReader(BaseReader, BasePydanticReader):
"""General reader for any Minio file or directory."""

def __init__(
Expand All @@ -33,7 +35,7 @@ def __init__(
minio_access_key: Optional[str] = None,
minio_secret_key: Optional[str] = None,
minio_session_token: Optional[str] = None,
**kwargs: Any,
**kwargs: Unpack[DirectoryReaderArgs],
) -> None:
"""Initialize Minio bucket and key, along with credentials if needed.
Expand All @@ -45,11 +47,6 @@ def __init__(
this loader will iterate through the entire bucket.
prefix (Optional[str]): the prefix to filter by in the case that the loader
iterates through the entire bucket. Defaults to empty string.
file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
extension to a BaseReader class that specifies how to convert that file
to text. See `SimpleDirectoryReader` for more details.
required_exts (Optional[List[str]]): List of required extensions.
Default is None.
num_files_limit (Optional[int]): Maximum number of files to read.
Default is None.
file_metadata (Optional[Callable[str, Dict]]): A function that takes
Expand All @@ -62,6 +59,7 @@ def __init__(
minio_session_token (Optional[str]): The Minio session token.
minio_secure: MinIO server runs in TLS mode
minio_cert_check: allows the usage of a self-signed cert for MinIO server
**kwargs: Additional arguments to pass to the simple directory reader.
"""
super().__init__(*args, **kwargs)

Expand Down

0 comments on commit 0de0b68

Please sign in to comment.