Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve egnyte connector #3626

Merged
merged 1 commit into from
Jan 8, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 25 additions & 27 deletions backend/onyx/connectors/egnyte/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None
def _get_files_list(
self,
path: str,
) -> list[dict[str, Any]]:
) -> Generator[dict[str, Any], None, None]:
if not self.access_token or not self.domain:
raise ConnectorMissingCredentialError("Egnyte")

Expand All @@ -245,48 +245,46 @@ def _get_files_list(
raise RuntimeError(f"Failed to fetch files from Egnyte: {response.text}")

data = response.json()
all_files: list[dict[str, Any]] = []

# Add files from current directory
all_files.extend(data.get("files", []))
# Yield files from current directory
for file in data.get("files", []):
yield file

# Recursively traverse folders
for item in data.get("folders", []):
all_files.extend(self._get_files_list(item["path"]))
for folder in data.get("folders", []):
yield from self._get_files_list(folder["path"])

return all_files

def _filter_files(
def _should_index_file(
self,
files: list[dict[str, Any]],
file: dict[str, Any],
start_time: datetime | None = None,
end_time: datetime | None = None,
) -> list[dict[str, Any]]:
filtered_files = []
for file in files:
if file["is_folder"]:
continue

file_modified = _parse_last_modified(file["last_modified"])
if start_time and file_modified < start_time:
continue
if end_time and file_modified > end_time:
continue
) -> bool:
"""Return True if file should be included based on filters."""
if file["is_folder"]:
return False

filtered_files.append(file)
file_modified = _parse_last_modified(file["last_modified"])
if start_time and file_modified < start_time:
return False
if end_time and file_modified > end_time:
return False

return filtered_files
return True

def _process_files(
self,
start_time: datetime | None = None,
end_time: datetime | None = None,
) -> Generator[list[Document], None, None]:
files = self._get_files_list(self.folder_path)
files = self._filter_files(files, start_time, end_time)

current_batch: list[Document] = []
for file in files:

# Iterate through yielded files and filter them
for file in self._get_files_list(self.folder_path):
if not self._should_index_file(file, start_time, end_time):
logger.debug(f"Skipping file '{file['path']}'.")
continue

try:
# Set up request with streaming enabled
headers = {
Expand Down
Loading