Skip to content

Commit

Permalink
Add title retrieval to Notion crawler (#765)
Browse files Browse the repository at this point in the history
* Removed redundant sanitization

* Added title retrieval, updated tests accordingly

* Modified map_metadata to include text in embeddedString, updated tests accordingly

* Stopped adding title to document text in extractor manually, updated tests so that map_metadata handles the title injection

* Updated _get_title to minimize API requests

* bumped version

* removed redundant title assignment in test

* Fix mypy issues

* make title blank if error

* fixed _get_title URL construction

* dropped explicit endpoint

* Added test for get_title, bumped version

* Updated tests to check exceptions, added exits to extractor on significant exceptions

* HTTPError adjustment

* Added _description and _platform attributes

* Bumped version / updated lockfile
  • Loading branch information
rishimo authored Jan 31, 2024
1 parent ca4386a commit 568318a
Show file tree
Hide file tree
Showing 9 changed files with 588 additions and 531 deletions.
6 changes: 3 additions & 3 deletions metaphor/common/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,9 @@ def map_metadata(
}

if include_text:
embedding_dict[nodeid]["embeddedString_1"] = sanitize_text(
doc_store[nodeid]["__data__"]["text"]
)
chunk_text = doc_store[nodeid]["__data__"]["text"]
title = metadata_dict[nodeid]["title"]
embedding_dict[nodeid]["embeddedString_1"] = f"Title: {title}\n{chunk_text}"

out.append({"externalSearchDocument": embedding_dict[nodeid]})

Expand Down
3 changes: 2 additions & 1 deletion metaphor/notion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ Create a YAML config file based on the following template.
notion_api_token: <notion_api_token>

azure_openAI_key: <azure_openAI_key>

azure_openAI_endpoint: <azure_openAI_endpoint>
```
### Optional Configurations
Expand All @@ -30,7 +32,6 @@ These defaults are provided; you don't have to manually configure them.

```yaml
azure_openAI_version: <azure_openAI_version> # "2023-12-01-preview"
azure_openAI_endpoint: <azure_openAI_endpoint> # "https://metaphorazureopenairesource.openai.azure.com/"
azure_openAI_model_name: <azure_openAI_model_name> # "Embedding_ada002"
azure_openAI_model: <azure_openAI_model> # "text-embedding-ada-002"
Expand Down
2 changes: 1 addition & 1 deletion metaphor/notion/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ class NotionRunConfig(BaseConfig):

# Azure OpenAI services configs
azure_openAI_key: str
azure_openAI_endpoint: str

# Default Azure OpenAI services configs
azure_openAI_version: str = "2023-12-01-preview"
azure_openAI_endpoint: str = "https://metaphorazureopenairesource.openai.azure.com/"
azure_openAI_model: str = "text-embedding-ada-002"
azure_openAI_model_name: str = "Embedding_ada002"

Expand Down
37 changes: 34 additions & 3 deletions metaphor/notion/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from metaphor.common.base_extractor import BaseExtractor
from metaphor.common.embeddings import embed_documents, map_metadata, sanitize_text
from metaphor.common.logger import get_logger
from metaphor.models.crawler_run_metadata import Platform
from metaphor.notion.config import NotionRunConfig

logger = get_logger()
Expand All @@ -24,6 +25,9 @@
class NotionExtractor(BaseExtractor):
"""Notion Document extractor."""

_description = "Notion document crawler"
_platform = Platform.UNKNOWN

@staticmethod
def from_config_file(config_file: str) -> "NotionExtractor":
    """Create a NotionExtractor from the YAML config file at ``config_file``."""
    # Parse the YAML file into a run config first, then hand it to the
    # extractor constructor.
    run_config = NotionRunConfig.from_yaml_file(config_file)
    return NotionExtractor(run_config)
Expand Down Expand Up @@ -71,6 +75,28 @@ async def extract(self) -> Collection[dict]:
# Each document dict has nodeId, embedding, lastRefreshed, metadata
return embedded_nodes

def _get_title(self, page: str) -> str:
    """
    Fetch the title of a Notion page via the page-property endpoint.

    Title retrieval is best-effort: any failure (HTTP error status,
    timeout, connection problem, or an unexpected response shape) is
    logged and an empty string is returned, so a single bad page cannot
    abort the whole crawl.

    :param page: ID of the Notion page whose title is requested.
    :return: The page title's plain text, or "" if it could not be obtained.
    """
    headers = {
        "Authorization": f"Bearer {self.notion_api_token}",
        "Notion-Version": f"{self.notion_api_version}",
    }

    try:
        r = requests.get(
            f"{baseURL}/pages/{page}/properties/title", headers=headers, timeout=5
        )
        r.raise_for_status()

        # Extract title
        title = r.json()["results"][0]["title"]["plain_text"]

    except (
        HTTPError,
        # Timeouts / connection errors are RequestExceptions but not HTTPErrors;
        # with timeout=5 set above they are a realistic failure mode.
        requests.RequestException,
        KeyError,  # expected key missing from the payload
        IndexError,  # "results" is an empty list (page has no title item)
        TypeError,  # payload element is not the expected dict/list
        ValueError,  # response body is not valid JSON
    ) as error:
        traceback.print_exc()
        logger.warning(f"Failed to get title for page {page}, err: {error}")
        title = ""

    return title

def _get_databases(self) -> None:
"""
Returns a list of database IDs.
Expand All @@ -95,6 +121,7 @@ def _get_databases(self) -> None:
except HTTPError as error:
traceback.print_exc()
logger.error(f"Failed to get Notion database IDs, error {error}")
raise error

# Load JSON response
dbs = json.loads(r.content)["results"]
Expand Down Expand Up @@ -134,16 +161,20 @@ def _get_all_documents(self) -> Sequence[Document]:

# Update queried document metadata with db_id, platform info, link
for q in queried:
# Reset page-id, remove hyphens
q.metadata["pageId"] = q.metadata.pop("page_id").replace("-", "")

# Add title to metadata
title = self._get_title(q.metadata["pageId"])
q.metadata["title"] = title

# Clean the document text
q.text = sanitize_text(q.text)

# Update db_id and platform
q.metadata["dbId"] = db_id.replace("-", "") # remove hyphens
q.metadata["platform"] = "notion"

# Reset page-id, remove hyphens
q.metadata["pageId"] = q.metadata.pop("page_id").replace("-", "")

# Construct link
link = f'https://notion.so/{q.metadata["pageId"]}'
q.metadata["link"] = link
Expand Down
Loading

0 comments on commit 568318a

Please sign in to comment.