More explicit Confluence Connector (#2289)

onyx-dot-app · Sep 2, 2024 · c122be2 · c122be2
1 parent f871b4c
commit c122be2
Show file tree

Hide file tree

Showing 6 changed files with 236 additions and 98 deletions.
diff --git a/.github/workflows/pr-python-connector-tests.yml b/.github/workflows/pr-python-connector-tests.yml
@@ -10,6 +10,9 @@ on:
 env:
   # Confluence
   CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
+  CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
+  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
+  CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
   CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
   CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
 

diff --git a/backend/alembic/versions/a3795dce87be_migration_confluence_to_be_explicit.py b/backend/alembic/versions/a3795dce87be_migration_confluence_to_be_explicit.py
@@ -0,0 +1,158 @@
+"""migration confluence to be explicit
+
+Revision ID: a3795dce87be
+Revises: 1f60f60c3401
+Create Date: 2024-09-01 13:52:12.006740
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from sqlalchemy.sql import table, column
+
+# Alembic revision identifiers; the values match the "Revision ID" and
+# "Revises" lines in the module docstring above.
+revision = "a3795dce87be"
+down_revision = "1f60f60c3401"
+branch_labels = None
+depends_on = None
+
+
+def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
+    """Split a legacy Confluence ``wiki_page_url`` into its explicit parts.
+
+    Sample cloud URL:
+        https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview
+    Sample data center URL:
+        https://danswer.ai/confluence/display/1234abcd/pages/5678efgh/overview
+
+    For both samples ``space`` is ``1234abcd`` and ``page_id`` is ``5678efgh``.
+
+    Returns:
+        ``(wiki_base, space, page_id, is_cloud)``. ``page_id`` is ``""`` when
+        the URL does not contain a page segment.
+    """
+    # Local import: only this helper needs urlparse.
+    from urllib.parse import urlparse
+
+    def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
+        # Cloud paths look like /wiki/spaces/<space>[/pages/<page_id>/...],
+        # so after splitting on "/" the space is index 3 and the page id index 5.
+        parsed_url = urlparse(wiki_url)
+        wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split('/spaces')[0]}"
+        path_parts = parsed_url.path.split("/")
+        space = path_parts[3]
+        page_id = path_parts[5] if len(path_parts) > 5 else ""
+        return wiki_base, space, page_id
+
+    def _extract_confluence_keys_from_datacenter_url(
+        wiki_url: str,
+    ) -> tuple[str, str, str]:
+        # /display/ sits right after the wiki base and right before the space.
+        DISPLAY = "/display/"
+        PAGE = "/pages/"
+        parsed_url = urlparse(wiki_url)
+        wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split(DISPLAY)[0]}"
+        space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
+        page_id = ""
+        _, page_marker, after_page = parsed_url.path.partition(PAGE)
+        if page_marker:
+            # Keep only the segment directly after /pages/. Anything further
+            # along the path (e.g. "/overview") is not part of the page id.
+            page_id = after_page.split("/")[0]
+        return wiki_base, space, page_id
+
+    is_confluence_cloud = (
+        ".atlassian.net/wiki/spaces/" in wiki_url
+        or ".jira.com/wiki/spaces/" in wiki_url
+    )
+
+    if is_confluence_cloud:
+        wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(wiki_url)
+    else:
+        wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
+            wiki_url
+        )
+
+    return wiki_base, space, page_id, is_confluence_cloud
+
+
+def reconstruct_confluence_url(
+    wiki_base: str, space: str, page_id: str, is_cloud: bool
+) -> str:
+    """Rebuild the legacy single-string wiki URL from its explicit parts.
+
+    Cloud URLs use ``<wiki_base>/spaces/<space>``; all others use
+    ``<wiki_base>/display/<space>``. A non-empty ``page_id`` is appended as
+    ``/pages/<page_id>``.
+    """
+    space_segment = "spaces" if is_cloud else "display"
+    page_suffix = f"/pages/{page_id}" if page_id else ""
+    return f"{wiki_base}/{space_segment}/{space}{page_suffix}"
+
+
+def upgrade() -> None:
+    """Replace ``wiki_page_url`` with explicit keys on Confluence connectors.
+
+    For every connector row with source ``CONFLUENCE`` and input type ``POLL``,
+    the ``wiki_page_url`` entry of ``connector_specific_config`` is parsed into
+    ``wiki_base``, ``space``, ``page_id`` and ``is_cloud``. All other config
+    keys are carried over unchanged. Rows whose config has no
+    ``wiki_page_url`` are left untouched.
+    """
+    # Lightweight table stub: only the columns this migration reads or writes.
+    connector = table(
+        "connector",
+        column("id", sa.Integer),
+        column("source", sa.String()),
+        column("input_type", sa.String()),
+        column("connector_specific_config", postgresql.JSONB),
+    )
+
+    # Fetch all Confluence connectors
+    connection = op.get_bind()
+    confluence_connectors = connection.execute(
+        sa.select(connector).where(
+            sa.and_(
+                connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
+            )
+        )
+    ).fetchall()
+
+    for row in confluence_connectors:
+        config = row.connector_specific_config
+        # Same kind of guard downgrade() applies: a row without the legacy key
+        # (or with no config at all) has nothing to convert, and must not
+        # abort the whole migration with a KeyError/TypeError.
+        if not config or "wiki_page_url" not in config:
+            continue
+
+        wiki_base, space, page_id, is_cloud = extract_confluence_keys_from_url(
+            config["wiki_page_url"]
+        )
+
+        # Explicit keys first, then every other existing key (existing values
+        # win on a name clash, as before); the legacy key is dropped.
+        new_config = {
+            "wiki_base": wiki_base,
+            "space": space,
+            "page_id": page_id,
+            "is_cloud": is_cloud,
+            **{key: value for key, value in config.items() if key != "wiki_page_url"},
+        }
+
+        op.execute(
+            connector.update()
+            .where(connector.c.id == row.id)
+            .values(connector_specific_config=new_config)
+        )
+
+
+def downgrade() -> None:
+    """Collapse the explicit Confluence keys back into ``wiki_page_url``.
+
+    Only rows with source ``CONFLUENCE`` and input type ``POLL`` whose config
+    contains ``wiki_base``, ``space`` and ``is_cloud`` are rewritten; every
+    other config key is carried over unchanged.
+    """
+    connector = table(
+        "connector",
+        column("id", sa.Integer),
+        column("source", sa.String()),
+        column("input_type", sa.String()),
+        column("connector_specific_config", postgresql.JSONB),
+    )
+
+    required_keys = ("wiki_base", "space", "is_cloud")
+    explicit_keys = ("wiki_base", "space", "page_id", "is_cloud")
+
+    select_confluence = sa.select(connector).where(
+        connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
+    )
+    rows = op.get_bind().execute(select_confluence).fetchall()
+
+    for row in rows:
+        config = row.connector_specific_config
+        # Rows that were never converted to the explicit format are skipped.
+        if any(key not in config for key in required_keys):
+            continue
+
+        legacy_url = reconstruct_confluence_url(
+            config["wiki_base"],
+            config["space"],
+            config.get("page_id", ""),
+            config["is_cloud"],
+        )
+        carried_over = {k: v for k, v in config.items() if k not in explicit_keys}
+        new_config = {"wiki_page_url": legacy_url, **carried_over}
+
+        op.execute(
+            connector.update()
+            .where(connector.c.id == row.id)
+            .values(connector_specific_config=new_config)
+        )
diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
@@ -7,7 +7,6 @@
 from functools import lru_cache
 from typing import Any
 from typing import cast
-from urllib.parse import urlparse
 
 import bs4
 from atlassian import Confluence  # type:ignore
@@ -53,79 +52,6 @@
 )
 
 
-def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
-    """Sample
-    URL w/ page: https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview
-    URL w/o page: https://danswer.atlassian.net/wiki/spaces/ASAM/overview
-
-    wiki_base is https://danswer.atlassian.net/wiki
-    space is 1234abcd
-    page_id is 5678efgh
-    """
-    parsed_url = urlparse(wiki_url)
-    wiki_base = (
-        parsed_url.scheme
-        + "://"
-        + parsed_url.netloc
-        + parsed_url.path.split("/spaces")[0]
-    )
-
-    path_parts = parsed_url.path.split("/")
-    space = path_parts[3]
-
-    page_id = path_parts[5] if len(path_parts) > 5 else ""
-    return wiki_base, space, page_id
-
-
-def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str, str]:
-    """Sample
-    URL w/ page https://danswer.ai/confluence/display/1234abcd/pages/5678efgh/overview
-    URL w/o page https://danswer.ai/confluence/display/1234abcd/overview
-    wiki_base is https://danswer.ai/confluence
-    space is 1234abcd
-    page_id is 5678efgh
-    """
-    # /display/ is always right before the space and at the end of the base print()
-    DISPLAY = "/display/"
-    PAGE = "/pages/"
-
-    parsed_url = urlparse(wiki_url)
-    wiki_base = (
-        parsed_url.scheme
-        + "://"
-        + parsed_url.netloc
-        + parsed_url.path.split(DISPLAY)[0]
-    )
-    space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
-    page_id = ""
-    if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
-        page_id = content[1]
-    return wiki_base, space, page_id
-
-
-def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
-    is_confluence_cloud = (
-        ".atlassian.net/wiki/spaces/" in wiki_url
-        or ".jira.com/wiki/spaces/" in wiki_url
-    )
-
-    try:
-        if is_confluence_cloud:
-            wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(
-                wiki_url
-            )
-        else:
-            wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
-                wiki_url
-            )
-    except Exception as e:
-        error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base, space, and page id. Exception: {e}"
-        logger.error(error_msg)
-        raise ValueError(error_msg)
-
-    return wiki_base, space, page_id, is_confluence_cloud
-
-
 @lru_cache()
 def _get_user(user_id: str, confluence_client: Confluence) -> str:
     """Get Confluence Display Name based on the account-id or userkey value
@@ -372,7 +298,10 @@ def _fetch_single_depth_child_pages(
 class ConfluenceConnector(LoadConnector, PollConnector):
     def __init__(
         self,
-        wiki_page_url: str,
+        wiki_base: str,
+        space: str,
+        is_cloud: bool,
+        page_id: str = "",
         index_recursively: bool = True,
         batch_size: int = INDEX_BATCH_SIZE,
         continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
@@ -386,15 +315,15 @@ def __init__(
         self.labels_to_skip = set(labels_to_skip)
         self.recursive_indexer: RecursiveIndexer | None = None
         self.index_recursively = index_recursively
-        (
-            self.wiki_base,
-            self.space,
-            self.page_id,
-            self.is_cloud,
-        ) = extract_confluence_keys_from_url(wiki_page_url)
 
-        self.space_level_scan = False
+        # Remove trailing slash from wiki_base if present
+        self.wiki_base = wiki_base.rstrip("/")
+        self.space = space
+        self.page_id = page_id
 
+        self.is_cloud = is_cloud
+
+        self.space_level_scan = False
         self.confluence_client: Confluence | None = None
 
         if self.page_id is None or self.page_id == "":
@@ -414,7 +343,6 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None
             username=username if self.is_cloud else None,
             password=access_token if self.is_cloud else None,
             token=access_token if not self.is_cloud else None,
-            cloud=self.is_cloud,
         )
         return None
 
@@ -866,7 +794,13 @@ def poll_source(
 
 
 if __name__ == "__main__":
-    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
+    connector = ConfluenceConnector(
+        wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
+        space=os.environ["CONFLUENCE_TEST_SPACE"],
+        is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
+        page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
+        index_recursively=True,
+    )
     connector.load_credentials(
         {
             "confluence_username": os.environ["CONFLUENCE_USER_NAME"],

diff --git a/backend/tests/daily/connectors/confluence/test_confluence_basic.py b/backend/tests/daily/connectors/confluence/test_confluence_basic.py
@@ -8,7 +8,13 @@
 
 @pytest.fixture
 def confluence_connector() -> ConfluenceConnector:
-    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
+    connector = ConfluenceConnector(
+        wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
+        space=os.environ["CONFLUENCE_TEST_SPACE"],
+        is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
+        page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
+    )
+
     connector.load_credentials(
         {
             "confluence_username": os.environ["CONFLUENCE_USER_NAME"],

diff --git a/web/src/components/admin/connectors/ConnectorTitle.tsx b/web/src/components/admin/connectors/ConnectorTitle.tsx
@@ -48,10 +48,16 @@ export const ConnectorTitle = ({
     );
   } else if (connector.source === "confluence") {
     const typedConnector = connector as Connector<ConfluenceConfig>;
-    additionalMetadata.set(
-      "Wiki URL",
-      typedConnector.connector_specific_config.wiki_page_url
-    );
+    const wikiUrl = typedConnector.connector_specific_config.is_cloud
+      ? `${typedConnector.connector_specific_config.wiki_base}/wiki/spaces/${typedConnector.connector_specific_config.space}`
+      : `${typedConnector.connector_specific_config.wiki_base}/spaces/${typedConnector.connector_specific_config.space}`;
+    additionalMetadata.set("Wiki URL", wikiUrl);
+    if (typedConnector.connector_specific_config.page_id) {
+      additionalMetadata.set(
+        "Page ID",
+        typedConnector.connector_specific_config.page_id
+      );
+    }
   } else if (connector.source === "jira") {
     const typedConnector = connector as Connector<JiraConfig>;
     additionalMetadata.set(