diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py
index d237cd9ddd306..e7e9bfd43adca 100644
--- a/metadata-ingestion/src/datahub/configuration/git.py
+++ b/metadata-ingestion/src/datahub/configuration/git.py
@@ -24,7 +24,11 @@ class GitReference(ConfigModel):
         "main",
         description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
     )
-
+    url_subdir: Optional[str] = Field(
+        default=None,
+        description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. "
+        "Only affects URL generation, not git operations.",
+    )
     url_template: Optional[str] = Field(
         None,
         description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required."
@@ -68,6 +72,23 @@ def infer_url_template(cls, url_template: Optional[str], values: dict) -> str:
 
     def get_url_for_file_path(self, file_path: str) -> str:
+        """Build the web URL for ``file_path`` on the configured branch.
+
+        When ``url_subdir`` is set it is prepended to ``file_path``. Leading
+        and trailing slashes on ``url_subdir`` are ignored so that values
+        such as ``"dbt/"`` do not produce ``//`` in the generated URL.
+
+        Args:
+            file_path: Path of the file, relative to ``url_subdir`` when one
+                is configured.
+
+        Returns:
+            ``url_template`` formatted with the repo, branch and file path.
+        """
         assert self.url_template
+        # Normalize first: a value like "/" or "dbt/" must not yield "//".
+        subdir = (self.url_subdir or "").strip("/")
+        if subdir:
+            file_path = f"{subdir}/{file_path}"
         return self.url_template.format(
             repo_url=self.repo, branch=self.branch, file_path=file_path
         )
diff --git a/metadata-ingestion/tests/integration/git/test_git_clone.py b/metadata-ingestion/tests/integration/git/test_git_clone.py
index 60cf20fefcbdd..01e075930998a 100644
--- a/metadata-ingestion/tests/integration/git/test_git_clone.py
+++ b/metadata-ingestion/tests/integration/git/test_git_clone.py
@@ -1,4 +1,5 @@
 import os
+import pathlib
 
 import pytest
 from pydantic import SecretStr
@@ -12,7 +13,7 @@
 LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")
 
 
-def test_base_url_guessing():
+def test_base_url_guessing() -> None:
     # Basic GitHub repo.
     config = GitInfo(repo="https://github.com/datahub-project/datahub", branch="master")
     assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"
@@ -70,7 +71,7 @@ def test_base_url_guessing():
     )
 
 
-def test_github_branch():
+def test_github_branch() -> None:
     config = GitInfo(
         repo="owner/repo",
     )
@@ -83,11 +84,44 @@ def test_github_branch():
     assert config.branch_for_clone == "main"
 
 
+def test_url_subdir() -> None:
+    git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt")
+    assert (
+        git_ref.get_url_for_file_path("model.sql")
+        == "https://github.com/org/repo/blob/main/dbt/model.sql"
+    )
+
+    git_ref = GitReference(repo="https://gitlab.com/org/repo", url_subdir="dbt")
+    assert (
+        git_ref.get_url_for_file_path("model.sql")
+        == "https://gitlab.com/org/repo/-/blob/main/dbt/model.sql"
+    )
+
+    git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="")
+    assert (
+        git_ref.get_url_for_file_path("model.sql")
+        == "https://github.com/org/repo/blob/main/model.sql"
+    )
+
+    git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="dbt/models")
+    assert (
+        git_ref.get_url_for_file_path("model.sql")
+        == "https://github.com/org/repo/blob/main/dbt/models/model.sql"
+    )
+
+    # Leading/trailing slashes in url_subdir must not produce "//" in the URL.
+    git_ref = GitReference(repo="https://github.com/org/repo", url_subdir="/dbt/")
+    assert (
+        git_ref.get_url_for_file_path("model.sql")
+        == "https://github.com/org/repo/blob/main/dbt/model.sql"
+    )
+
+
 def test_sanitize_repo_url() -> None:
     assert_doctest(datahub.ingestion.source.git.git_import)
 
 
-def test_git_clone_public(tmp_path):
+def test_git_clone_public(tmp_path: pathlib.Path) -> None:
     git_clone = GitClone(str(tmp_path))
     checkout_dir = git_clone.clone(
         ssh_key=None,
@@ -107,6 +141,6 @@ def test_git_clone_public(tmp_path):
     LOOKML_TEST_SSH_KEY is None,
     reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured",
 )
-def test_git_clone_private(tmp_path):
+def test_git_clone_private(tmp_path: pathlib.Path) -> None:
     git_clone = GitClone(str(tmp_path))
     secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None