forked from datahub-project/datahub
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix(ingest): support git clone of non-github repos (datahub-project#7065)
- Loading branch information
Showing
3 changed files
with
164 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,35 +3,78 @@ | |
|
||
from pydantic import Field, FilePath, SecretStr, validator | ||
|
||
from datahub.configuration.common import ConfigModel, ConfigurationError | ||
from datahub.configuration.common import ConfigModel | ||
from datahub.configuration.validate_field_rename import pydantic_renamed_field | ||
|
||
_GITHUB_PREFIX = "https://github.com/" | ||
_GITLAB_PREFIX = "https://gitlab.com/" | ||
|
||
_GITHUB_URL_TEMPLATE = "{repo_url}/blob/{branch}/{file_path}" | ||
_GITLAB_URL_TEMPLATE = "{repo_url}/-/blob/{branch}/{file_path}" | ||
|
||
|
||
class GitHubReference(ConfigModel): | ||
"""Reference to a hosted Git repository. Used to generate "view source" links.""" | ||
|
||
repo: str = Field( | ||
description="Name of your github repository in org/repo format. e.g. repo for https://github.com/datahub-project/datahub is `datahub-project/datahub`." | ||
description="Name of your Git repo e.g. https://github.com/datahub-project/datahub or https://gitlab.com/gitlab-org/gitlab. If organization/repo is provided, we assume it is a GitHub repo." | ||
) | ||
branch: str = Field( | ||
"main", | ||
description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.", | ||
) | ||
base_url: str = Field( | ||
"https://github.com", | ||
description="Base url for Github. Used to construct clickable links on the UI.", | ||
|
||
# Template used to build "view source" links. Only the first literal is an
# f-string (it interpolates the GitHub template as an example); the second
# literal is a plain string, so its {repo_url}/{branch}/{file_path} braces
# are shown to the user verbatim.
url_template: Optional[str] = Field(
    None,
    description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required. "
    "It supports the following variables: {repo_url}, {branch}, {file_path}",
)
|
||
@validator("repo") | ||
def repo_should_be_org_slash_repo(cls, repo: str) -> str: | ||
if "/" not in repo or len(repo.split("/")) != 2: | ||
raise ConfigurationError( | ||
"github repo should be in organization/repo form e.g. datahub-project/datahub" | ||
) | ||
_deprecated_base_url = pydantic_renamed_field( | ||
"base_url", | ||
"url_template", | ||
transform=lambda url: _GITHUB_URL_TEMPLATE, | ||
) | ||
|
||
@validator("repo", pre=True)
def simplify_repo_url(cls, repo: str) -> str:
    """Normalize the configured repo into a full URL without a trailing slash.

    - ``github.com/org/repo`` and ``gitlab.com/org/repo`` get an ``https://``
      scheme prepended.
    - A bare ``org/repo`` (exactly one slash) is assumed to be a GitHub repo.
    - Anything else is returned unchanged (minus one trailing slash).
    """
    # This is a pre-validator, so the raw input may not be a string yet.
    # Hand anything else back untouched and let the field's own type
    # validation deal with it instead of failing on a str method call.
    if not isinstance(repo, str):
        return repo

    # Strip the trailing slash first so that "org/repo/" is still recognized
    # as the one-slash GitHub shorthand below.
    if repo.endswith("/"):
        repo = repo[:-1]

    # Both host checks include the trailing slash so that only the exact
    # hosts match, not look-alikes that merely start with the same text.
    if repo.startswith(("github.com/", "gitlab.com/")):
        repo = f"https://{repo}"
    elif repo.count("/") == 1:
        repo = f"https://github.com/{repo}"

    return repo
|
||
@validator("url_template", always=True)
def infer_url_template(cls, url_template: Optional[str], values: dict) -> str:
    """Return the explicit ``url_template`` or infer one from ``repo``.

    An explicitly configured template always wins. Otherwise the template is
    chosen from the repo URL prefix (GitHub or GitLab).

    Raises:
        ValueError: if no template was given and none can be inferred.
    """
    if url_template is not None:
        return url_template

    # `values` only holds fields that validated successfully. Use .get() so a
    # missing/invalid repo is reported as a validation error rather than
    # escaping as a bare KeyError.
    repo: Optional[str] = values.get("repo")
    if repo is None:
        raise ValueError(
            "Unable to infer URL template because repo is missing or invalid. Please set url_template manually."
        )

    if repo.startswith(_GITHUB_PREFIX):
        return _GITHUB_URL_TEMPLATE
    if repo.startswith(_GITLAB_PREFIX):
        return _GITLAB_URL_TEMPLATE

    raise ValueError(
        "Unable to infer URL template from repo. Please set url_template manually."
    )
|
||
def get_url_for_file_path(self, file_path: str) -> str: | ||
return f"{self.base_url}/{self.repo}/blob/{self.branch}/{file_path}" | ||
assert self.url_template | ||
return self.url_template.format( | ||
repo_url=self.repo, branch=self.branch, file_path=file_path | ||
) | ||
|
||
|
||
class GitHubInfo(GitHubReference): | ||
"""A reference to a Git repository, including a deploy key that can be used to clone it.""" | ||
|
||
deploy_key_file: Optional[FilePath] = Field( | ||
None, | ||
description="A private key file that contains an ssh key that has been configured as a deploy key for this repository. Use a file where possible, else see deploy_key for a config field that accepts a raw string.", | ||
|
@@ -43,7 +86,7 @@ class GitHubInfo(GitHubReference): | |
|
||
repo_ssh_locator: Optional[str] = Field( | ||
None, | ||
description="Auto-inferred from repo as git@github.com:{repo}, but you can override this if needed.", | ||
description="The url to call `git clone` on. We infer this for github and gitlab repos, but it is required for other hosts.", | ||
) | ||
|
||
@validator("deploy_key_file") | ||
|
@@ -69,10 +112,21 @@ def deploy_key_filled_from_deploy_key_file( | |
return v | ||
|
||
@validator("repo_ssh_locator", always=True) | ||
def auto_infer_from_repo(cls, v: Optional[str], values: Dict[str, Any]) -> str: | ||
if v is None: | ||
return f"git@github.com:{values.get('repo')}" | ||
return v | ||
def infer_repo_ssh_locator(
    cls, repo_ssh_locator: Optional[str], values: dict
) -> str:
    """Return the explicit ``repo_ssh_locator`` or infer it from ``repo``.

    For GitHub and GitLab repo URLs the locator is built as
    ``git@<host>:<path>.git``, where ``<path>`` is the repo URL with the
    host prefix removed. For any other host it must be set explicitly.

    Raises:
        ValueError: if no locator was given and none can be inferred.
    """
    if repo_ssh_locator is not None:
        return repo_ssh_locator

    # `values` only holds fields that validated successfully. Use .get() so a
    # missing/invalid repo is reported as a validation error rather than
    # escaping as a bare KeyError.
    repo: Optional[str] = values.get("repo")
    if repo is None:
        raise ValueError(
            "Unable to infer repo_ssh_locator because repo is missing or invalid. Please set repo_ssh_locator manually."
        )

    if repo.startswith(_GITHUB_PREFIX):
        return f"git@github.com:{repo[len(_GITHUB_PREFIX):]}.git"
    if repo.startswith(_GITLAB_PREFIX):
        return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX):]}.git"

    raise ValueError(
        "Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually."
    )
|
||
@property | ||
def branch_for_clone(self) -> Optional[str]: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,17 +3,106 @@ | |
import pytest | ||
from pydantic import SecretStr | ||
|
||
from datahub.configuration.github import GitHubInfo | ||
from datahub.configuration.common import ConfigurationWarning | ||
from datahub.configuration.github import GitHubInfo, GitHubReference | ||
from datahub.ingestion.source.git.git_import import GitClone | ||
|
||
LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY") | ||
|
||
|
||
def test_base_url_guessing():
    """Check URL-template and ssh-locator inference for several repo forms."""
    # Basic GitHub repo.
    config = GitHubInfo(
        repo="https://github.com/datahub-project/datahub", branch="master"
    )
    assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"

    # Defaults to GitHub.
    config = GitHubInfo(repo="datahub-project/datahub", branch="master")
    assert (
        config.get_url_for_file_path("docker/README.md")
        == "https://github.com/datahub-project/datahub/blob/master/docker/README.md"
    )
    assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"

    # GitLab repo (notice the trailing slash).
    config_ref = GitHubReference(
        repo="https://gitlab.com/gitlab-tests/sample-project/", branch="master"
    )
    assert (
        config_ref.get_url_for_file_path("hello_world.md")
        == "https://gitlab.com/gitlab-tests/sample-project/-/blob/master/hello_world.md"
    )

    # Three-tier GitLab repo.
    config = GitHubInfo(
        repo="https://gitlab.com/gitlab-com/gl-infra/reliability", branch="master"
    )
    assert (
        config.get_url_for_file_path("onboarding/gitlab.nix")
        == "https://gitlab.com/gitlab-com/gl-infra/reliability/-/blob/master/onboarding/gitlab.nix"
    )
    assert (
        config.repo_ssh_locator == "git@gitlab.com:gitlab-com/gl-infra/reliability.git"
    )

    # Overrides. These comparisons must be asserted; as bare expressions
    # they would be evaluated and discarded without checking anything.
    config = GitHubInfo(
        repo="https://gitea.com/gitea/tea",
        branch="main",
        url_template="https://gitea.com/gitea/tea/src/branch/{branch}/{file_path}",
        repo_ssh_locator="https://gitea.com/gitea/tea.git",
    )
    assert (
        config.get_url_for_file_path("cmd/admin.go")
        == "https://gitea.com/gitea/tea/src/branch/main/cmd/admin.go"
    )
    assert config.repo_ssh_locator == "https://gitea.com/gitea/tea.git"

    # Deprecated: base_url.
    with pytest.warns(ConfigurationWarning, match="base_url is deprecated"):
        config = GitHubInfo.parse_obj(
            dict(
                repo="https://github.com/datahub-project/datahub",
                branch="master",
                base_url="http://mygithubmirror.local",
            )
        )
|
||
|
||
def test_github_branch():
    """`branch_for_clone` is None unless a branch is passed explicitly."""
    # No branch given in the config.
    implicit_branch = GitHubInfo(repo="owner/repo")
    assert implicit_branch.branch_for_clone is None

    # Branch given explicitly.
    explicit_branch = GitHubInfo(repo="owner/repo", branch="main")
    assert explicit_branch.branch_for_clone == "main"
|
||
|
||
def test_git_clone_public(tmp_path):
    """Clone a public GitLab repo without an ssh key and check its contents."""
    expected_entries = {
        ".git",
        "README.md",
        "hello_world.md",
        "fork-sample-project.png",
    }

    cloner = GitClone(str(tmp_path))
    # The `branch` argument here is a full commit hash rather than a branch name.
    checkout_dir = cloner.clone(
        ssh_key=None,
        repo_url="https://gitlab.com/gitlab-tests/sample-project",
        branch="90c439634077a85bcf42d38c2c79cd94664a94ad",
    )

    assert checkout_dir.exists()
    actual_entries = set(os.listdir(checkout_dir))
    assert actual_entries == expected_entries
|
||
|
||
@pytest.mark.skipif( | ||
LOOKML_TEST_SSH_KEY is None, | ||
reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured", | ||
) | ||
def test_git_clone(tmp_path): | ||
def test_git_clone_private(tmp_path): | ||
git_clone = GitClone(str(tmp_path)) | ||
secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None | ||
|
||
|
@@ -22,7 +111,7 @@ def test_git_clone(tmp_path): | |
repo_url="git@github.com:acryldata/long-tail-companions-looker", | ||
branch="d380a2b777ec6f4653626f39c68dba85893faa74", | ||
) | ||
assert os.path.exists(checkout_dir) | ||
assert checkout_dir.exists() | ||
assert set(os.listdir(checkout_dir)) == set( | ||
[ | ||
".datahub", | ||
|
@@ -35,16 +124,3 @@ def test_git_clone(tmp_path): | |
"manifest.lkml", | ||
] | ||
) | ||
|
||
|
||
def test_github_branch(): | ||
config = GitHubInfo( | ||
repo="owner/repo", | ||
) | ||
assert config.branch_for_clone is None | ||
|
||
config = GitHubInfo( | ||
repo="owner/repo", | ||
branch="main", | ||
) | ||
assert config.branch_for_clone == "main" |