-
Notifications
You must be signed in to change notification settings - Fork 3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(ingest): support git clone of non-github repos #7065
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,35 +3,78 @@ | |
|
||
from pydantic import Field, FilePath, SecretStr, validator | ||
|
||
from datahub.configuration.common import ConfigModel, ConfigurationError | ||
from datahub.configuration.common import ConfigModel | ||
from datahub.configuration.validate_field_rename import pydantic_renamed_field | ||
|
||
_GITHUB_PREFIX = "https://github.com/" | ||
_GITLAB_PREFIX = "https://gitlab.com/" | ||
|
||
_GITHUB_URL_TEMPLATE = "{repo_url}/blob/{branch}/{file_path}" | ||
_GITLAB_URL_TEMPLATE = "{repo_url}/-/blob/{branch}/{file_path}" | ||
|
||
|
||
class GitHubReference(ConfigModel): | ||
"""Reference to a hosted Git repository. Used to generate "view source" links.""" | ||
|
||
repo: str = Field( | ||
description="Name of your github repository in org/repo format. e.g. repo for https://github.com/datahub-project/datahub is `datahub-project/datahub`." | ||
description="Name of your Git repo e.g. https://github.com/datahub-project/datahub or https://gitlab.com/gitlab-org/gitlab. If organization/repo is provided, we assume it is a GitHub repo." | ||
) | ||
branch: str = Field( | ||
"main", | ||
description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.", | ||
) | ||
base_url: str = Field( | ||
"https://github.com", | ||
description="Base url for Github. Used to construct clickable links on the UI.", | ||
|
||
url_template: Optional[str] = Field( | ||
None, | ||
description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required." | ||
"It supports the following variables: {repo_url}, {branch}, {file_path}", | ||
) | ||
|
||
@validator("repo") | ||
def repo_should_be_org_slash_repo(cls, repo: str) -> str: | ||
if "/" not in repo or len(repo.split("/")) != 2: | ||
raise ConfigurationError( | ||
"github repo should be in organization/repo form e.g. datahub-project/datahub" | ||
) | ||
_deprecated_base_url = pydantic_renamed_field( | ||
"base_url", | ||
"url_template", | ||
transform=lambda url: _GITHUB_URL_TEMPLATE, | ||
) | ||
|
||
@validator("repo", pre=True) | ||
def simplify_repo_url(cls, repo: str) -> str: | ||
if repo.startswith("github.com/"): | ||
repo = f"https://{repo}" | ||
elif repo.startswith("gitlab.com"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we want any trailing slash like github case |
||
repo = f"https://{repo}" | ||
elif repo.count("/") == 1: | ||
repo = f"https://github.com/{repo}" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. defaults to github? |
||
|
||
if repo.endswith("/"): | ||
repo = repo[:-1] | ||
|
||
return repo | ||
|
||
@validator("url_template", always=True) | ||
def infer_url_template(cls, url_template: Optional[str], values: dict) -> str: | ||
if url_template is not None: | ||
return url_template | ||
|
||
repo: str = values["repo"] | ||
if repo.startswith(_GITHUB_PREFIX): | ||
return _GITHUB_URL_TEMPLATE | ||
elif repo.startswith(_GITLAB_PREFIX): | ||
return _GITLAB_URL_TEMPLATE | ||
else: | ||
raise ValueError( | ||
"Unable to infer URL template from repo. Please set url_template manually." | ||
) | ||
|
||
def get_url_for_file_path(self, file_path: str) -> str: | ||
return f"{self.base_url}/{self.repo}/blob/{self.branch}/{file_path}" | ||
assert self.url_template | ||
return self.url_template.format( | ||
repo_url=self.repo, branch=self.branch, file_path=file_path | ||
) | ||
|
||
|
||
class GitHubInfo(GitHubReference): | ||
"""A reference to a Git repository, including a deploy key that can be used to clone it.""" | ||
|
||
deploy_key_file: Optional[FilePath] = Field( | ||
None, | ||
description="A private key file that contains an ssh key that has been configured as a deploy key for this repository. Use a file where possible, else see deploy_key for a config field that accepts a raw string.", | ||
|
@@ -43,7 +86,7 @@ class GitHubInfo(GitHubReference): | |
|
||
repo_ssh_locator: Optional[str] = Field( | ||
None, | ||
description="Auto-inferred from repo as git@github.com:{repo}, but you can override this if needed.", | ||
description="The url to call `git clone` on. We infer this for github and gitlab repos, but it is required for other hosts.", | ||
) | ||
|
||
@validator("deploy_key_file") | ||
|
@@ -69,10 +112,21 @@ def deploy_key_filled_from_deploy_key_file( | |
return v | ||
|
||
@validator("repo_ssh_locator", always=True) | ||
def auto_infer_from_repo(cls, v: Optional[str], values: Dict[str, Any]) -> str: | ||
if v is None: | ||
return f"[email protected]:{values.get('repo')}" | ||
return v | ||
def infer_repo_ssh_locator( | ||
cls, repo_ssh_locator: Optional[str], values: dict | ||
) -> str: | ||
if repo_ssh_locator is not None: | ||
return repo_ssh_locator | ||
|
||
repo: str = values["repo"] | ||
if repo.startswith(_GITHUB_PREFIX): | ||
return f"[email protected]:{repo[len(_GITHUB_PREFIX):]}.git" | ||
elif repo.startswith(_GITLAB_PREFIX): | ||
return f"[email protected]:{repo[len(_GITLAB_PREFIX):]}.git" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice - this is really awesome!! I was hoping we'd be able to generalize this |
||
else: | ||
raise ValueError( | ||
"Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually." | ||
) | ||
|
||
@property | ||
def branch_for_clone(self) -> Optional[str]: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,17 +3,106 @@ | |
import pytest | ||
from pydantic import SecretStr | ||
|
||
from datahub.configuration.github import GitHubInfo | ||
from datahub.configuration.common import ConfigurationWarning | ||
from datahub.configuration.github import GitHubInfo, GitHubReference | ||
from datahub.ingestion.source.git.git_import import GitClone | ||
|
||
LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY") | ||
|
||
|
||
def test_base_url_guessing(): | ||
# Basic GitHub repo. | ||
config = GitHubInfo( | ||
repo="https://github.com/datahub-project/datahub", branch="master" | ||
) | ||
assert config.repo_ssh_locator == "[email protected]:datahub-project/datahub.git" | ||
|
||
# Defaults to GitHub. | ||
config = GitHubInfo(repo="datahub-project/datahub", branch="master") | ||
assert ( | ||
config.get_url_for_file_path("docker/README.md") | ||
== "https://github.com/datahub-project/datahub/blob/master/docker/README.md" | ||
) | ||
assert config.repo_ssh_locator == "[email protected]:datahub-project/datahub.git" | ||
|
||
# GitLab repo (notice the trailing slash). | ||
config_ref = GitHubReference( | ||
repo="https://gitlab.com/gitlab-tests/sample-project/", branch="master" | ||
) | ||
assert ( | ||
config_ref.get_url_for_file_path("hello_world.md") | ||
== "https://gitlab.com/gitlab-tests/sample-project/-/blob/master/hello_world.md" | ||
) | ||
|
||
# Three-tier GitLab repo. | ||
config = GitHubInfo( | ||
repo="https://gitlab.com/gitlab-com/gl-infra/reliability", branch="master" | ||
) | ||
assert ( | ||
config.get_url_for_file_path("onboarding/gitlab.nix") | ||
== "https://gitlab.com/gitlab-com/gl-infra/reliability/-/blob/master/onboarding/gitlab.nix" | ||
) | ||
assert ( | ||
config.repo_ssh_locator == "[email protected]:gitlab-com/gl-infra/reliability.git" | ||
) | ||
|
||
# Overrides. | ||
config = GitHubInfo( | ||
repo="https://gitea.com/gitea/tea", | ||
branch="main", | ||
url_template="https://gitea.com/gitea/tea/src/branch/{branch}/{file_path}", | ||
repo_ssh_locator="https://gitea.com/gitea/tea.git", | ||
) | ||
config.get_url_for_file_path( | ||
"cmd/admin.go" | ||
) == "https://gitea.com/gitea/tea/src/branch/main/cmd/admin.go" | ||
config.repo_ssh_locator == "https://gitea.com/gitea/tea.git" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Awesome tests! |
||
|
||
# Deprecated: base_url. | ||
with pytest.warns(ConfigurationWarning, match="base_url is deprecated"): | ||
config = GitHubInfo.parse_obj( | ||
dict( | ||
repo="https://github.com/datahub-project/datahub", | ||
branch="master", | ||
base_url="http://mygithubmirror.local", | ||
) | ||
) | ||
|
||
|
||
def test_github_branch(): | ||
config = GitHubInfo( | ||
repo="owner/repo", | ||
) | ||
assert config.branch_for_clone is None | ||
|
||
config = GitHubInfo( | ||
repo="owner/repo", | ||
branch="main", | ||
) | ||
assert config.branch_for_clone == "main" | ||
|
||
|
||
def test_git_clone_public(tmp_path): | ||
git_clone = GitClone(str(tmp_path)) | ||
checkout_dir = git_clone.clone( | ||
ssh_key=None, | ||
repo_url="https://gitlab.com/gitlab-tests/sample-project", | ||
branch="90c439634077a85bcf42d38c2c79cd94664a94ad", | ||
) | ||
assert checkout_dir.exists() | ||
assert set(os.listdir(checkout_dir)) == { | ||
".git", | ||
"README.md", | ||
"hello_world.md", | ||
"fork-sample-project.png", | ||
} | ||
|
||
|
||
@pytest.mark.skipif( | ||
LOOKML_TEST_SSH_KEY is None, | ||
reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured", | ||
) | ||
def test_git_clone(tmp_path): | ||
def test_git_clone_private(tmp_path): | ||
git_clone = GitClone(str(tmp_path)) | ||
secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None | ||
|
||
|
@@ -22,7 +111,7 @@ def test_git_clone(tmp_path): | |
repo_url="[email protected]:acryldata/long-tail-companions-looker", | ||
branch="d380a2b777ec6f4653626f39c68dba85893faa74", | ||
) | ||
assert os.path.exists(checkout_dir) | ||
assert checkout_dir.exists() | ||
assert set(os.listdir(checkout_dir)) == set( | ||
[ | ||
".datahub", | ||
|
@@ -35,16 +124,3 @@ def test_git_clone(tmp_path): | |
"manifest.lkml", | ||
] | ||
) | ||
|
||
|
||
def test_github_branch(): | ||
config = GitHubInfo( | ||
repo="owner/repo", | ||
) | ||
assert config.branch_for_clone is None | ||
|
||
config = GitHubInfo( | ||
repo="owner/repo", | ||
branch="main", | ||
) | ||
assert config.branch_for_clone == "main" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are we safe to remove this? Do we want to have any mapping?