Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ingest): support git clone of non-github repos #7065

Merged
merged 3 commits into from
Jan 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 71 additions & 17 deletions metadata-ingestion/src/datahub/configuration/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,78 @@

from pydantic import Field, FilePath, SecretStr, validator

from datahub.configuration.common import ConfigModel, ConfigurationError
from datahub.configuration.common import ConfigModel
from datahub.configuration.validate_field_rename import pydantic_renamed_field

_GITHUB_PREFIX = "https://github.com/"
_GITLAB_PREFIX = "https://gitlab.com/"

_GITHUB_URL_TEMPLATE = "{repo_url}/blob/{branch}/{file_path}"
_GITLAB_URL_TEMPLATE = "{repo_url}/-/blob/{branch}/{file_path}"


class GitHubReference(ConfigModel):
"""Reference to a hosted Git repository. Used to generate "view source" links."""

repo: str = Field(
description="Name of your github repository in org/repo format. e.g. repo for https://github.com/datahub-project/datahub is `datahub-project/datahub`."
description="Name of your Git repo e.g. https://github.com/datahub-project/datahub or https://gitlab.com/gitlab-org/gitlab. If organization/repo is provided, we assume it is a GitHub repo."
)
branch: str = Field(
"main",
description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
)
base_url: str = Field(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we safe to remove this? Do we want to have any mapping?

"https://github.com",
description="Base url for Github. Used to construct clickable links on the UI.",

url_template: Optional[str] = Field(
None,
description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required."
"It supports the following variables: {repo_url}, {branch}, {file_path}",
)

@validator("repo")
def repo_should_be_org_slash_repo(cls, repo: str) -> str:
if "/" not in repo or len(repo.split("/")) != 2:
raise ConfigurationError(
"github repo should be in organization/repo form e.g. datahub-project/datahub"
)
_deprecated_base_url = pydantic_renamed_field(
"base_url",
"url_template",
transform=lambda url: _GITHUB_URL_TEMPLATE,
)

@validator("repo", pre=True)
def simplify_repo_url(cls, repo: str) -> str:
if repo.startswith("github.com/"):
repo = f"https://{repo}"
elif repo.startswith("gitlab.com"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to match on a trailing slash here (`gitlab.com/`), as in the GitHub case?

repo = f"https://{repo}"
elif repo.count("/") == 1:
repo = f"https://github.com/{repo}"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this default to GitHub?


if repo.endswith("/"):
repo = repo[:-1]

return repo

@validator("url_template", always=True)
def infer_url_template(cls, url_template: Optional[str], values: dict) -> str:
    """Return the URL template to use for "view source" links.

    An explicitly configured ``url_template`` is returned unchanged.
    Otherwise the template is chosen from the already-validated ``repo``
    value: GitHub and GitLab URLs map to their built-in templates.

    Raises:
        ValueError: if no template was given and ``repo`` is unavailable
            or is hosted somewhere other than GitHub/GitLab.
    """
    if url_template is not None:
        return url_template

    # pydantic only places fields that validated successfully into
    # ``values``. Because this validator runs with ``always=True``, it is
    # still invoked when ``repo`` is missing or invalid, so look it up
    # defensively instead of crashing with a KeyError.
    repo: Optional[str] = values.get("repo")
    if repo is None:
        raise ValueError(
            "Unable to infer URL template because repo is missing or invalid."
        )

    if repo.startswith(_GITHUB_PREFIX):
        return _GITHUB_URL_TEMPLATE
    if repo.startswith(_GITLAB_PREFIX):
        return _GITLAB_URL_TEMPLATE
    raise ValueError(
        "Unable to infer URL template from repo. Please set url_template manually."
    )

def get_url_for_file_path(self, file_path: str) -> str:
return f"{self.base_url}/{self.repo}/blob/{self.branch}/{file_path}"
assert self.url_template
return self.url_template.format(
repo_url=self.repo, branch=self.branch, file_path=file_path
)


class GitHubInfo(GitHubReference):
"""A reference to a Git repository, including a deploy key that can be used to clone it."""

deploy_key_file: Optional[FilePath] = Field(
None,
description="A private key file that contains an ssh key that has been configured as a deploy key for this repository. Use a file where possible, else see deploy_key for a config field that accepts a raw string.",
Expand All @@ -43,7 +86,7 @@ class GitHubInfo(GitHubReference):

repo_ssh_locator: Optional[str] = Field(
None,
description="Auto-inferred from repo as git@github.com:{repo}, but you can override this if needed.",
description="The url to call `git clone` on. We infer this for github and gitlab repos, but it is required for other hosts.",
)

@validator("deploy_key_file")
Expand All @@ -69,10 +112,21 @@ def deploy_key_filled_from_deploy_key_file(
return v

@validator("repo_ssh_locator", always=True)
def auto_infer_from_repo(cls, v: Optional[str], values: Dict[str, Any]) -> str:
if v is None:
return f"git@github.com:{values.get('repo')}"
return v
def infer_repo_ssh_locator(
cls, repo_ssh_locator: Optional[str], values: dict
) -> str:
if repo_ssh_locator is not None:
return repo_ssh_locator

repo: str = values["repo"]
if repo.startswith(_GITHUB_PREFIX):
return f"git@github.com:{repo[len(_GITHUB_PREFIX):]}.git"
elif repo.startswith(_GITLAB_PREFIX):
return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX):]}.git"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice - this is really awesome!! I was hoping we'd be able to generalize this

else:
raise ValueError(
"Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually."
)

@property
def branch_for_clone(self) -> Optional[str]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1525,7 +1525,7 @@ def _recursively_check_manifests(
repo = p_cloner.get_last_repo_cloned()
assert repo
remote_github_info = GitHubInfo(
base_url=remote_project.url,
url_template=remote_project.url,
repo="dummy/dummy", # set to dummy values to bypass validation
branch=repo.active_branch.name,
)
Expand Down
108 changes: 92 additions & 16 deletions metadata-ingestion/tests/integration/git/test_git_clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,106 @@
import pytest
from pydantic import SecretStr

from datahub.configuration.github import GitHubInfo
from datahub.configuration.common import ConfigurationWarning
from datahub.configuration.github import GitHubInfo, GitHubReference
from datahub.ingestion.source.git.git_import import GitClone

LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")


def test_base_url_guessing():
    """Check URL-template and SSH-locator inference across Git hosts.

    Covers a full GitHub URL, the ``org/repo`` shorthand, GitLab URLs
    (including a trailing slash and a nested group), explicit overrides for
    another host, and the deprecated ``base_url`` option.
    """
    # Basic GitHub repo.
    config = GitHubInfo(
        repo="https://github.com/datahub-project/datahub", branch="master"
    )
    assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"

    # Defaults to GitHub.
    config = GitHubInfo(repo="datahub-project/datahub", branch="master")
    assert (
        config.get_url_for_file_path("docker/README.md")
        == "https://github.com/datahub-project/datahub/blob/master/docker/README.md"
    )
    assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"

    # GitLab repo (notice the trailing slash).
    config_ref = GitHubReference(
        repo="https://gitlab.com/gitlab-tests/sample-project/", branch="master"
    )
    assert (
        config_ref.get_url_for_file_path("hello_world.md")
        == "https://gitlab.com/gitlab-tests/sample-project/-/blob/master/hello_world.md"
    )

    # Three-tier GitLab repo.
    config = GitHubInfo(
        repo="https://gitlab.com/gitlab-com/gl-infra/reliability", branch="master"
    )
    assert (
        config.get_url_for_file_path("onboarding/gitlab.nix")
        == "https://gitlab.com/gitlab-com/gl-infra/reliability/-/blob/master/onboarding/gitlab.nix"
    )
    assert (
        config.repo_ssh_locator == "git@gitlab.com:gitlab-com/gl-infra/reliability.git"
    )

    # Overrides.
    config = GitHubInfo(
        repo="https://gitea.com/gitea/tea",
        branch="main",
        url_template="https://gitea.com/gitea/tea/src/branch/{branch}/{file_path}",
        repo_ssh_locator="https://gitea.com/gitea/tea.git",
    )
    # These comparisons must be asserted; as bare expressions they are
    # evaluated and discarded, so the override path would go unchecked.
    assert (
        config.get_url_for_file_path("cmd/admin.go")
        == "https://gitea.com/gitea/tea/src/branch/main/cmd/admin.go"
    )
    assert config.repo_ssh_locator == "https://gitea.com/gitea/tea.git"

    # Deprecated: base_url.
    with pytest.warns(ConfigurationWarning, match="base_url is deprecated"):
        config = GitHubInfo.parse_obj(
            dict(
                repo="https://github.com/datahub-project/datahub",
                branch="master",
                base_url="http://mygithubmirror.local",
            )
        )


def test_github_branch():
    """Check ``branch_for_clone`` with and without an explicitly passed branch."""
    # No branch passed: nothing specific to check out.
    without_branch = GitHubInfo(repo="owner/repo")
    assert without_branch.branch_for_clone is None

    # Branch passed explicitly: it is reported back for cloning.
    with_branch = GitHubInfo(repo="owner/repo", branch="main")
    assert with_branch.branch_for_clone == "main"


def test_git_clone_public(tmp_path):
    """Clone a public GitLab repo without an SSH key and check its top-level entries."""
    expected_entries = {
        ".git",
        "README.md",
        "hello_world.md",
        "fork-sample-project.png",
    }

    cloner = GitClone(str(tmp_path))
    checkout_dir = cloner.clone(
        ssh_key=None,
        repo_url="https://gitlab.com/gitlab-tests/sample-project",
        branch="90c439634077a85bcf42d38c2c79cd94664a94ad",
    )

    assert checkout_dir.exists()
    assert set(os.listdir(checkout_dir)) == expected_entries


@pytest.mark.skipif(
LOOKML_TEST_SSH_KEY is None,
reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured",
)
def test_git_clone(tmp_path):
def test_git_clone_private(tmp_path):
git_clone = GitClone(str(tmp_path))
secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None

Expand All @@ -22,7 +111,7 @@ def test_git_clone(tmp_path):
repo_url="git@github.com:acryldata/long-tail-companions-looker",
branch="d380a2b777ec6f4653626f39c68dba85893faa74",
)
assert os.path.exists(checkout_dir)
assert checkout_dir.exists()
assert set(os.listdir(checkout_dir)) == set(
[
".datahub",
Expand All @@ -35,16 +124,3 @@ def test_git_clone(tmp_path):
"manifest.lkml",
]
)


def test_github_branch():
config = GitHubInfo(
repo="owner/repo",
)
assert config.branch_for_clone is None

config = GitHubInfo(
repo="owner/repo",
branch="main",
)
assert config.branch_for_clone == "main"