Skip to content

Commit

Permalink
fix(ingest): support git clone of non-github repos (datahub-project#7065
Browse files Browse the repository at this point in the history
)
  • Loading branch information
hsheth2 authored and Eric Yomi committed Feb 8, 2023
1 parent f5d32ac commit e74d8be
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 34 deletions.
88 changes: 71 additions & 17 deletions metadata-ingestion/src/datahub/configuration/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,78 @@

from pydantic import Field, FilePath, SecretStr, validator

from datahub.configuration.common import ConfigModel, ConfigurationError
from datahub.configuration.common import ConfigModel
from datahub.configuration.validate_field_rename import pydantic_renamed_field

_GITHUB_PREFIX = "https://github.com/"
_GITLAB_PREFIX = "https://gitlab.com/"

_GITHUB_URL_TEMPLATE = "{repo_url}/blob/{branch}/{file_path}"
_GITLAB_URL_TEMPLATE = "{repo_url}/-/blob/{branch}/{file_path}"


class GitHubReference(ConfigModel):
"""Reference to a hosted Git repository. Used to generate "view source" links."""

repo: str = Field(
description="Name of your github repository in org/repo format. e.g. repo for https://github.com/datahub-project/datahub is `datahub-project/datahub`."
description="Name of your Git repo e.g. https://github.com/datahub-project/datahub or https://gitlab.com/gitlab-org/gitlab. If organization/repo is provided, we assume it is a GitHub repo."
)
branch: str = Field(
"main",
description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
)
base_url: str = Field(
"https://github.com",
description="Base url for Github. Used to construct clickable links on the UI.",

url_template: Optional[str] = Field(
None,
description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required. "
"It supports the following variables: {repo_url}, {branch}, {file_path}",
)

@validator("repo")
def repo_should_be_org_slash_repo(cls, repo: str) -> str:
if "/" not in repo or len(repo.split("/")) != 2:
raise ConfigurationError(
"github repo should be in organization/repo form e.g. datahub-project/datahub"
)
_deprecated_base_url = pydantic_renamed_field(
"base_url",
"url_template",
transform=lambda url: _GITHUB_URL_TEMPLATE,
)

@validator("repo", pre=True)
def simplify_repo_url(cls, repo: str) -> str:
    """Normalize the user-supplied ``repo`` value into a full repository URL.

    - Values starting with ``github.com/`` or ``gitlab.com`` get an
      ``https://`` scheme prepended.
    - A bare ``organization/repo`` value (with or without a trailing slash)
      is assumed to be a GitHub repo.
    - A single trailing slash is removed.

    Any other value is returned as-is.
    """
    # This is a ``pre`` validator, so it receives the raw input. Hand
    # non-strings back untouched so the field's own type handling deals with
    # them, instead of an AttributeError escaping from ``.startswith``.
    if not isinstance(repo, str):
        return repo

    # Evaluate the org/repo shorthand on a copy without the trailing slash;
    # otherwise "org/repo/" has two slashes and is never expanded.
    shorthand = repo[:-1] if repo.endswith("/") else repo

    if repo.startswith("github.com/") or repo.startswith("gitlab.com"):
        repo = f"https://{repo}"
    elif repo.count("/") == 1 or shorthand.count("/") == 1:
        repo = f"https://github.com/{repo}"

    if repo.endswith("/"):
        repo = repo[:-1]

    return repo

@validator("url_template", always=True)
def infer_url_template(cls, url_template: Optional[str], values: dict) -> str:
    """Fill in ``url_template`` from ``repo`` when it was not set explicitly.

    An explicit template always wins. Otherwise the GitHub or GitLab
    template is chosen based on the prefix of the ``repo`` URL.

    Raises:
        ValueError: if no template was given and ``repo`` is not a GitHub or
            GitLab URL (including the case where ``repo`` is unavailable).
    """
    if url_template is not None:
        return url_template

    # ``repo`` is not present in ``values`` when it was missing or failed its
    # own validation. Indexing would raise KeyError, which is not one of the
    # exception types pydantic reports as a validation error, so default to
    # an empty string and fall through to the ValueError below.
    repo: str = values.get("repo", "")
    if repo.startswith(_GITHUB_PREFIX):
        return _GITHUB_URL_TEMPLATE
    elif repo.startswith(_GITLAB_PREFIX):
        return _GITLAB_URL_TEMPLATE
    else:
        raise ValueError(
            "Unable to infer URL template from repo. Please set url_template manually."
        )

def get_url_for_file_path(self, file_path: str) -> str:
return f"{self.base_url}/{self.repo}/blob/{self.branch}/{file_path}"
assert self.url_template
return self.url_template.format(
repo_url=self.repo, branch=self.branch, file_path=file_path
)


class GitHubInfo(GitHubReference):
"""A reference to a Git repository, including a deploy key that can be used to clone it."""

deploy_key_file: Optional[FilePath] = Field(
None,
description="A private key file that contains an ssh key that has been configured as a deploy key for this repository. Use a file where possible, else see deploy_key for a config field that accepts a raw string.",
Expand All @@ -43,7 +86,7 @@ class GitHubInfo(GitHubReference):

repo_ssh_locator: Optional[str] = Field(
None,
description="Auto-inferred from repo as git@github.com:{repo}, but you can override this if needed.",
description="The url to call `git clone` on. We infer this for github and gitlab repos, but it is required for other hosts.",
)

@validator("deploy_key_file")
Expand All @@ -69,10 +112,21 @@ def deploy_key_filled_from_deploy_key_file(
return v

@validator("repo_ssh_locator", always=True)
def auto_infer_from_repo(cls, v: Optional[str], values: Dict[str, Any]) -> str:
if v is None:
return f"git@github.com:{values.get('repo')}"
return v
def infer_repo_ssh_locator(
cls, repo_ssh_locator: Optional[str], values: dict
) -> str:
if repo_ssh_locator is not None:
return repo_ssh_locator

repo: str = values["repo"]
if repo.startswith(_GITHUB_PREFIX):
return f"git@github.com:{repo[len(_GITHUB_PREFIX):]}.git"
elif repo.startswith(_GITLAB_PREFIX):
return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX):]}.git"
else:
raise ValueError(
"Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually."
)

@property
def branch_for_clone(self) -> Optional[str]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1525,7 +1525,7 @@ def _recursively_check_manifests(
repo = p_cloner.get_last_repo_cloned()
assert repo
remote_github_info = GitHubInfo(
base_url=remote_project.url,
url_template=remote_project.url,
repo="dummy/dummy", # set to dummy values to bypass validation
branch=repo.active_branch.name,
)
Expand Down
108 changes: 92 additions & 16 deletions metadata-ingestion/tests/integration/git/test_git_clone.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,106 @@
import pytest
from pydantic import SecretStr

from datahub.configuration.github import GitHubInfo
from datahub.configuration.common import ConfigurationWarning
from datahub.configuration.github import GitHubInfo, GitHubReference
from datahub.ingestion.source.git.git_import import GitClone

LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")


def test_base_url_guessing():
    """Check repo normalization plus URL-template and SSH-locator inference."""
    # Basic GitHub repo.
    config = GitHubInfo(
        repo="https://github.com/datahub-project/datahub", branch="master"
    )
    assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"

    # Defaults to GitHub.
    config = GitHubInfo(repo="datahub-project/datahub", branch="master")
    assert (
        config.get_url_for_file_path("docker/README.md")
        == "https://github.com/datahub-project/datahub/blob/master/docker/README.md"
    )
    assert config.repo_ssh_locator == "git@github.com:datahub-project/datahub.git"

    # GitLab repo (notice the trailing slash).
    config_ref = GitHubReference(
        repo="https://gitlab.com/gitlab-tests/sample-project/", branch="master"
    )
    assert (
        config_ref.get_url_for_file_path("hello_world.md")
        == "https://gitlab.com/gitlab-tests/sample-project/-/blob/master/hello_world.md"
    )

    # Three-tier GitLab repo.
    config = GitHubInfo(
        repo="https://gitlab.com/gitlab-com/gl-infra/reliability", branch="master"
    )
    assert (
        config.get_url_for_file_path("onboarding/gitlab.nix")
        == "https://gitlab.com/gitlab-com/gl-infra/reliability/-/blob/master/onboarding/gitlab.nix"
    )
    assert (
        config.repo_ssh_locator == "git@gitlab.com:gitlab-com/gl-infra/reliability.git"
    )

    # Overrides.
    config = GitHubInfo(
        repo="https://gitea.com/gitea/tea",
        branch="main",
        url_template="https://gitea.com/gitea/tea/src/branch/{branch}/{file_path}",
        repo_ssh_locator="https://gitea.com/gitea/tea.git",
    )
    # These comparisons must be asserted; as bare expressions they are
    # evaluated and discarded, so the override behavior was never checked.
    assert (
        config.get_url_for_file_path("cmd/admin.go")
        == "https://gitea.com/gitea/tea/src/branch/main/cmd/admin.go"
    )
    assert config.repo_ssh_locator == "https://gitea.com/gitea/tea.git"

    # Deprecated: base_url.
    with pytest.warns(ConfigurationWarning, match="base_url is deprecated"):
        config = GitHubInfo.parse_obj(
            dict(
                repo="https://github.com/datahub-project/datahub",
                branch="master",
                base_url="http://mygithubmirror.local",
            )
        )


def test_github_branch():
    """Check ``branch_for_clone`` with the branch omitted and with it given."""
    # No branch supplied in the config.
    without_branch = GitHubInfo(repo="owner/repo")
    assert without_branch.branch_for_clone is None

    # Branch supplied explicitly.
    with_branch = GitHubInfo(repo="owner/repo", branch="main")
    assert with_branch.branch_for_clone == "main"


def test_git_clone_public(tmp_path):
    """Clone a GitLab repo over HTTPS without an SSH key and check its contents."""
    cloner = GitClone(str(tmp_path))
    checkout_path = cloner.clone(
        ssh_key=None,
        repo_url="https://gitlab.com/gitlab-tests/sample-project",
        branch="90c439634077a85bcf42d38c2c79cd94664a94ad",
    )
    assert checkout_path.exists()

    # Top-level entries expected in the checkout at the pinned revision.
    expected_entries = {
        ".git",
        "README.md",
        "hello_world.md",
        "fork-sample-project.png",
    }
    assert set(os.listdir(checkout_path)) == expected_entries


@pytest.mark.skipif(
LOOKML_TEST_SSH_KEY is None,
reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured",
)
def test_git_clone(tmp_path):
def test_git_clone_private(tmp_path):
git_clone = GitClone(str(tmp_path))
secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None

Expand All @@ -22,7 +111,7 @@ def test_git_clone(tmp_path):
repo_url="git@github.com:acryldata/long-tail-companions-looker",
branch="d380a2b777ec6f4653626f39c68dba85893faa74",
)
assert os.path.exists(checkout_dir)
assert checkout_dir.exists()
assert set(os.listdir(checkout_dir)) == set(
[
".datahub",
Expand All @@ -35,16 +124,3 @@ def test_git_clone(tmp_path):
"manifest.lkml",
]
)


def test_github_branch():
config = GitHubInfo(
repo="owner/repo",
)
assert config.branch_for_clone is None

config = GitHubInfo(
repo="owner/repo",
branch="main",
)
assert config.branch_for_clone == "main"

0 comments on commit e74d8be

Please sign in to comment.