Skip to content
This repository has been archived by the owner on Oct 26, 2023. It is now read-only.

feat/audit large repos #92

Merged
merged 1 commit into from
Apr 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ optional arguments:
--dry-run Simulate processing of the script without making changes to Snyk
--skip-scm-validation
Skip validation of the TLS certificate used by the SCM
--audit-large-repos only query github tree api to see if the response is truncated and
log the result. These are the repos that would have be cloned via this tool
--debug Write detailed debug data to snyk_scm_refresh.log for troubleshooting
```

Expand Down Expand Up @@ -136,3 +138,10 @@ Large repo detected, falling back to cloning. This may take a few minutes ...
![image](https://user-images.githubusercontent.com/59706011/163878251-e874b073-eab6-48c0-9bd3-ea995005e4a9.png)

The truncated GIT tree response is described [here](https://docs.github.com/en/rest/reference/git#get-a-tree). The last [known limits](https://github.community/t/github-get-tree-api-limits-and-recursivity/1300/2) are: 100,000 files or 7 MB of response data, whichever is first.

### Auditing which repos are considered large
To detect which repositories in Snyk are subject to the tree truncation issue mentioned above, use the `--audit-large-repos` option.
This will only query the git tree via the API, check whether the response is truncated, and then log the results to a file `snyk-scm-refresh_large-repos-audit-results.csv`.

To audit all the repos in a Snyk org, use the `--org-id` parameter in conjunction with `--audit-large-repos`.
Optionally, you can check a single repo by also supplying the `--repo-name` filter.
20 changes: 18 additions & 2 deletions app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,17 @@
from app.models import ImportStatus
from app.gh_repo import (
get_gh_repo_status,
is_default_branch_renamed
is_default_branch_renamed,
is_gh_repo_truncated,
get_git_tree_from_api
)
from app.utils.snyk_helper import (
get_snyk_repos_from_snyk_orgs,
app_print,
process_import_status_checks,
import_manifests,
log_potential_delete
log_potential_delete,
log_audit_large_repo_result
)

def run():
Expand Down Expand Up @@ -75,6 +78,19 @@ def run():
log_potential_delete(snyk_repo.org_name, snyk_repo.full_name)

elif gh_repo_status.response_code == 200: # project exists and has not been renamed
# if --audit-large-repos is on
if common.ARGS.audit_large_repos:
is_truncated_str = \
is_gh_repo_truncated(
get_git_tree_from_api(snyk_repo.full_name, snyk_repo.origin)
)
log_audit_large_repo_result(
snyk_repo.org_name,
snyk_repo.full_name,
str(bool(is_truncated_str))
)
# move to next repo without processing the rest of the code
continue
# snyk has the wrong branch, re-import
if gh_repo_status.repo_default_branch != snyk_repo.branch:
app_print(snyk_repo.org_name,
Expand Down
62 changes: 42 additions & 20 deletions app/gh_repo.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""utilities for github"""
import logging
import re
import sys
import subprocess
import requests
from app.models import GithubRepoStatus
from app.utils.github_utils import (
get_github_client,
get_github_repo
)
import common


Expand All @@ -19,25 +24,38 @@
"manifests": []
}

def get_git_tree_from_clone(gh_repo):
def get_git_tree_from_clone(repo_name, origin):
"""
get git tree for large repos by performing
a shallow clone 'git clone --depth 1'
"""

tree_full_paths = []

gh_client = get_github_client(origin)
gh_repo = get_github_repo(gh_client, repo_name)

# check if git exists on the system
subprocess.run(["command", "-v", "git"], check=True, stdout=subprocess.DEVNULL)

name = gh_repo.name
clone_url = gh_repo.clone_url
default_branch = gh_repo.default_branch

print(f" - shallow cloning {name} from {clone_url} to /tmp")
GIT_CLONE_PATH = f"{common.GIT_CLONE_TEMP_DIR}/{name}"

# check that GIT_CLONE_PATH is set safely for deletion
if re.match(f'{common.GIT_CLONE_TEMP_DIR}/.+', GIT_CLONE_PATH) and \
re.match(rf'\/.+\/.+', GIT_CLONE_PATH):
pass
else:
sys.exit(f"could not determine that the temp cloning directory"
f"{GIT_CLONE_PATH} was set properly, exiting...")

print(f" - shallow cloning {name} from {clone_url} to {GIT_CLONE_PATH}")

# clone the repo locally
subprocess.run(["rm", "-fr", f"{common.GIT_CLONE_TEMP_DIR}/{name}"], check=True)
subprocess.run(["rm", "-fr", f"{GIT_CLONE_PATH}"], check=True)
subprocess.run(
["git", "clone", "--depth", "1", clone_url],
check=True,
Expand All @@ -56,9 +74,12 @@ def get_git_tree_from_clone(gh_repo):
capture_output=True,
check=True,
text=True,
cwd=f"{common.GIT_CLONE_TEMP_DIR}/{name}"
cwd=f"{GIT_CLONE_PATH}"
)

print(f" - removing cloned files in /tmp...")
subprocess.run(["rm", "-fr", f"{GIT_CLONE_PATH}"], check=True)

git_tree_lines = git_tree.stdout.splitlines()
print(f" - found {len(git_tree_lines)} tree items ...")

Expand All @@ -71,6 +92,18 @@ def get_git_tree_from_clone(gh_repo):

return tree_full_paths

def is_gh_repo_truncated(gh_tree_response) -> bool:
    """
    Report whether a git tree response was flagged as truncated.

    The value stored under the 'truncated' key of the response's raw
    payload is returned unchanged.
    """
    # the flag is only reachable through the raw payload of the response
    #pylint: disable=protected-access
    raw_payload = gh_tree_response._rawData
    truncated_flag = raw_payload['truncated']
    return truncated_flag

def get_git_tree_from_api(repo_name, origin):
    """
    Fetch the git tree of a repo's default branch via the API.

    The client is selected from `origin`, the repo is looked up by
    `repo_name`, and the tree response for its default branch is returned.
    """
    repo = get_github_repo(get_github_client(origin), repo_name)
    branch = repo.default_branch

    # NOTE(review): the trailing True is presumably the "recursive" flag
    # of the tree call - confirm against the client library
    return repo.get_git_tree(branch, True)

def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code):
"""retrieve list of all supported manifests in a given github repo"""

Expand All @@ -79,29 +112,18 @@ def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code):
return state['manifests']

state['manifests'] = []
try:
if origin == 'github':
gh_repo = common.gh_client.get_repo(snyk_repo_name)
elif origin == 'github-enterprise':
gh_repo = common.gh_enterprise_client.get_repo(snyk_repo_name)
# pylint: disable=bare-except
except:
if origin == 'github':
gh_repo = common.gh_client.get_user().get_repo(snyk_repo_name)
elif origin == 'github-enterprise':
gh_repo = common.gh_enterprise_client.get_user().get_repo(snyk_repo_name)

tree_response = gh_repo.get_git_tree(gh_repo.default_branch, True)

tree_response = get_git_tree_from_api(snyk_repo_name, origin)

contents = tree_response.tree

#pylint: disable=protected-access
is_truncated_str = tree_response._rawData['truncated']
is_truncated_str = is_gh_repo_truncated(tree_response)

if is_truncated_str:
# repo too large to get try via API, just clone it
print(f" - Large repo detected, falling back to cloning. "
f"This may take a few minutes ...")
contents = get_git_tree_from_clone(gh_repo)
contents = get_git_tree_from_clone(snyk_repo_name, origin)
# print(f"tree contents: {contents}")

while contents:
Expand Down
20 changes: 20 additions & 0 deletions app/utils/github_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
github enterprise clients
"""
from github import Github
import common

# pylint: disable=invalid-name
def create_github_client(GITHUB_TOKEN, VERIFY_TLS):
Expand All @@ -23,3 +24,22 @@ def create_github_enterprise_client(GITHUB_ENTERPRISE_TOKEN, GITHUB_ENTERPRISE_H
raise RuntimeError(
"Failed to initialize GitHub client because GITHUB_ENTERPRISE_TOKEN is not set!"
) from err

def get_github_client(origin):
    """
    Get the right github client depending on integration type.

    Args:
        origin: integration type, either 'github' or 'github-enterprise'.

    Returns:
        common.gh_client for 'github', common.gh_enterprise_client for
        'github-enterprise'.

    Raises:
        ValueError: if origin is not one of the supported integration types.
            ValueError is a subclass of Exception, so callers that catch
            Exception keep working.
    """
    # the client attributes are only read for the matching origin, so an
    # unsupported origin never touches the clients
    if origin == 'github':
        return common.gh_client
    if origin == 'github-enterprise':
        return common.gh_enterprise_client

    raise ValueError(f"could not get github client for type: {origin}")

def get_github_repo(gh_client, repo_name):
    """
    Get a github repo by name.

    The repo is first looked up directly on the client; if that lookup
    raises, it is retried through the client's user object.

    Args:
        gh_client: client object exposing get_repo() and get_user().
        repo_name: name of the repo to look up.

    Returns:
        Whatever the successful get_repo() call returns.

    Raises:
        Any exception raised by the fallback lookup. KeyboardInterrupt and
        SystemExit from the first lookup now propagate instead of being
        swallowed and retried.
    """
    try:
        return gh_client.get_repo(repo_name)
    # catch Exception rather than using a bare except, so that interrupts
    # and interpreter exit requests are not mistaken for a failed lookup
    # pylint: disable=broad-except
    except Exception:
        return gh_client.get_user().get_repo(repo_name)

7 changes: 7 additions & 0 deletions app/utils/snyk_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ def log_update_project_branch_error(org_name, project_id, project_name, new_bran
f"{project_id},"
f"{new_branch}\n")

def log_audit_large_repo_result(org_name: str, repo_name: str, is_large: str):
    """
    Record one large-repo audit result.

    Writes a single comma-separated line (org name, repo name, is_large)
    to common.LARGE_REPOS_AUDIT_RESULTS_FILE.
    """
    audit_row = f"{org_name},{repo_name},{is_large}\n"
    common.LARGE_REPOS_AUDIT_RESULTS_FILE.write(audit_row)

def get_snyk_repos_from_snyk_orgs(snyk_orgs, ARGS):
"""Build list of repositories from a given list of Snyk orgs"""
snyk_repos = []
Expand Down
11 changes: 11 additions & 0 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@
"%s_update-project-branches-errors.csv" % LOG_PREFIX, "w"
)
UPDATE_PROJECT_BRANCHES_ERRORS_FILE.write("org,project_name,project_id,new_branch\n")
LARGE_REPOS_AUDIT_RESULTS_FILE = open(
"%s_large-repos-audit-results.csv" % LOG_PREFIX, "w"
)
LARGE_REPOS_AUDIT_RESULTS_FILE.write("org,repo,is_large\n")

PENDING_REMOVAL_MAX_CHECKS = 45
PENDING_REMOVAL_CHECK_INTERVAL = 20
Expand Down Expand Up @@ -123,6 +127,13 @@ def parse_command_line_args():
required=False,
action="store_true",
)
parser.add_argument(
"--audit-large-repos",
help="only query github tree api to see if the response is truncated and \
log the result. These are the repos that would have be cloned via this tool",
required=False,
action="store_true",
)
parser.add_argument(
"--debug",
help="Write detailed debug data to snyk_scm_refresh.log for troubleshooting",
Expand Down
5 changes: 4 additions & 1 deletion snyk_scm_refresh.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@

if __name__ == "__main__":

if common.ARGS.dry_run:
if common.ARGS.audit_large_repos:
print("\n****** AUDIT LARGE REPOS MODE ******\n")
print(f"check {common.LARGE_REPOS_AUDIT_RESULTS_FILE.name} after script completes\n")
elif common.ARGS.dry_run:
print("\n****** DRY-RUN MODE ******\n")
for arg in vars(common.ARGS):
if any(arg in x for x in ['sca', 'container', 'iac', 'code']):
Expand Down