Skip to content
This repository has been archived by the owner on Oct 26, 2023. It is now read-only.

Commit

Permalink
Merge pull request #92 from snyk-tech-services/feat/audit-large-repos
Browse files Browse the repository at this point in the history
feat: audit large repos
  • Loading branch information
huytquach-snyk authored Apr 29, 2022
2 parents f22578b + 41f62d7 commit 5d5fde7
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 23 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ optional arguments:
--dry-run Simulate processing of the script without making changes to Snyk
--skip-scm-validation
Skip validation of the TLS certificate used by the SCM
--audit-large-repos only query github tree api to see if the response is truncated and
log the result. These are the repos that would have be cloned via this tool
--debug Write detailed debug data to snyk_scm_refresh.log for troubleshooting
```

Expand Down Expand Up @@ -136,3 +138,10 @@ Large repo detected, falling back to cloning. This may take a few minutes ...
![image](https://user-images.githubusercontent.com/59706011/163878251-e874b073-eab6-48c0-9bd3-ea995005e4a9.png)

The truncated GIT tree response is described [here](https://docs.github.com/en/rest/reference/git#get-a-tree). The last [known limits](https://github.community/t/github-get-tree-api-limits-and-recursivity/1300/2) are: 100,000 files or 7 MB of response data, whichever is first.

### Auditing which repos are considered large
To detect which repositories in Snyk are subject to the tree truncation issue mentioned above, use the `--audit-large-repos` option.
This will only query the git tree via the API, look for a truncated response, and log the results to the file `snyk-scm-refresh_large-repos-audit-results.csv`.

To find all the repos based on a Snyk org, use the `--org-id` parameter in conjunction with `--audit-large-repos`.
Optionally, you can check a single repo by also supplying the `--repo-name` filter.
20 changes: 18 additions & 2 deletions app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,17 @@
from app.models import ImportStatus
from app.gh_repo import (
get_gh_repo_status,
is_default_branch_renamed
is_default_branch_renamed,
is_gh_repo_truncated,
get_git_tree_from_api
)
from app.utils.snyk_helper import (
get_snyk_repos_from_snyk_orgs,
app_print,
process_import_status_checks,
import_manifests,
log_potential_delete
log_potential_delete,
log_audit_large_repo_result
)

def run():
Expand Down Expand Up @@ -75,6 +78,19 @@ def run():
log_potential_delete(snyk_repo.org_name, snyk_repo.full_name)

elif gh_repo_status.response_code == 200: # project exists and has not been renamed
# if --audit-large-repos is on
if common.ARGS.audit_large_repos:
is_truncated_str = \
is_gh_repo_truncated(
get_git_tree_from_api(snyk_repo.full_name, snyk_repo.origin)
)
log_audit_large_repo_result(
snyk_repo.org_name,
snyk_repo.full_name,
str(bool(is_truncated_str))
)
# move to next repo without processing the rest of the code
continue
# snyk has the wrong branch, re-import
if gh_repo_status.repo_default_branch != snyk_repo.branch:
app_print(snyk_repo.org_name,
Expand Down
62 changes: 42 additions & 20 deletions app/gh_repo.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""utilities for github"""
import logging
import re
import sys
import subprocess
import requests
from app.models import GithubRepoStatus
from app.utils.github_utils import (
get_github_client,
get_github_repo
)
import common


Expand All @@ -19,25 +24,38 @@
"manifests": []
}

def get_git_tree_from_clone(gh_repo):
def get_git_tree_from_clone(repo_name, origin):
"""
get git tree for large repos by performing
a shallow clone 'git clone --depth 1'
"""

tree_full_paths = []

gh_client = get_github_client(origin)
gh_repo = get_github_repo(gh_client, repo_name)

# check if git exists on the system
subprocess.run(["command", "-v", "git"], check=True, stdout=subprocess.DEVNULL)

name = gh_repo.name
clone_url = gh_repo.clone_url
default_branch = gh_repo.default_branch

print(f" - shallow cloning {name} from {clone_url} to /tmp")
GIT_CLONE_PATH = f"{common.GIT_CLONE_TEMP_DIR}/{name}"

# check that GIT_CLONE_PATH is set safely for deletion
if re.match(f'{common.GIT_CLONE_TEMP_DIR}/.+', GIT_CLONE_PATH) and \
re.match(rf'\/.+\/.+', GIT_CLONE_PATH):
pass
else:
sys.exit(f"could not determine that the temp cloning directory"
f"{GIT_CLONE_PATH} was set properly, exiting...")

print(f" - shallow cloning {name} from {clone_url} to {GIT_CLONE_PATH}")

# clone the repo locally
subprocess.run(["rm", "-fr", f"{common.GIT_CLONE_TEMP_DIR}/{name}"], check=True)
subprocess.run(["rm", "-fr", f"{GIT_CLONE_PATH}"], check=True)
subprocess.run(
["git", "clone", "--depth", "1", clone_url],
check=True,
Expand All @@ -56,9 +74,12 @@ def get_git_tree_from_clone(gh_repo):
capture_output=True,
check=True,
text=True,
cwd=f"{common.GIT_CLONE_TEMP_DIR}/{name}"
cwd=f"{GIT_CLONE_PATH}"
)

print(f" - removing cloned files in /tmp...")
subprocess.run(["rm", "-fr", f"{GIT_CLONE_PATH}"], check=True)

git_tree_lines = git_tree.stdout.splitlines()
print(f" - found {len(git_tree_lines)} tree items ...")

Expand All @@ -71,6 +92,18 @@ def get_git_tree_from_clone(gh_repo):

return tree_full_paths

def is_gh_repo_truncated(gh_tree_response) -> bool:
    """
    Report whether a git tree response is flagged as truncated.

    Reads the 'truncated' entry from the response's raw payload
    (the private _rawData mapping) and returns it unchanged.
    """
    #pylint: disable=protected-access
    raw_payload = gh_tree_response._rawData
    return raw_payload['truncated']

def get_git_tree_from_api(repo_name, origin):
    """
    Fetch the git tree of a repo's default branch via an API call.

    The client is selected from origin, the repo is looked up by
    repo_name, and the tree is requested for the repo's default branch
    with True passed as the second argument of get_git_tree.
    """
    repo = get_github_repo(get_github_client(origin), repo_name)
    return repo.get_git_tree(repo.default_branch, True)

def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code):
"""retrieve list of all supported manifests in a given github repo"""

Expand All @@ -79,29 +112,18 @@ def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code):
return state['manifests']

state['manifests'] = []
try:
if origin == 'github':
gh_repo = common.gh_client.get_repo(snyk_repo_name)
elif origin == 'github-enterprise':
gh_repo = common.gh_enterprise_client.get_repo(snyk_repo_name)
# pylint: disable=bare-except
except:
if origin == 'github':
gh_repo = common.gh_client.get_user().get_repo(snyk_repo_name)
elif origin == 'github-enterprise':
gh_repo = common.gh_enterprise_client.get_user().get_repo(snyk_repo_name)

tree_response = gh_repo.get_git_tree(gh_repo.default_branch, True)

tree_response = get_git_tree_from_api(snyk_repo_name, origin)

contents = tree_response.tree

#pylint: disable=protected-access
is_truncated_str = tree_response._rawData['truncated']
is_truncated_str = is_gh_repo_truncated(tree_response)

if is_truncated_str:
# repo too large to get try via API, just clone it
print(f" - Large repo detected, falling back to cloning. "
f"This may take a few minutes ...")
contents = get_git_tree_from_clone(gh_repo)
contents = get_git_tree_from_clone(snyk_repo_name, origin)
# print(f"tree contents: {contents}")

while contents:
Expand Down
20 changes: 20 additions & 0 deletions app/utils/github_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
github enterprise clients
"""
from github import Github
import common

# pylint: disable=invalid-name
def create_github_client(GITHUB_TOKEN, VERIFY_TLS):
Expand All @@ -23,3 +24,22 @@ def create_github_enterprise_client(GITHUB_ENTERPRISE_TOKEN, GITHUB_ENTERPRISE_H
raise RuntimeError(
"Failed to initialize GitHub client because GITHUB_ENTERPRISE_TOKEN is not set!"
) from err

def get_github_client(origin):
    """
    Get the right github client depending on integration type.

    origin: 'github' returns common.gh_client and 'github-enterprise'
        returns common.gh_enterprise_client.

    Raises ValueError for any other origin. ValueError is a subclass of
    Exception, so callers that catch Exception keep working.
    """
    # guard clauses: each client attribute is only read from common when
    # its origin actually matches
    if origin == 'github':
        return common.gh_client
    if origin == 'github-enterprise':
        return common.gh_enterprise_client
    raise ValueError(f"could not get github client for type: {origin}")

def get_github_repo(gh_client, repo_name):
    """
    Get a github repo by name.

    Tries gh_client.get_repo(repo_name) first. If that call raises, the
    lookup is retried through gh_client.get_user().get_repo(repo_name);
    any error from the retry propagates to the caller.
    """
    try:
        return gh_client.get_repo(repo_name)
    # Catch Exception instead of using a bare except, so that
    # KeyboardInterrupt and SystemExit are not swallowed by the fallback.
    # pylint: disable=broad-except
    except Exception:
        return gh_client.get_user().get_repo(repo_name)

7 changes: 7 additions & 0 deletions app/utils/snyk_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ def log_update_project_branch_error(org_name, project_id, project_name, new_bran
f"{project_id},"
f"{new_branch}\n")

def log_audit_large_repo_result(org_name: str, repo_name: str, is_large: str):
    """
    Append one audit result line to the large repos audit results file.

    The line is written to common.LARGE_REPOS_AUDIT_RESULTS_FILE as
    'org_name,repo_name,is_large' followed by a newline.
    """
    result_line = f"{org_name},{repo_name},{is_large}\n"
    common.LARGE_REPOS_AUDIT_RESULTS_FILE.write(result_line)

def get_snyk_repos_from_snyk_orgs(snyk_orgs, ARGS):
"""Build list of repositories from a given list of Snyk orgs"""
snyk_repos = []
Expand Down
11 changes: 11 additions & 0 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@
"%s_update-project-branches-errors.csv" % LOG_PREFIX, "w"
)
UPDATE_PROJECT_BRANCHES_ERRORS_FILE.write("org,project_name,project_id,new_branch\n")
LARGE_REPOS_AUDIT_RESULTS_FILE = open(
"%s_large-repos-audit-results.csv" % LOG_PREFIX, "w"
)
LARGE_REPOS_AUDIT_RESULTS_FILE.write("org,repo,is_large\n")

PENDING_REMOVAL_MAX_CHECKS = 45
PENDING_REMOVAL_CHECK_INTERVAL = 20
Expand Down Expand Up @@ -123,6 +127,13 @@ def parse_command_line_args():
required=False,
action="store_true",
)
parser.add_argument(
"--audit-large-repos",
help="only query github tree api to see if the response is truncated and \
log the result. These are the repos that would have be cloned via this tool",
required=False,
action="store_true",
)
parser.add_argument(
"--debug",
help="Write detailed debug data to snyk_scm_refresh.log for troubleshooting",
Expand Down
5 changes: 4 additions & 1 deletion snyk_scm_refresh.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@

if __name__ == "__main__":

if common.ARGS.dry_run:
if common.ARGS.audit_large_repos:
print("\n****** AUDIT LARGE REPOS MODE ******\n")
print(f"check {common.LARGE_REPOS_AUDIT_RESULTS_FILE.name} after script completes\n")
elif common.ARGS.dry_run:
print("\n****** DRY-RUN MODE ******\n")
for arg in vars(common.ARGS):
if any(arg in x for x in ['sca', 'container', 'iac', 'code']):
Expand Down

0 comments on commit 5d5fde7

Please sign in to comment.