Skip to content
This repository has been archived by the owner on Oct 26, 2023. It is now read-only.

Fix/handle large repos #90

Merged
merged 1 commit into from
Apr 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ If using the Snyk Github Enterprise Integration type for your Github.com reposit
1. In GitHub.com browse: https://github.com/settings/tokens/new. Or in GitHub Enterprise select your user icon (top-right), then 'Settings', then 'Developer settings', then 'Personal access tokens'.
2. Scopes - Public repos do not need a scope. If you want to scan private repos, then you'll need to enable this scope: `repo` (Full control of private repositories)

## Handling self-signed certificates
### Handling self-signed certificates
This tool uses the Python `requests` library, so you can point the [REQUESTS_CA_BUNDLE](https://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification) environment variable at the location of your certificate bundle:

`export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt`
Expand Down Expand Up @@ -120,3 +120,19 @@ Use the `--dry-run` option to verify the execution plan for the first run
| _updated-project-branches.csv | projects with updated default branch |
| _update-project-branches-errors.csv | projects that had an error attempting to update default branch |
| _repos-skipped-on-error.csv | repos skipped due to import error |

## Handling of large repositories
By default, this tool retrieves the Git tree of each repository via the GitHub API and uses it as the basis for comparison.
For sufficiently large repositories, however, GitHub truncates the API response. When a truncated response is detected while retrieving the Git tree,
this tool falls back to the local `git` executable (if it is installed and configured) and performs a shallow clone of the repository's default branch in order to build the tree.

The `git clone` is performed in `/tmp`, and the tree is then built from the output of `git ls-tree -r`.

When this situation occurs, you will see the following in the console:
```
Large repo detected, falling back to cloning. This may take a few minutes ...
```

![image](https://user-images.githubusercontent.com/59706011/163878251-e874b073-eab6-48c0-9bd3-ea995005e4a9.png)

The truncated Git tree response is described [here](https://docs.github.com/en/rest/reference/git#get-a-tree). The last [known limits](https://github.community/t/github-get-tree-api-limits-and-recursivity/1300/2) are 100,000 files or 7 MB of response data, whichever is reached first.
2 changes: 1 addition & 1 deletion app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def run():

app_print(snyk_repo.org_name,
snyk_repo.full_name,
"Looking for new manifests in code repository")
"Checking for new manifests in source tree")

#if not common.ARGS.dry_run:
projects_import = snyk_repo.add_new_manifests(common.ARGS.dry_run)
Expand Down
107 changes: 98 additions & 9 deletions app/gh_repo.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,84 @@
"""utilities for github"""
import logging
import re
import subprocess
import requests
from app.models import GithubRepoStatus
import common


# suppress InsecureRequestWarning when using --skip-scm-validation option
# due to pylint bug
# https://github.com/PyCQA/pylint/issues/4584)
# pylint: disable=no-member
requests.packages.urllib3.disable_warnings()

# pylint: disable=invalid-name
# Module-level state shared between successive calls to get_repo_manifests:
#   tree_already_retrieved - flag set once a manifest list has been built;
#                            the next call returns that list and clears it
#   manifests              - manifest paths collected by the latest retrieval
state = {
    "tree_already_retrieved": False,
    "manifests": [],
}

def get_git_tree_from_clone(gh_repo):
    """
    get git tree for large repos by performing
    a shallow clone 'git clone --depth 1'

    The repository is cloned into common.GIT_CLONE_TEMP_DIR/<repo name>
    (any previous directory of that name is removed first) and the tree
    is read from 'git ls-tree -r -z <default branch>'.

    :param gh_repo: repository object exposing the attributes
        ``name``, ``clone_url`` and ``default_branch``
    :return: list of dicts, one per tree entry, each with the keys
        ``sha`` and ``path``
    :raises FileNotFoundError: if the git executable cannot be found
    :raises subprocess.CalledProcessError: if any git/rm command fails
    """

    tree_full_paths = []

    # check if git exists on the system; 'command -v' is a shell builtin
    # and cannot be executed without a shell, so invoke git itself
    # (raises FileNotFoundError when git is not on the PATH)
    subprocess.run(["git", "--version"], check=True, stdout=subprocess.DEVNULL)

    name = gh_repo.name
    clone_url = gh_repo.clone_url
    default_branch = gh_repo.default_branch
    clone_dir = f"{common.GIT_CLONE_TEMP_DIR}/{name}"

    print(f" - shallow cloning {name} from {clone_url} to {common.GIT_CLONE_TEMP_DIR}")

    # clone the repo locally, starting from a clean target directory
    subprocess.run(["rm", "-fr", clone_dir], check=True)
    subprocess.run(
        # name the target directory explicitly so it always matches
        # clone_dir, whatever directory name git would derive from the URL
        ["git", "clone", "--depth", "1", "--", clone_url, name],
        check=True,
        cwd=common.GIT_CLONE_TEMP_DIR
    )

    print(" - Loading tree from local git structure")

    git_tree = subprocess.run(
        [
            "git",
            "ls-tree",
            "-r",
            # -z: NUL-terminated entries with paths emitted verbatim
            # (no quoting), so unusual file names survive intact
            "-z",
            default_branch
        ],
        capture_output=True,
        check=True,
        text=True,
        cwd=clone_dir
    )

    git_tree_entries = [entry for entry in git_tree.stdout.split("\0") if entry]
    print(f" - found {len(git_tree_entries)} tree items ...")

    for entry in git_tree_entries:
        # entry format: "<mode> <type> <sha>\t<path>"; split on the first
        # tab only so that paths containing whitespace are kept whole
        meta, path = entry.split("\t", 1)
        sha = meta.split()[2]
        tree_full_paths.append({
            "sha": sha,
            "path": path
        })

    return tree_full_paths

def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code):
"""retrieve list of all supported manifests in a given github repo"""
manifests = []

if state['tree_already_retrieved']:
state['tree_already_retrieved'] = False
return state['manifests']

state['manifests'] = []
try:
if origin == 'github':
gh_repo = common.gh_client.get_repo(snyk_repo_name)
Expand All @@ -22,20 +87,44 @@ def get_repo_manifests(snyk_repo_name, origin, skip_snyk_code):
# pylint: disable=bare-except
except:
if origin == 'github':
gh_repo = common.gh_enterprise_client.get_user().get_repo(snyk_repo_name)
gh_repo = common.gh_client.get_user().get_repo(snyk_repo_name)
elif origin == 'github-enterprise':
gh_repo = common.gh_enterprise_client.get_user().get_repo(snyk_repo_name)

contents = gh_repo.get_git_tree(gh_repo.default_branch, True).tree
tree_response = gh_repo.get_git_tree(gh_repo.default_branch, True)
contents = tree_response.tree

#pylint: disable=protected-access
is_truncated_str = tree_response._rawData['truncated']

if is_truncated_str:
# repo too large to get the tree via the API, just clone it
print(f" - Large repo detected, falling back to cloning. "
f"This may take a few minutes ...")
contents = get_git_tree_from_clone(gh_repo)
# print(f"tree contents: {contents}")

while contents:
file_content = contents.pop(0)
if passes_manifest_filter(file_content.path, skip_snyk_code):
manifests.append(file_content.path)
if re.match(common.MANIFEST_PATTERN_CODE, file_content.path):
tree_element = contents.pop(0)
# print(f"tree_element: {tree_element}")
if is_truncated_str:
tree_element_sha = tree_element['sha']
tree_element_path = tree_element['path']
else:
tree_element_sha = tree_element.sha
tree_element_path = tree_element.path
full_path = {
"sha": tree_element_sha,
"path": tree_element_path
}
if passes_manifest_filter(full_path['path'], skip_snyk_code):
#print(f"appending to manifests to check")
state['manifests'].append(full_path['path'])
if re.match(common.MANIFEST_PATTERN_CODE, full_path['path']):
skip_snyk_code = True
#print(manifests)
return manifests

state['tree_already_retrieved'] = True
return state['manifests']

def passes_manifest_filter(path, skip_snyk_code=False):
""" check if given path should be imported based
Expand Down
1 change: 1 addition & 0 deletions app/snyk_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def add_new_manifests(self, dry_run):
gh_repo_manifests = get_repo_manifests(self.full_name, self.origin, self.has_snyk_code())

for gh_repo_manifest in gh_repo_manifests:
#print(f"checking to import: {gh_repo_manifest}")
if gh_repo_manifest not in {sp['manifest'] for sp in self.snyk_projects}:
files.append(dict({"path": gh_repo_manifest}))

Expand Down
2 changes: 2 additions & 0 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
GITHUB_ENTERPRISE_TOKEN = getenv("GITHUB_ENTERPRISE_TOKEN")
GITHUB_ENTERPRISE_HOST = getenv("GITHUB_ENTERPRISE_HOST")

GIT_CLONE_TEMP_DIR = "/tmp"

LOG_PREFIX = "snyk-scm-refresh"
LOG_FILENAME = LOG_PREFIX + ".log"
POTENTIAL_DELETES_FILE = open("%s_potential-repo-deletes.csv" % LOG_PREFIX, "w")
Expand Down