Commit 07029ce
Add support to upload/download files to a (private) GitHub repo
MShekow committed Nov 10, 2024
1 parent a632f6b commit 07029ce
Showing 9 changed files with 771 additions and 92 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
@@ -28,3 +28,6 @@ jobs:
           source .venv/bin/activate
           poetry install --with test
           pytest tests
+        env:
+          GITHUB_INTEGRATION_TEST_PAT: ${{ secrets.GH_INTEGRATION_TEST_PAT }}
+          GITHUB_VALID_OWNER_REPO_BRANCH: "MShekow/ng-outlook-google-calendar-sync/github-integration-test"
1 change: 1 addition & 0 deletions calendar_sync_helper/constants.py
@@ -0,0 +1 @@
MAX_FILE_SIZE_LIMIT_BYTES = 5_000_000
91 changes: 91 additions & 0 deletions calendar_sync_helper/github_client.py
@@ -0,0 +1,91 @@
import base64
import re
from datetime import datetime
from typing import Tuple, Optional

import github
from github.Repository import Repository

from calendar_sync_helper.constants import MAX_FILE_SIZE_LIMIT_BYTES


class GitHubClient:
    def __init__(self, url: str, personal_access_token: str):
        self._owner, self._repo, self._branch, self._path = self._extract_github_credentials(url)
        self._github_client = github.Github(auth=github.Auth.Token(personal_access_token))

    def check_data_and_pat_validity(self):
        # Make an actual request to verify that the PAT (and other data) is valid
        self._github_client.get_repo(f"{self._owner}/{self._repo}")

    def upload_file(self, content: bytes):
        repository = self._github_client.get_repo(f"{self._owner}/{self._repo}")

        commit_message = f"Upload calendar data: {datetime.now().isoformat()}"

        if file_sha_and_size := self._get_sha_and_size_of_file(repository):
            file_sha, _size = file_sha_and_size
            repository.update_file(self._path, commit_message, content, file_sha, branch=self._branch)
        else:
            repository.create_file(self._path, commit_message, content, branch=self._branch)

    def download_file(self) -> bytes:
        """
        Downloads the file, or returns an empty bytes object if the file cannot be found. Raises if something
        unexpected goes wrong, or if the file is too big.
        """
        repository = self._github_client.get_repo(f"{self._owner}/{self._repo}")
        # Note: normally, we would call contents = repository.get_contents(self._path, ref=self._branch)
        # and then return "contents.decoded_content". But that did not work in our experiments. For instance, for
        # a binary file with the following content:
        # b'\x1ba_\x127\x18$Of\xb9\xa0\x8f\x07[\xa9N\xcf\xa5\xa5}-\xf1{\x04\xac\x8c\x96\rv\x9b\x9ed\xf7y\xf2U\x0e\t\xe3\xe0\xdeo\xb4\x0e\x8b\x8f\x99T\xd3\xa1\xc2|\xea\x0f\xe4\xc26\xa2\x1a@'
        # (or, base64-encoded: 'G2FfEjcYJE9muaCPB1upTs+lpX0t8XsErIyWDXabnmT3efJVDgnj4N5vtA6Lj5lU06HCfOoP5MI2ohpA')
        # the value of contents.decoded_content would be wrong (too long). The content's b64-encoded data would also
        # be different:
        # 'G2FfEjcYJE9mwrnCoMKPB1vCqU7Dj8KlwqV9LcOxewTCrMWS4oCTDXbigLrF\nvmTDt3nDslUOCcOjw6DDnm/CtA7igLnCj+KEolTDk8Khw4J8w6oPw6TDgjbC\nohpA\n'
        # For that reason, we instead use the repository.get_git_blob() approach, which seems to work properly.
        file_sha_and_size = self._get_sha_and_size_of_file(repository)
        if not file_sha_and_size:
            return bytes()

        file_sha, file_size = file_sha_and_size
        if file_size > MAX_FILE_SIZE_LIMIT_BYTES:
            raise ValueError(f"Content is too large ({file_size} bytes)")

        blob = repository.get_git_blob(file_sha)
        return base64.b64decode(blob.raw_data["content"])

    def delete_file(self):
        # Only used by integration test code
        repository = self._github_client.get_repo(f"{self._owner}/{self._repo}")
        file_sha_and_size = self._get_sha_and_size_of_file(repository)

        if not file_sha_and_size:
            raise FileNotFoundError(f"File {self._path} not found in the repository.")

        file_sha, _ = file_sha_and_size
        commit_message = f"Delete calendar data: {datetime.now().isoformat()}"
        repository.delete_file(self._path, commit_message, file_sha, branch=self._branch)

    @staticmethod
    def _extract_github_credentials(url: str) -> Tuple[str, str, str, str]:
        pattern = r"https://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/(?P<branch>[^/]+)/(?P<path>.+)"
        match = re.match(pattern, url)

        if match:
            owner = match.group("owner")
            repo = match.group("repo")
            branch = match.group("branch")
            path = match.group("path")
            return owner, repo, branch, path
        else:
            raise ValueError("URL does not match the expected pattern: "
                             "https://github.com/<owner>/<repo>/<branch>/<path>")

    def _get_sha_and_size_of_file(self, repository: Repository) -> Optional[Tuple[str, int]]:
        branch_ref = repository.get_git_ref(f"heads/{self._branch}")
        base_tree = repository.get_git_tree(branch_ref.object.sha, recursive=True)

        for elem in base_tree.tree:
            if elem.path == self._path:
                return elem.sha, elem.size
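
A minimal usage sketch of the class above, assuming a personal access token with write access to the target repository; the URL, token source, and payload below are placeholders following the https://github.com/<owner>/<repo>/<branch>/<path> pattern that _extract_github_credentials() expects, not values from this commit:

import os

from calendar_sync_helper.github_client import GitHubClient

# Placeholder URL and PAT - any repository/branch/path the token can write to works
client = GitHubClient(url="https://github.com/some-owner/some-repo/main/calendar-data.bin",
                      personal_access_token=os.environ["GITHUB_PAT"])
client.check_data_and_pat_validity()  # raises github.GithubException for a bad PAT or owner/repo

client.upload_file(b"some binary payload")  # creates the file, or updates it if it already exists
assert client.download_file() == b"some binary payload"
client.delete_file()  # intended for integration-test cleanup only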
105 changes: 26 additions & 79 deletions calendar_sync_helper/routers/router_v1.py
@@ -2,23 +2,20 @@
 from copy import copy
 from typing import Annotated
 
-import httpx
 import validators
 from cryptography.exceptions import InvalidTag
 from fastapi import Header, HTTPException, APIRouter
-from fastapi.encoders import jsonable_encoder
 
-from calendar_sync_helper.cryptography_utils import decrypt, encrypt
+from calendar_sync_helper.cryptography_utils import decrypt
 from calendar_sync_helper.entities.entities_v1 import CalendarEventList, OutlookCalendarEvent, AbstractCalendarEvent, \
     ComputeActionsInput, GoogleCalendarEvent, ComputeActionsResponse
 from calendar_sync_helper.utils import is_syncblocker_event, separate_syncblocker_events, get_id_from_attendees, \
     build_syncblocker_attendees, get_syncblocker_title, fix_outlook_specific_field_defaults, get_boolean_header_value, \
-    is_valid_sync_prefix, clean_id, filter_outdated_events, has_matching_title
+    is_valid_sync_prefix, clean_id, filter_outdated_events, has_matching_title, download_file_contents, \
+    upload_file_contents
 
 router = APIRouter()
 
-MAX_FILE_SIZE_LIMIT_BYTES = 5_000_000
-
 
 # Note: FastAPI behavior for headers that are set by the client, but contain no value (other than 0 or more spaces):
 # FastAPI sets the header value to an empty string!
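
The note above about empty header values can be seen with a small, self-contained sketch; the router and header name here are made up for illustration and are not part of this commit:

from typing import Annotated, Optional

from fastapi import APIRouter, Header

demo_router = APIRouter()  # hypothetical demo router, not part of the real codebase


@demo_router.get("/header-demo")
async def header_demo(x_example: Annotated[Optional[str], Header()] = None):
    # Omitting the X-Example header yields None; sending it with an empty (or whitespace-only)
    # value yields "" instead, which is why the endpoints in this file check for falsy header
    # values ("if not ...") rather than comparing against None only.
    return {"received": x_example}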
@@ -45,55 +42,29 @@ async def retrieve_calendar_file_proxy(
     if not validators.url(x_file_location) or not x_file_location.startswith("http"):
         raise HTTPException(status_code=400, detail="Invalid file location, must be a valid http(s) URL")
 
+    binary_data, encoding_hint = await download_file_contents(x_file_location, x_auth_header_name, x_auth_header_value)
+
+    if x_data_encryption_password:
+        try:
+            file_as_text = decrypt(binary_data, password=x_data_encryption_password)
+        except InvalidTag:
+            raise HTTPException(status_code=400, detail="Unable to decrypt data, either wrong password "
+                                                        "or data was manipulated")
+        except Exception as e:
+            raise HTTPException(status_code=400, detail=f"Unable to decrypt data, unexpected error "
+                                                        f"occurred: {e!r}")
+    else:
+        try:
+            file_as_text = binary_data.decode(encoding_hint or "utf-8")
+        except UnicodeDecodeError:
+            raise HTTPException(status_code=400, detail="Unable to decode binary response data to text")
+
     try:
-        async with httpx.AsyncClient() as client:
-            headers = {
-                x_auth_header_name: x_auth_header_value
-            }
-            async with client.stream("GET", x_file_location, headers=headers, follow_redirects=True) as response:
-                if response.status_code != 200:
-                    raise HTTPException(status_code=400, detail=f"Failed to retrieve file, "
-                                                                f"response status: {response.status_code}")
-
-                content_length = response.headers.get("Content-Length")
-                if content_length and int(content_length) > MAX_FILE_SIZE_LIMIT_BYTES:
-                    raise HTTPException(status_code=413, detail="File size exceeds maximum size limit")
-
-                # Validate that the response content is a valid JSON
-                try:
-                    await response.aread()
-                except Exception as e:
-                    raise HTTPException(status_code=400, detail=f"Unable to read data stream from "
-                                                                f"file location: {e!r}")
-
-                if x_data_encryption_password:
-                    try:
-                        file_as_text = decrypt(response.content, password=x_data_encryption_password)
-                    except InvalidTag:
-                        raise HTTPException(status_code=400, detail="Unable to decrypt data, either wrong password "
-                                                                    "or data was manipulated")
-                    except Exception as e:
-                        raise HTTPException(status_code=400, detail=f"Unable to decrypt data, unexpected error "
-                                                                    f"occurred: {e!r}")
-                else:
-                    try:
-                        # Note: just calling response.json() may try to use an incorrect decoding,
-                        # e.g. utf-8 where another one must be used --> response.text takes care of the proper decoding
-                        file_as_text = response.text
-                    except Exception as e:
-                        raise HTTPException(status_code=400, detail=f"Unable to decode binary response data to "
-                                                                    f"text: {e!r}")
-
-                try:
-                    json_content = json.loads(file_as_text)
-                except Exception as e:
-                    raise HTTPException(status_code=400, detail=f"Failed to parse JSON content: {e!r}")
-
-                return json_content
+        json_content = json.loads(file_as_text)
     except Exception as e:
-        if type(e) == HTTPException:
-            raise e
-        raise HTTPException(status_code=400, detail=f"Failed to retrieve file: {e!r}")
+        raise HTTPException(status_code=400, detail=f"Failed to parse JSON content: {e!r}")
 
+    return json_content
 
+
 @router.post("/extract-events")
@@ -167,32 +138,8 @@ async def extract_events(
     if not x_upload_http_method or x_upload_http_method.lower() not in ["put", "post"]:
         raise HTTPException(status_code=400, detail="Invalid upload method, must be PUT or POST")
 
-    try:
-        async with httpx.AsyncClient() as client:
-            headers = dict()
-            if x_auth_header_name and x_auth_header_value:
-                headers[x_auth_header_name] = x_auth_header_value
-
-            events_as_json_dict = jsonable_encoder(events)
-
-            if x_data_encryption_password:
-                encrypted_content = encrypt(plaintext=json.dumps(events_as_json_dict),
-                                            password=x_data_encryption_password)
-                json_data = None
-            else:
-                encrypted_content = None
-                json_data = events_as_json_dict
-
-            response = await client.request(x_upload_http_method, url=x_file_location, headers=headers,
-                                            json=json_data, content=encrypted_content)
-
-            if response.status_code < 200 or response.status_code > 204:
-                raise HTTPException(status_code=400, detail=f"Failed to upload file, "
-                                                            f"response status: {response.status_code}")
-    except Exception as e:
-        if type(e) == HTTPException:
-            raise e
-        raise HTTPException(status_code=400, detail=f"Failed to upload file: {e!r}")
+    await upload_file_contents(events, x_file_location, x_upload_http_method, x_auth_header_name,
+                               x_auth_header_value, x_data_encryption_password)
 
     return events
 
128 changes: 127 additions & 1 deletion calendar_sync_helper/utils.py
@@ -1,11 +1,18 @@
 import json
 import re
 from datetime import UTC, datetime
-from typing import Optional
+from typing import Optional, Tuple
 
+import httpx
+from fastapi import HTTPException
+from fastapi.encoders import jsonable_encoder
+from github import GithubException
 
+from calendar_sync_helper.constants import MAX_FILE_SIZE_LIMIT_BYTES
+from calendar_sync_helper.cryptography_utils import encrypt
 from calendar_sync_helper.entities.entities_v1 import ImplSpecificEvent, GoogleCalendarEvent, AbstractCalendarEvent, \
     ComputeActionsInput
+from calendar_sync_helper.github_client import GitHubClient
 
 
 def _get_actual_utc_datetime() -> datetime:
@@ -172,3 +179,122 @@ def filter_outdated_events(input_data: ComputeActionsInput):
    input_data.cal1events = \
        [e for e in input_data.cal1events if AbstractCalendarEvent.from_implementation(e).start >= now]
    input_data.cal2events = [e for e in input_data.cal2events if e.start >= now]


async def download_file_contents(x_file_location: str, x_auth_header_name: str,
                                 x_auth_header_value: str) -> Tuple[bytes, Optional[str]]:
    """
    Downloads the data and returns it as raw bytes, with an optional hint of the encoding (e.g. "utf-8").
    Raises HTTPException if something goes wrong.
    """
    try:
        if x_file_location.startswith("https://github.com/"):
            try:
                github_client = GitHubClient(url=x_file_location, personal_access_token=x_auth_header_value)
            except ValueError as e:  # GitHub URL is malformed
                raise HTTPException(status_code=400, detail=f"Failed to retrieve file: {e}")

            try:
                github_client.check_data_and_pat_validity()
            except GithubException as e:
                raise HTTPException(status_code=400, detail=f"Failed to retrieve file: invalid GitHub PAT or owner/repo "
                                                            f"was provided. Status {e.status} was returned, with "
                                                            f"message '{e.message}'")
            except Exception as e:
                raise HTTPException(status_code=400, detail=f"Failed to retrieve file: unexpected error while checking "
                                                            f"GitHub data and PAT: {e!r}")

            try:
                binary_data = github_client.download_file()  # the file at <path> might not exist
            except GithubException as e:
                raise HTTPException(status_code=400, detail=f"Failed to retrieve file: downloading file from GitHub "
                                                            f"failed: {e.message or e.status}")
            except Exception as e:
                raise HTTPException(status_code=400, detail=f"Failed to retrieve file: unexpected error occurred while "
                                                            f"downloading file from GitHub: {e!r}")

            if not binary_data:
                raise HTTPException(status_code=400, detail=f"Failed to retrieve file: downloading file from GitHub "
                                                            f"failed, it does not exist")

            return binary_data, None
        else:
            async with httpx.AsyncClient() as client:
                headers = {
                    x_auth_header_name: x_auth_header_value
                }
                async with client.stream("GET", x_file_location, headers=headers, follow_redirects=True) as response:
                    if response.status_code != 200:
                        raise HTTPException(status_code=400, detail=f"Failed to retrieve file, "
                                                                    f"response status: {response.status_code}")

                    content_length = response.headers.get("Content-Length")
                    if content_length and int(content_length) > MAX_FILE_SIZE_LIMIT_BYTES:
                        raise HTTPException(status_code=413, detail="File size exceeds maximum size limit")

                    # Read the full response stream (the caller validates that it is valid JSON)
                    try:
                        await response.aread()
                    except Exception as e:
                        raise HTTPException(status_code=400, detail=f"Unable to read data stream from "
                                                                    f"file location: {e!r}")

                    return response.content, response.encoding
    except Exception as e:
        if type(e) == HTTPException:
            raise e
        raise HTTPException(status_code=400, detail=f"Failed to retrieve file: {e!r}")


async def upload_file_contents(events: list[AbstractCalendarEvent], x_file_location: str, x_upload_http_method: str,
                               x_auth_header_name: Optional[str], x_auth_header_value: Optional[str],
                               x_data_encryption_password: Optional[str]):
    try:
        events_as_json_dict = jsonable_encoder(events)

        if x_data_encryption_password:
            content = encrypt(plaintext=json.dumps(events_as_json_dict),
                              password=x_data_encryption_password)
        else:
            content = json.dumps(events_as_json_dict).encode("utf-8")

        if x_file_location.startswith("https://github.com/"):
            try:
                github_client = GitHubClient(url=x_file_location, personal_access_token=x_auth_header_value)
            except ValueError as e:  # GitHub URL is malformed
                raise HTTPException(status_code=400, detail=f"Failed to upload file: {e}")

            try:
                github_client.check_data_and_pat_validity()
            except GithubException as e:
                raise HTTPException(status_code=400,
                                    detail=f"Failed to upload file: invalid GitHub PAT or owner/repo "
                                           f"was provided. Status {e.status} was returned, with "
                                           f"message '{e.message}'")
            except Exception as e:
                raise HTTPException(status_code=400, detail=f"Failed to upload file: unexpected error while checking "
                                                            f"GitHub data and PAT: {e!r}")

            try:
                github_client.upload_file(content)
            except GithubException as e:
                raise HTTPException(status_code=400, detail=f"Failed to upload file: {e.message}")
            except Exception as e:
                raise HTTPException(status_code=400, detail=f"Failed to upload file: unexpected error occurred while "
                                                            f"uploading file to GitHub: {e!r}")
        else:
            async with httpx.AsyncClient() as client:
                headers = dict()
                if x_auth_header_name and x_auth_header_value:
                    headers[x_auth_header_name] = x_auth_header_value

                response = await client.request(x_upload_http_method, url=x_file_location, headers=headers,
                                                content=content)

                if response.status_code < 200 or response.status_code > 204:
                    raise HTTPException(status_code=400, detail=f"Failed to upload file, "
                                                                f"response status: {response.status_code}")
    except Exception as e:
        if type(e) == HTTPException:
            raise e
        raise HTTPException(status_code=400, detail=f"Failed to upload file: {e!r}")
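
A sketch of how the two helpers above might be exercised outside the router, under the assumption that the file location, auth header name, and token are placeholders; for GitHub URLs, the x_auth_header_value argument is what ends up being used as the personal access token:

from calendar_sync_helper.utils import download_file_contents, upload_file_contents


async def round_trip() -> bytes:
    # URLs starting with https://github.com/ take the GitHubClient code path above;
    # any other http(s) URL goes through the generic httpx branch.
    file_location = "https://github.com/some-owner/some-repo/main/calendar-data.bin"  # placeholder
    await upload_file_contents([], file_location, "put",
                               x_auth_header_name="Authorization",  # ignored for GitHub URLs
                               x_auth_header_value="<PAT or bearer token>",
                               x_data_encryption_password=None)
    binary_data, _encoding_hint = await download_file_contents(file_location, "Authorization",
                                                               "<PAT or bearer token>")
    return binary_data  # run with e.g. asyncio.run(round_trip()); performs real network calls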