From f28bd772ec933f9519bb0d8703d5368473fac2f5 Mon Sep 17 00:00:00 2001
From: cordt-sei <165932662+cordt-sei@users.noreply.github.com>
Date: Thu, 25 Jul 2024 22:06:33 -0600
Subject: [PATCH 1/3] Create url_check.yml

Adds workflow for weekly broken URLs check
---
 .github/workflows/url_check.yml | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 .github/workflows/url_check.yml

diff --git a/.github/workflows/url_check.yml b/.github/workflows/url_check.yml
new file mode 100644
index 00000000..a7d88418
--- /dev/null
+++ b/.github/workflows/url_check.yml
@@ -0,0 +1,33 @@
+name: Broken URL Check
+
+on:
+  schedule:
+    # Runs every Monday at 00:00 UTC
+    - cron: '0 0 * * 1'
+  workflow_dispatch: # Allows manual triggering of the workflow
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  url-check:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install requests beautifulsoup4
+
+      - name: Run checker
+        run: |
+          python scripts/urlcheck.py

From 6e757852ca87776422b6dfcf45c215f27f9bdd4e Mon Sep 17 00:00:00 2001
From: cordt-sei <165932662+cordt-sei@users.noreply.github.com>
Date: Thu, 25 Jul 2024 22:21:23 -0600
Subject: [PATCH 2/3] Create urlcheck.py

Workflow/actions script to check for broken URLs on a regular schedule.
---
 scripts/urlcheck.py | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 scripts/urlcheck.py

diff --git a/scripts/urlcheck.py b/scripts/urlcheck.py
new file mode 100644
index 00000000..d4aed890
--- /dev/null
+++ b/scripts/urlcheck.py
@@ -0,0 +1,129 @@
+"""Report broken external links found in Markdown/MDX pages.
+
+Walks a directory tree, extracts every http(s) URL from ``.md`` and ``.mdx``
+files, requests each distinct URL once and prints the ones that answer with
+an unexpected HTTP status code.
+"""
+import os
+import re
+import socket
+from functools import lru_cache
+from urllib.parse import urlsplit
+
+import requests
+
+# A URL runs until whitespace, a quote, an angle bracket, ")" or a backtick,
+# i.e. the characters that close a link in Markdown/MDX source.
+URL_PATTERN = re.compile(r'https?://[^\s"\'<>\)`]*')
+# Sentence/emphasis punctuation that URL_PATTERN swallows at the end of a link.
+TRAILING_PUNCTUATION = '.,;:!?*'
+# Status codes that are never reported.
+# NOTE(review): 403/415 are presumably sites that reject automated clients;
+# confirm before extending this set.
+IGNORED_STATUS_CODES = frozenset({200, 403, 415})
+# HEAD answers meaning "HEAD is not supported"; these are retried with GET.
+HEAD_UNSUPPORTED_CODES = frozenset({405, 501})
+REQUEST_TIMEOUT = 5  # seconds, applied to every HTTP request
+
+
+@lru_cache(maxsize=None)
+def check_url_status(url):
+    """Return ``(status_code, reason)`` for *url*, following redirects.
+
+    A HEAD request is tried first; servers that reject HEAD (405/501) are
+    asked again with GET so they are not reported as broken. On a network
+    error the result is ``(None, <error message>)``. Results are cached so
+    a URL that appears several times is only requested once.
+    """
+    try:
+        response = requests.head(
+            url, allow_redirects=True, timeout=REQUEST_TIMEOUT)
+        if response.status_code in HEAD_UNSUPPORTED_CODES:
+            # stream=True avoids downloading the body just for the status.
+            with requests.get(url, allow_redirects=True, stream=True,
+                              timeout=REQUEST_TIMEOUT) as response:
+                return response.status_code, response.reason
+        return response.status_code, response.reason
+    except requests.RequestException as e:
+        return None, str(e)
+
+
+def find_urls(text):
+    """Return every http:// or https:// URL in *text*, in order.
+
+    Trailing sentence punctuation is stripped so that a link at the end of
+    a sentence ("see https://example.com.") does not end in a dot.
+    """
+    return [url.rstrip(TRAILING_PUNCTUATION)
+            for url in URL_PATTERN.findall(text)]
+
+
+@lru_cache(maxsize=None)
+def is_valid_url(url):
+    """Return True if the host name of *url* resolves to an IP address."""
+    try:
+        # .hostname drops credentials and the port, which would otherwise
+        # make the DNS lookup fail for URLs such as http://host:8080/.
+        host = urlsplit(url).hostname
+    except ValueError:  # malformed authority, e.g. an unclosed "[" literal
+        return False
+    if not host:
+        return False
+    try:
+        socket.gethostbyname(host)
+    except (socket.gaierror, UnicodeError):  # unresolvable / not encodable
+        return False
+    return True
+
+
+def _iter_markdown_files(directory):
+    """Yield the path of every .md/.mdx file below *directory*."""
+    for root, _, files in os.walk(directory):
+        for name in files:
+            if name.endswith(('.md', '.mdx')):
+                yield os.path.join(root, name)
+
+
+def check_files_in_directory(directory):
+    """Check every URL in the Markdown/MDX files below *directory*.
+
+    Returns a list of dicts with the keys ``file``, ``line``, ``url``,
+    ``status_code`` and ``reason``, one per occurrence of a URL whose status
+    code is not in IGNORED_STATUS_CODES. URLs whose host does not resolve or
+    whose request fails without an HTTP status are skipped, not reported.
+    """
+    report = []
+    for file_path in _iter_markdown_files(directory):
+        # errors='replace' keeps one badly encoded page from aborting the run.
+        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+            for line_number, line in enumerate(f, 1):
+                for url in find_urls(line):
+                    if not is_valid_url(url):
+                        continue
+                    status_code, reason = check_url_status(url)
+                    if (status_code is None
+                            or status_code in IGNORED_STATUS_CODES):
+                        continue
+                    report.append({
+                        'file': file_path,
+                        'line': line_number,
+                        'url': url,
+                        'status_code': status_code,
+                        'reason': reason,
+                    })
+    return report
+
+
+def generate_report(report):
+    """Print one block per entry of *report* to standard output."""
+    for item in report:
+        print(f"File: {item['file']}, Line: {item['line']}")
+        print(f"URL: {item['url']}")
+        print(f"Status Code: {item['status_code']}, Reason: {item['reason']}")
+        print("-" * 40)
+
+
+if __name__ == "__main__":
+    check_path = './pages/'  # path to check
+    report = check_files_in_directory(check_path)
+    generate_report(report)

From 76d23108db122709901328f233daa3b68add758c Mon Sep 17 00:00:00 2001
From: cordt-sei <165932662+cordt-sei@users.noreply.github.com>
Date: Thu, 25 Jul 2024 22:22:42 -0600
Subject: [PATCH 3/3] Update url_check.yml

edit script name
---
 .github/workflows/url_check.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/url_check.yml b/.github/workflows/url_check.yml
index a7d88418..3c3bea2a 100644
--- a/.github/workflows/url_check.yml
+++ b/.github/workflows/url_check.yml
@@ -4,7 +4,7 @@ on:
   schedule:
     # Runs every Monday at 00:00 UTC
     - cron: '0 0 * * 1'
-  workflow_dispatch: # Allows manual triggering of the workflow
+  workflow_dispatch:  # Allows manual triggering of the workflow
 
 defaults:
   run:
@@ -26,8 +26,8 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install requests beautifulsoup4
+          pip install requests
 
-      - name: Run checker
+      - name: Run link checker
         run: |
           python scripts/urlcheck.py