Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add github actions workflow to crawl docs pages for broken URLs #105

Merged
merged 3 commits
Jul 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .github/workflows/url_check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
name: Broken URL Check

on:
  schedule:
    # Runs every Monday at 00:00 UTC
    - cron: '0 0 * * 1'
  workflow_dispatch: # Allows manual triggering of the workflow

defaults:
  run:
    shell: bash

jobs:
  url-check:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        # v4 runs on Node 20; v3 (Node 16) is deprecated by GitHub.
        uses: actions/checkout@v4

      - name: Set up Python
        # v5 runs on Node 20; v4 (Node 16) is deprecated by GitHub.
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests

      - name: Run link checker
        run: |
          python scripts/urlcheck.py
60 changes: 60 additions & 0 deletions scripts/urlcheck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import os
import re
import socket
from urllib.parse import urlparse

import requests

def check_url_status(url):
    """Return ``(status_code, reason)`` for *url*, or ``(None, error)`` on failure.

    A ``HEAD`` request is tried first to avoid downloading the body. Some
    servers reject ``HEAD`` (405 Method Not Allowed / 501 Not Implemented)
    for pages that load fine in a browser, so we retry with ``GET`` in that
    case to avoid false "broken link" reports.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        if response.status_code in {405, 501}:
            # Server disallows HEAD; retry with GET. stream=True avoids
            # downloading the full body just to read the status line.
            response = requests.get(url, allow_redirects=True, timeout=5,
                                    stream=True)
        return response.status_code, response.reason
    except requests.RequestException as e:
        # Connection errors, timeouts, invalid URLs, etc.
        return None, str(e)

def find_urls(text):
    """Return all http(s) URLs found in *text*.

    The pattern stops at whitespace, quotes, angle brackets, and closing
    parentheses (the latter so Markdown links like ``[x](http://y)``
    terminate correctly). Trailing sentence punctuation such as ``.`` or
    ``,`` would otherwise be captured as part of the URL and produce false
    "broken link" reports, so it is stripped from each match.
    """
    url_pattern = re.compile(r'https?://[^\s"\'<>\)]*')
    return [url.rstrip('.,;:!?') for url in url_pattern.findall(text)]

def is_valid_url(url):
    """Return True if the URL's host resolves via DNS, else False.

    Parses the URL with :func:`urllib.parse.urlparse` so that a port suffix
    (``:8080``) or userinfo never reaches the resolver -- the previous
    regex-based extraction passed ``host:port`` to ``gethostbyname``,
    which always fails and silently skipped valid URLs.
    """
    try:
        host = urlparse(url).hostname
        if not host:
            # No host component (e.g. "http://") -- nothing to resolve.
            return False
        socket.gethostbyname(host)  # Check if domain resolves to an IP
        return True
    except (socket.gaierror, ValueError):
        return False

def check_files_in_directory(directory):
    """Scan Markdown files under *directory* and report broken URLs.

    Walks the tree rooted at *directory*, extracts URLs from every ``.md``
    and ``.mdx`` file line by line, and returns a list of dicts
    (``file``, ``line``, ``url``, ``status_code``, ``reason``) for each URL
    whose HTTP status indicates a problem. Each distinct URL is fetched at
    most once; repeated occurrences reuse the cached result.
    """
    report = []
    url_cache = {}  # url -> (status_code, reason); avoids duplicate requests

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(('.md', '.mdx')):  # Check both .md and .mdx files
                continue
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_number, line in enumerate(f, 1):
                    for url in find_urls(line):
                        if not is_valid_url(url):
                            continue
                        if url not in url_cache:
                            url_cache[url] = check_url_status(url)
                        status_code, reason = url_cache[url]
                        # Exclude specific status codes from report:
                        # 403/415 are commonly returned to non-browser
                        # clients for pages that load fine interactively.
                        if status_code and status_code not in {200, 403, 415}:
                            report.append({
                                'file': file_path,
                                'line': line_number,
                                'url': url,
                                'status_code': status_code,
                                'reason': reason
                            })
    return report

def generate_report(report):
    """Print each broken-URL entry in a human-readable, separated form."""
    separator = "-" * 40
    for entry in report:
        print(f"File: {entry['file']}, Line: {entry['line']}")
        print(f"URL: {entry['url']}")
        print(f"Status Code: {entry['status_code']}, Reason: {entry['reason']}")
        print(separator)

if __name__ == "__main__":
    # Entry point: scan the docs tree and print any broken URLs found.
    target_directory = './pages/'  # path to check
    generate_report(check_files_in_directory(target_directory))
Loading