Merge pull request #112 from sei-protocol/workflows-1
Improved URL checking workflow
cordt-sei authored Aug 8, 2024
2 parents 8042ca9 + dd6d16f commit 21fb412
Showing 2 changed files with 104 additions and 43 deletions.
47 changes: 40 additions & 7 deletions .github/workflows/url_check.yml
@@ -2,14 +2,9 @@ name: Broken URL Check

on:
  schedule:
-    # Runs every Monday at 00:00 UTC
-    - cron: '0 0 * * 1'
+    - cron: '0 0 * * 1' # Runs every Monday at 00:00 UTC
  workflow_dispatch: # Allows manual triggering of the workflow

-defaults:
-  run:
-    shell: bash

jobs:
  url-check:
    runs-on: ubuntu-latest
@@ -29,5 +24,43 @@ jobs:
          pip install requests
      - name: Run link checker
        id: run_checker
        run: |
-          python scripts/urlcheck.py
+          output=$(python scripts/urlcheck.py)
+          echo "checker_output<<EOF" >> $GITHUB_OUTPUT
+          echo "$output" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+        env:
+          CHECK_PATH: './pages/'

+      - name: Create Issue with Results
+        if: always()
+        uses: actions/github-script@v6
+        env:
+          CHECKER_OUTPUT: ${{ steps.run_checker.outputs.checker_output }}
+        with:
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            const output = JSON.parse(process.env.CHECKER_OUTPUT);
+            const issueTitle = output.status === "issues_found"
+              ? `Broken URLs Detected: ${output.total_issues} issues found`
+              : "URL Check Completed - No Issues Found";
+            let issueBody = `# URL Check Results\n\n`;
+            if (output.status === "issues_found") {
+              issueBody += `## Issues Found: ${output.total_issues}\n\n`;
+              output.issues.forEach(issue => {
+                issueBody += `- File: ${issue.file}, Line: ${issue.line}\n`;
+                issueBody += ` URL: ${issue.url}\n`;
+                issueBody += ` Status Code: ${issue.status_code}, Reason: ${issue.reason}\n`;
+                issueBody += ` Final URL: ${issue.final_url}\n\n`;
+              });
+            } else {
+              issueBody += "No broken URLs detected in this check.";
+            }
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: issueTitle,
+              body: issueBody,
+              labels: ['url-check']
+            });
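For a quick local check of the contract between the checker script and the issue-creation step, the following rough Python sketch parses the same JSON shape that the workflow reads from the checker_output step output and assembles an equivalent issue title and body. It is not part of the commit; build_issue and the sample payload values are illustrative.

import json

# Sample payload in the shape generate_report() prints and the workflow parses.
sample_output = json.dumps({
    "status": "issues_found",
    "total_issues": 1,
    "issues": [{
        "file": "./pages/example.mdx", "line": 12,
        "url": "https://example.com/missing", "status_code": 404,
        "reason": "Not Found", "final_url": "https://example.com/missing"
    }]
})

def build_issue(raw):
    # Mirror the github-script step: title depends on status, body lists each issue.
    output = json.loads(raw)
    if output["status"] == "issues_found":
        title = f"Broken URLs Detected: {output['total_issues']} issues found"
        body = "# URL Check Results\n\n" + f"## Issues Found: {output['total_issues']}\n\n"
        for issue in output["issues"]:
            body += (f"- File: {issue['file']}, Line: {issue['line']}\n"
                     f"  URL: {issue['url']}\n"
                     f"  Status Code: {issue['status_code']}, Reason: {issue['reason']}\n"
                     f"  Final URL: {issue['final_url']}\n\n")
    else:
        title = "URL Check Completed - No Issues Found"
        body = "# URL Check Results\n\nNo broken URLs detected in this check."
    return title, body

title, body = build_issue(sample_output)
print(title)
print(body)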
100 changes: 64 additions & 36 deletions scripts/urlcheck.py
@@ -1,60 +1,88 @@
import os
import re
import requests
import socket
+import json
+from urllib.parse import urlparse
+from concurrent.futures import ThreadPoolExecutor, as_completed

+INTERNAL_404_URL = "https://github.com/sei-protocol/sei-docs/blob/main/pages/404.mdx"
+MAX_WORKERS = 5 # Adjust based on your needs and GitHub Actions limitations

def check_url_status(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
-        return response.status_code, response.reason
+        return response.status_code, response.reason, response.url
    except requests.RequestException as e:
-        return None, str(e)
+        return None, str(e), None

def find_urls(text):
    # Only match valid URLs starting with http:// or https://
    url_pattern = re.compile(r'https?://[^\s"\'<>\)]*')
    return url_pattern.findall(text)

def is_valid_url(url):
    try:
-        domain = re.findall(r'://([^/]+)', url)[0]
-        socket.gethostbyname(domain) # Check if domain resolves to an IP
-        return True
-    except (socket.gaierror, IndexError):
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except ValueError:
        return False

+def process_file(file_path):
+    file_report = []
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line_number, line in enumerate(f, 1):
+                urls = find_urls(line)
+                for url in urls:
+                    if is_valid_url(url):
+                        status_code, reason, final_url = check_url_status(url)
+                        if status_code and (status_code not in {200, 403, 415} or final_url == INTERNAL_404_URL):
+                            file_report.append({
+                                'file': file_path,
+                                'line': line_number,
+                                'url': url,
+                                'status_code': status_code,
+                                'reason': reason,
+                                'final_url': final_url
+                            })
+    except IOError as e:
+        print(f"Error reading file {file_path}: {str(e)}")
+    return file_report

def check_files_in_directory(directory):
-    report = []

-    for root, _, files in os.walk(directory):
-        for file in files:
-            if file.endswith(('.md', '.mdx')): # Check both .md and .mdx files
-                file_path = os.path.join(root, file)
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    for line_number, line in enumerate(f, 1):
-                        urls = find_urls(line)
-                        for url in urls:
-                            if is_valid_url(url):
-                                status_code, reason = check_url_status(url)
-                                # Exclude specific status codes from report
-                                if status_code and status_code not in {200, 403, 415}:
-                                    report.append({
-                                        'file': file_path,
-                                        'line': line_number,
-                                        'url': url,
-                                        'status_code': status_code,
-                                        'reason': reason
-                                    })
-    return report
+    all_reports = []
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        future_to_file = {}
+        for root, _, files in os.walk(directory):
+            for file in files:
+                if file.endswith(('.md', '.mdx')):
+                    file_path = os.path.join(root, file)
+                    future = executor.submit(process_file, file_path)
+                    future_to_file[future] = file_path
+        for future in as_completed(future_to_file):
+            file_path = future_to_file[future]
+            try:
+                report = future.result()
+                all_reports.extend(report)
+            except Exception as exc:
+                print(f'{file_path} generated an exception: {exc}')
+    return all_reports

def generate_report(report):
-    for item in report:
-        print(f"File: {item['file']}, Line: {item['line']}")
-        print(f"URL: {item['url']}")
-        print(f"Status Code: {item['status_code']}, Reason: {item['reason']}")
-        print("-" * 40)
+    output = {}
+    if report:
+        output["status"] = "issues_found"
+        output["total_issues"] = len(report)
+        output["issues"] = report
+    else:
+        output["status"] = "no_issues_found"
+        output["total_issues"] = 0

+    print(json.dumps(output, indent=2))

if __name__ == "__main__":
-    check_path = './pages/' # path to check
+    check_path = os.environ.get('CHECK_PATH', './pages/')
    report = check_files_in_directory(check_path)
    generate_report(report)

+    # Set exit code for GitHub Actions
+    exit(len(report)) # Exit code is the number of issues found
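To try the refactored checker locally against a docs checkout, something like the sketch below should work. It assumes the script lives at scripts/urlcheck.py and the content sits under ./pages/, as in the workflow; the subprocess wrapper itself is illustrative and not part of the commit. Note that the script's exit code equals the number of issues found, so a non-zero return code is expected whenever problems are reported.

import json
import os
import subprocess

# Point the checker at the directory to scan, mirroring the workflow's CHECK_PATH env var.
env = dict(os.environ, CHECK_PATH="./pages/")

# Run the checker and capture the JSON report it prints to stdout.
proc = subprocess.run(
    ["python", "scripts/urlcheck.py"],
    env=env, capture_output=True, text=True,
)

report = json.loads(proc.stdout)
print(f"status={report['status']} total_issues={report['total_issues']} exit_code={proc.returncode}")
for issue in report.get("issues", []):
    print(f"{issue['file']}:{issue['line']} {issue['url']} -> {issue['status_code']} ({issue['reason']})")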
