Merge pull request #112 from sei-protocol/workflows-1
Improved URL checking workflow
cordt-sei authored Aug 8, 2024
2 parents 8042ca9 + dd6d16f commit 21fb412
Showing 2 changed files with 104 additions and 43 deletions.
47 changes: 40 additions & 7 deletions .github/workflows/url_check.yml
@@ -2,14 +2,9 @@ name: Broken URL Check

on:
  schedule:
-    # Runs every Monday at 00:00 UTC
-    - cron: '0 0 * * 1'
+    - cron: '0 0 * * 1' # Runs every Monday at 00:00 UTC
  workflow_dispatch: # Allows manual triggering of the workflow

-defaults:
-  run:
-    shell: bash

jobs:
  url-check:
    runs-on: ubuntu-latest
@@ -29,5 +24,43 @@ jobs:
          pip install requests
      - name: Run link checker
        id: run_checker
        run: |
-          python scripts/urlcheck.py
+          output=$(python scripts/urlcheck.py)
+          echo "checker_output<<EOF" >> $GITHUB_OUTPUT
+          echo "$output" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+        env:
+          CHECK_PATH: './pages/'

+      - name: Create Issue with Results
+        if: always()
+        uses: actions/github-script@v6
+        env:
+          CHECKER_OUTPUT: ${{ steps.run_checker.outputs.checker_output }}
+        with:
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            const output = JSON.parse(process.env.CHECKER_OUTPUT);
+            const issueTitle = output.status === "issues_found"
+              ? `Broken URLs Detected: ${output.total_issues} issues found`
+              : "URL Check Completed - No Issues Found";
+            let issueBody = `# URL Check Results\n\n`;
+            if (output.status === "issues_found") {
+              issueBody += `## Issues Found: ${output.total_issues}\n\n`;
+              output.issues.forEach(issue => {
+                issueBody += `- File: ${issue.file}, Line: ${issue.line}\n`;
+                issueBody += ` URL: ${issue.url}\n`;
+                issueBody += ` Status Code: ${issue.status_code}, Reason: ${issue.reason}\n`;
+                issueBody += ` Final URL: ${issue.final_url}\n\n`;
+              });
+            } else {
+              issueBody += "No broken URLs detected in this check.";
+            }
+            await github.rest.issues.create({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              title: issueTitle,
+              body: issueBody,
+              labels: ['url-check']
+            });
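For a quick local check of the contract between the checker script and the issue-creation step, the following rough Python sketch parses the same JSON shape that the workflow reads from the checker_output step output and assembles an equivalent issue title and body. It is not part of the commit; build_issue and the sample payload values are illustrative.

import json

# Sample payload in the shape generate_report() prints and the workflow parses.
sample_output = json.dumps({
    "status": "issues_found",
    "total_issues": 1,
    "issues": [{
        "file": "./pages/example.mdx", "line": 12,
        "url": "https://example.com/missing", "status_code": 404,
        "reason": "Not Found", "final_url": "https://example.com/missing"
    }]
})

def build_issue(raw):
    # Mirror the github-script step: title depends on status, body lists each issue.
    output = json.loads(raw)
    if output["status"] == "issues_found":
        title = f"Broken URLs Detected: {output['total_issues']} issues found"
        body = "# URL Check Results\n\n" + f"## Issues Found: {output['total_issues']}\n\n"
        for issue in output["issues"]:
            body += (f"- File: {issue['file']}, Line: {issue['line']}\n"
                     f"  URL: {issue['url']}\n"
                     f"  Status Code: {issue['status_code']}, Reason: {issue['reason']}\n"
                     f"  Final URL: {issue['final_url']}\n\n")
    else:
        title = "URL Check Completed - No Issues Found"
        body = "# URL Check Results\n\nNo broken URLs detected in this check."
    return title, body

title, body = build_issue(sample_output)
print(title)
print(body)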
100 changes: 64 additions & 36 deletions scripts/urlcheck.py
@@ -1,60 +1,88 @@
import os
import re
import requests
import socket
+import json
+from urllib.parse import urlparse
+from concurrent.futures import ThreadPoolExecutor, as_completed

+INTERNAL_404_URL = "https://github.com/sei-protocol/sei-docs/blob/main/pages/404.mdx"
+MAX_WORKERS = 5 # Adjust based on your needs and GitHub Actions limitations

def check_url_status(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
-        return response.status_code, response.reason
+        return response.status_code, response.reason, response.url
    except requests.RequestException as e:
-        return None, str(e)
+        return None, str(e), None

def find_urls(text):
    # Only match valid URLs starting with http:// or https://
    url_pattern = re.compile(r'https?://[^\s"\'<>\)]*')
    return url_pattern.findall(text)

def is_valid_url(url):
    try:
-        domain = re.findall(r'://([^/]+)', url)[0]
-        socket.gethostbyname(domain) # Check if domain resolves to an IP
-        return True
-    except (socket.gaierror, IndexError):
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except ValueError:
        return False

+def process_file(file_path):
+    file_report = []
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line_number, line in enumerate(f, 1):
+                urls = find_urls(line)
+                for url in urls:
+                    if is_valid_url(url):
+                        status_code, reason, final_url = check_url_status(url)
+                        if status_code and (status_code not in {200, 403, 415} or final_url == INTERNAL_404_URL):
+                            file_report.append({
+                                'file': file_path,
+                                'line': line_number,
+                                'url': url,
+                                'status_code': status_code,
+                                'reason': reason,
+                                'final_url': final_url
+                            })
+    except IOError as e:
+        print(f"Error reading file {file_path}: {str(e)}")
+    return file_report

def check_files_in_directory(directory):
-    report = []

-    for root, _, files in os.walk(directory):
-        for file in files:
-            if file.endswith(('.md', '.mdx')): # Check both .md and .mdx files
-                file_path = os.path.join(root, file)
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    for line_number, line in enumerate(f, 1):
-                        urls = find_urls(line)
-                        for url in urls:
-                            if is_valid_url(url):
-                                status_code, reason = check_url_status(url)
-                                # Exclude specific status codes from report
-                                if status_code and status_code not in {200, 403, 415}:
-                                    report.append({
-                                        'file': file_path,
-                                        'line': line_number,
-                                        'url': url,
-                                        'status_code': status_code,
-                                        'reason': reason
-                                    })
-    return report
+    all_reports = []
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        future_to_file = {}
+        for root, _, files in os.walk(directory):
+            for file in files:
+                if file.endswith(('.md', '.mdx')):
+                    file_path = os.path.join(root, file)
+                    future = executor.submit(process_file, file_path)
+                    future_to_file[future] = file_path
+        for future in as_completed(future_to_file):
+            file_path = future_to_file[future]
+            try:
+                report = future.result()
+                all_reports.extend(report)
+            except Exception as exc:
+                print(f'{file_path} generated an exception: {exc}')
+    return all_reports

def generate_report(report):
-    for item in report:
-        print(f"File: {item['file']}, Line: {item['line']}")
-        print(f"URL: {item['url']}")
-        print(f"Status Code: {item['status_code']}, Reason: {item['reason']}")
-        print("-" * 40)
+    output = {}
+    if report:
+        output["status"] = "issues_found"
+        output["total_issues"] = len(report)
+        output["issues"] = report
+    else:
+        output["status"] = "no_issues_found"
+        output["total_issues"] = 0

+    print(json.dumps(output, indent=2))

if __name__ == "__main__":
-    check_path = './pages/' # path to check
+    check_path = os.environ.get('CHECK_PATH', './pages/')
    report = check_files_in_directory(check_path)
    generate_report(report)

+    # Set exit code for GitHub Actions
+    exit(len(report)) # Exit code is the number of issues found
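To try the refactored checker locally against a docs checkout, something like the sketch below should work. It assumes the script lives at scripts/urlcheck.py and the content sits under ./pages/, as in the workflow; the subprocess wrapper itself is illustrative and not part of the commit. Note that the script's exit code equals the number of issues found, so a non-zero return code is expected whenever problems are reported.

import json
import os
import subprocess

# Point the checker at the directory to scan, mirroring the workflow's CHECK_PATH env var.
env = dict(os.environ, CHECK_PATH="./pages/")

# Run the checker and capture the JSON report it prints to stdout.
proc = subprocess.run(
    ["python", "scripts/urlcheck.py"],
    env=env, capture_output=True, text=True,
)

report = json.loads(proc.stdout)
print(f"status={report['status']} total_issues={report['total_issues']} exit_code={proc.returncode}")
for issue in report.get("issues", []):
    print(f"{issue['file']}:{issue['line']} {issue['url']} -> {issue['status_code']} ({issue['reason']})")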
