First cut of Web Scraper App
JMkrish committed Oct 17, 2023
1 parent 4e778b8 commit 099549e
Showing 8 changed files with 229 additions and 2 deletions.
49 changes: 47 additions & 2 deletions README.md
@@ -1,2 +1,47 @@
# Web Scraper

A tool to recursively scrape web content.

## Introduction

This README provides essential information for setting up and running the Web Scraper application. Before you begin, make sure you have the necessary prerequisites in place.

## Prerequisites

Before you can run the Web Scraper application, you need to ensure you have the following prerequisites installed on your system:

- Python (3.10 or higher)
- pip (Python package manager)

To install the required Python packages, navigate to the project directory and run the following command:

```shell
pip install -r requirements.txt
```

Additionally, Web Scraper relies on the `wkhtmltopdf` tool for generating PDFs. If you are using Ubuntu, you can install it using the following command:

```shell
sudo apt-get install wkhtmltopdf
```
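
If `wkhtmltopdf` is installed somewhere outside your `PATH`, `pdfkit` can be pointed at the binary explicitly. A minimal sketch, with an assumed install location:

```python
import pdfkit

# The path below is an assumption; adjust it to wherever wkhtmltopdf is installed.
config = pdfkit.configuration(wkhtmltopdf="/usr/local/bin/wkhtmltopdf")
pdfkit.from_url("https://example.com", "example.pdf", configuration=config)
```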

## Usage

Once you have met all the prerequisites, you can run the Web Scraper application from the project directory:

```shell
python src/scraper.py <base_url>
```

Replace `<base_url>` with the URL you want to use as the starting point for your scraping task.

## Examples

Here are a few example commands for running the Web Scraper application:

```shell
python src/scraper.py https://example.com
python src/scraper.py https://anotherwebsite.com
```
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
selenium==4.14.0
beautifulsoup4==4.12.2
tqdm==4.66.1
pdfkit==1.0.0
requests==2.31.0
Empty file added src/__init__.py
Empty file.
44 changes: 44 additions & 0 deletions src/bs.py
@@ -0,0 +1,44 @@
import os
import requests
import pdfkit
from bs4 import BeautifulSoup
from common.utils import url_to_str


def scrape_root_url(start_url, dir_name):
    # Breadth-first crawl: visit every same-site page once, rendering each to a PDF.
    to_visit = [start_url]
    touched_urls = {start_url}

    while to_visit:
        link_to_visit = to_visit.pop(0)
        print(link_to_visit)
        response = requests.get(link_to_visit)
        source = BeautifulSoup(response.text, "html.parser")

        # Only follow links inside <main>; pages without one yield no new links.
        main = source.find("main")
        if main is not None:
            for link in main.find_all("a"):
                if not link.has_attr("href"):
                    continue
                href = link["href"]
                if len(href) > 1 and href[0] in ("#", "?"):
                    # Fragment or query string: a variant of the current page.
                    new_link = link_to_visit + href
                elif href.startswith("/"):
                    # Site-relative path: resolve against the base URL.
                    new_link = start_url + href
                else:
                    # Absolute and external links are intentionally skipped.
                    continue
                if new_link not in touched_urls:
                    to_visit.append(new_link)
                    touched_urls.add(new_link)

        # The start page itself maps to an empty endpoint; call it "index".
        end_point = link_to_visit.removeprefix(start_url) or "index"
        pdf_name = os.path.join(dir_name, url_to_str(end_point) + ".pdf")
        pdfkit.from_url(link_to_visit, pdf_name)
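
A quick sketch of how the loop above classifies `href` values, assuming (illustratively) that the crawl started at `https://example.com` and the current page is `https://example.com/docs`:

```python
# Illustrative href handling, mirroring the branches in scrape_root_url:
#   "#install"             -> "https://example.com/docs#install"  (fragment on the current page)
#   "?page=2"              -> "https://example.com/docs?page=2"   (query variant of the current page)
#   "/about"               -> "https://example.com/about"         (site-relative, joined to the base URL)
#   "https://other.site/x" -> skipped; absolute and external links are not followed
```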
Empty file added src/common/__init__.py
Empty file.
4 changes: 4 additions & 0 deletions src/common/configurations.ini
@@ -0,0 +1,4 @@
[LOGGING]
log_file_path =
log_level = info
log_file_name = log_file.log
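
The defaults above leave `log_file_path` empty, in which case `get_logger` (in `utils.py`, next) writes the log file next to the script. A filled-in variant, with an illustrative path, might look like:

```ini
[LOGGING]
log_file_path = /var/log/web_scraper
log_level = debug
log_file_name = scraper.log
```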
103 changes: 103 additions & 0 deletions src/common/utils.py
@@ -0,0 +1,103 @@
import os
import re
import sys
import logging
import configparser


# Directory containing this script; configurations.ini lives alongside it
current_dir = os.path.dirname(os.path.abspath(__file__))
g_config = None


def get_config_parser():
    # Load configurations.ini once and cache the parser for later calls
    global g_config

    if g_config is not None:
        return g_config

    g_config = configparser.ConfigParser()
    g_config.read(os.path.join(current_dir, "configurations.ini"))

    return g_config


def get_logger(module_name):
    config = get_config_parser()

    log_file_dir = ""
    log_file_name = ""
    log_level_str = ""

    # ConfigParser lowercases option names, so these lookups match the .ini keys
    if config.has_section("LOGGING"):
        if config.has_option("LOGGING", "log_file_path"):
            log_file_dir = config.get("LOGGING", "log_file_path")
        if config.has_option("LOGGING", "log_file_name"):
            log_file_name = config.get("LOGGING", "log_file_name")
        if config.has_option("LOGGING", "log_level"):
            log_level_str = config.get("LOGGING", "log_level")

    if log_file_dir:
        if not os.path.exists(log_file_dir):
            os.makedirs(log_file_dir)
    else:
        # Fall back to the directory containing this script
        log_file_dir = current_dir

    if not log_file_name:
        log_file_name = "log_file.log"

    formatter = logging.Formatter(
        "%(asctime)s\t%(levelname)s\t[%(module)s:%(funcName)s]\t%(message)s",
        "%Y-%m-%d %H:%M:%S",
    )

    # Map log level strings to actual logging levels
    log_level_mapping = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warning": logging.WARNING,
        "error": logging.ERROR,
        "critical": logging.CRITICAL,
    }

    # Default to INFO if the configured string is missing or invalid
    log_level = log_level_mapping.get(log_level_str.lower(), logging.INFO)

    logger = logging.getLogger(module_name)
    logger.setLevel(log_level)

    # Attach handlers only once; repeated get_logger() calls for the same
    # module would otherwise emit every record multiple times
    if not logger.handlers:
        # Create a log file handler
        log_filename = os.path.join(log_file_dir, log_file_name)
        file_handler = logging.FileHandler(log_filename, mode="a")
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

        # Create a console handler and set the formatter
        channel = logging.StreamHandler(sys.stdout)
        channel.setFormatter(formatter)
        logger.addHandler(channel)

    return logger


def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


def url_to_str(url):
    # Strip the scheme, then replace special characters with underscores
    url = url.removeprefix("https://").removeprefix("http://")
    pattern = r"[./?!@#$%^&*]"

    return re.sub(pattern, "_", url)
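
For a sense of what `url_to_str` yields, a quick sketch (the values follow directly from the regex above):

```python
from common.utils import url_to_str

print(url_to_str("https://example.com"))      # example_com
print(url_to_str("/docs/getting-started"))    # _docs_getting-started
print(url_to_str("/search?q=web+scraping"))   # _search_q=web+scraping
```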
26 changes: 26 additions & 0 deletions src/scraper.py
@@ -0,0 +1,26 @@
import sys

from common.utils import url_to_str, create_directory
from bs import scrape_root_url


def main():
    # sys.argv[0] is the script name; subsequent elements are arguments
    if len(sys.argv) < 2:
        print("Usage: python scraper.py <base_url>")
        return

    start_url = sys.argv[1]
    print(f"Start url: {start_url}")

    # Name the output directory after the base URL, then crawl into it
    dir_name = url_to_str(start_url)
    create_directory(dir_name)

    scrape_root_url(start_url, dir_name)


if __name__ == "__main__":
    main()
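
Run from the project root, a session begins like this (everything past the first lines depends on the links the target site actually exposes):

```shell
$ python src/scraper.py https://example.com
Start url: https://example.com
https://example.com
```

Each page is printed as it is fetched, and its PDF lands in a directory named after the base URL (here `example_com/`).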
