diff --git a/README.md b/README.md
index 6da75f4..602a0f2 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,43 @@
-# web_scraping
-A tool to recursively scrape web content
+# Web Scraper
+
+## Introduction
+
+This README provides the essential information for setting up and running the Web Scraper application. Before you begin, make sure you have the necessary prerequisites in place.
+
+## Prerequisites
+
+Before you can run the Web Scraper application, you need to ensure the following are installed on your system:
+
+- Python (3.10 or higher)
+- pip (Python package manager)
+
+To install the required Python packages, navigate to the project directory and run the following command:
+
+```shell
+pip install -r requirements.txt
+```
+
+Additionally, Web Scraper relies on the `wkhtmltopdf` tool for generating PDFs. If you are using Ubuntu, you can install it with the following command:
+
+```shell
+sudo apt-get install wkhtmltopdf
+```
+
+## Usage
+
+Once you have met all the prerequisites, you can run the Web Scraper application with the following command:
+
+```shell
+python src/scraper.py <url>
+```
+
+Replace `<url>` with the URL you want to use as the starting point for your scraping task.
+
+## Examples
+
+Here are a few example commands for running the Web Scraper application:
+
+```shell
+python src/scraper.py https://example.com
+python src/scraper.py https://anotherwebsite.com
+```
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d9c1f92
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+selenium==4.14.0
+beautifulsoup4==4.12.2
+tqdm==4.66.1
+pdfkit==1.0.0
+requests==2.31.0
\ No newline at end of file
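The Prerequisites section above assumes `wkhtmltopdf` ends up on the system PATH, which is where `pdfkit` looks for it by default. If it is installed somewhere else, `pdfkit` can be pointed at the binary explicitly. A minimal sketch (the path shown is only an example and should be adjusted for your system):

```python
import pdfkit

# Example path only -- adjust to wherever wkhtmltopdf is installed on your system.
WKHTMLTOPDF_PATH = "/usr/local/bin/wkhtmltopdf"

config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
pdfkit.from_url("https://example.com", "example.pdf", configuration=config)
```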
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/bs.py b/src/bs.py
new file mode 100644
index 0000000..ec41b36
--- /dev/null
+++ b/src/bs.py
@@ -0,0 +1,41 @@
+import os
+import requests
+import pdfkit
+from bs4 import BeautifulSoup
+from common.utils import url_to_str
+
+
+def scrape_root_url(start_url, dir_name):
+    # Breadth-first crawl starting from start_url; every visited page is saved as a PDF.
+    to_visit = [start_url]
+    touched_urls = {start_url}
+
+    while to_visit:
+        link_to_visit = to_visit.pop(0)
+        print(link_to_visit)
+        response = requests.get(link_to_visit)
+        source = BeautifulSoup(response.text, "html.parser")
+
+        # Only follow links found inside the page's <main> element; skip pages without one.
+        main_section = source.find("main")
+        links_found = main_section.find_all("a") if main_section else []
+        for link in links_found:
+            href = link.get("href")
+            if not href:
+                continue
+            if href[0] == "#" and len(href) > 1:
+                new_link = link_to_visit + href
+            elif href[0] == "?" and len(href) > 1:
+                new_link = link_to_visit + href
+            elif href[0] == "/":
+                new_link = start_url + href
+            else:
+                continue
+            if new_link not in touched_urls:
+                to_visit.append(new_link)
+                touched_urls.add(new_link)
+
+        # Name the PDF after the page's path relative to the start URL.
+        end_point = link_to_visit.removeprefix(start_url)
+        pdf_name = os.path.join(dir_name, url_to_str(end_point) + ".pdf")
+        pdfkit.from_url(link_to_visit, pdf_name)
diff --git a/src/common/__init__.py b/src/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/common/configurations.ini b/src/common/configurations.ini
new file mode 100644
index 0000000..b3629af
--- /dev/null
+++ b/src/common/configurations.ini
@@ -0,0 +1,4 @@
+[LOGGING]
+log_file_path =
+log_level = info
+log_file_name = log_file.log
\ No newline at end of file
diff --git a/src/common/utils.py b/src/common/utils.py
new file mode 100644
index 0000000..88c6366
--- /dev/null
+++ b/src/common/utils.py
@@ -0,0 +1,103 @@
+import os
+import re
+import sys
+import logging
+import configparser
+
+
+# Directory containing this script; configurations.ini lives alongside it
+current_dir = os.path.dirname(os.path.abspath(__file__))
+g_config = None
+
+
+def get_ConfigParser():
+    global g_config
+
+    if g_config is not None:
+        return g_config
+
+    g_config = configparser.ConfigParser()
+    g_config.read(os.path.join(current_dir, "configurations.ini"))
+
+    return g_config
+
+
+def get_logger(module_name):
+    config = get_ConfigParser()
+
+    log_file_dir = ""
+    log_file_name = ""
+    log_level_str = ""
+
+    if config.has_section("LOGGING"):
+        if config.has_option("LOGGING", "log_file_path"):
+            log_file_dir = config.get("LOGGING", "log_file_path")
+        if config.has_option("LOGGING", "log_file_name"):
+            log_file_name = config.get("LOGGING", "log_file_name")
+        if config.has_option("LOGGING", "log_level"):
+            log_level_str = config.get("LOGGING", "log_level").lower()
+
+    if log_file_dir:
+        if not os.path.exists(log_file_dir):
+            os.makedirs(log_file_dir)
+    else:
+        # Fall back to the directory containing this script
+        log_file_dir = os.path.dirname(os.path.abspath(__file__))
+
+    if not log_file_name:
+        log_file_name = "log_file.log"
+
+    formatter = logging.Formatter(
+        "%(asctime)s\t%(levelname)s\t[%(module)s:%(funcName)s]\t%(message)s",
+        "%Y-%m-%d %H:%M:%S",
+    )
+
+    # Map lowercase log level strings to the actual logging levels
+    log_level_mapping = {
+        "debug": logging.DEBUG,
+        "info": logging.INFO,
+        "warning": logging.WARNING,
+        "error": logging.ERROR,
+        "critical": logging.CRITICAL,
+    }
+
+    # Default to INFO if the configured string is missing or invalid
+    log_level = logging.INFO
+
+    # Use the configured level if it is one of the recognized names
+    if log_level_str in log_level_mapping:
+        log_level = log_level_mapping[log_level_str]
+
+    # Create a log file handler
+    log_filename = os.path.join(log_file_dir, log_file_name)
+    file_handler = logging.FileHandler(log_filename, mode="a")
+    file_handler.setFormatter(formatter)
+
+    # Create a console handler and set the formatter
+    channel = logging.StreamHandler(sys.stdout)
+    channel.setFormatter(formatter)
+
+    logger = logging.getLogger(module_name)
+    logger.setLevel(log_level)
+
+    # Add the handlers to the logger
+    logger.addHandler(file_handler)
+    logger.addHandler(channel)
+
+    return logger
+
+
+def create_directory(directory):
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+
+def url_to_str(url):
+    # Regular expression pattern matching the special characters to strip from a URL
+    pattern = r"[\.\/\?!@#\$%^&*]"
+
+    # Drop the scheme and replace special characters with underscores
+    url = url.removeprefix("https://")
+    directory_name = re.sub(pattern, "_", url)
+
+    return directory_name
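For reference, a minimal sketch of how a module could use the helpers in `src/common/utils.py`. The modules in this diff currently report progress with plain `print`; routing output through `get_logger` as shown here is an illustration, not something the diff itself does:

```python
from common.utils import get_logger, url_to_str

logger = get_logger(__name__)

# url_to_str drops the https:// prefix and replaces special characters with underscores,
# e.g. "https://example.com/docs/intro" becomes "example_com_docs_intro".
logger.info("Output directory: %s", url_to_str("https://example.com/docs/intro"))
```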
diff --git a/src/scraper.py b/src/scraper.py
new file mode 100644
index 0000000..0e0a409
--- /dev/null
+++ b/src/scraper.py
@@ -0,0 +1,26 @@
+import sys
+
+from common.utils import url_to_str
+from common.utils import create_directory
+from bs import scrape_root_url
+
+
+def main():
+    # Command-line arguments come from sys.argv:
+    # sys.argv[0] is the script name, and subsequent elements are arguments
+    if len(sys.argv) < 2:
+        print("Usage: python src/scraper.py <url>")
+        return
+
+    start_url = sys.argv[1]
+
+    print(f"Start URL: {start_url}")
+
+    dir_name = url_to_str(start_url)
+    create_directory(dir_name)
+
+    scrape_root_url(start_url, dir_name)
+
+
+if __name__ == "__main__":
+    main()
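One design note on the crawl loop in `src/bs.py`: `to_visit.pop(0)` on a list shifts every remaining element, so each pop is O(n). If the queue grows large, `collections.deque` gives O(1) pops from the left with the same breadth-first behavior. A minimal sketch of that alternative, where `discover_links` is a hypothetical stand-in for the href handling in `scrape_root_url`:

```python
from collections import deque


def discover_links(url):
    # Hypothetical stand-in for the href handling in scrape_root_url.
    return []


start_url = "https://example.com"
to_visit = deque([start_url])
touched_urls = {start_url}

while to_visit:
    link = to_visit.popleft()  # O(1), unlike list.pop(0)
    for new_link in discover_links(link):
        if new_link not in touched_urls:
            to_visit.append(new_link)
            touched_urls.add(new_link)
```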