diff --git a/README.md b/README.md
index 6da75f4..602a0f2 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,47 @@
-# web_scraping
-A tool to recursively scrape web content
+# Web Scraper
+
+## Introduction
+
+Web Scraper recursively crawls the pages under a start URL, following same-site links found in each page's `<main>` element, and saves every visited page as a PDF. This README covers setting up and running the application; before you begin, make sure you have the necessary prerequisites in place.
+
+## Prerequisites
+
+Before you can run the Web Scraper application, you need to ensure you have the following prerequisites installed on your system:
+
+- Python (3.10 or higher)
+- pip (Python package manager)
+
+To install the required Python packages, navigate to the project directory and run the following command:
+
+```shell
+pip install -r requirements.txt
+```
+
+Additionally, Web Scraper relies on the `wkhtmltopdf` tool for generating PDFs. If you are using Ubuntu, you can install it using the following command:
+
+```shell
+sudo apt-get install wkhtmltopdf
+```
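+
+If `wkhtmltopdf` is installed somewhere outside your `PATH`, `pdfkit` can be pointed at the binary explicitly via its `configuration` helper. The path below is only an illustration; adjust it to wherever the tool lives on your system. Note that `src/bs.py` currently calls `pdfkit.from_url` with default settings, so using a custom location would mean passing a configuration like this there:
+
+```python
+import pdfkit
+
+# Hypothetical install location; replace with the actual path to wkhtmltopdf.
+config = pdfkit.configuration(wkhtmltopdf="/usr/local/bin/wkhtmltopdf")
+pdfkit.from_url("https://example.com", "example.pdf", configuration=config)
+```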
+
+## Usage
+
+Once you have met all the prerequisites, you can run the Web Scraper application from the project root with the following command:
+
+```shell
+python src/scraper.py <start_url>
+```
+
+Replace `<start_url>` with the URL you want to use as the starting point for your scraping task.
+
+## Examples
+
+Here are a few example commands for running the Web Scraper application:
+
+```shell
+python src/scraper.py https://example.com
+python src/scraper.py https://anotherwebsite.com
+```
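+
+Each run creates an output directory named after the start URL, with special characters such as `.` and `/` replaced by underscores, and writes one PDF per visited page into it. For example, starting from the hypothetical URL `https://example.com`, a page reached at `/docs/intro` would be saved as `example_com/_docs_intro.pdf`.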
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d9c1f92
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+selenium==4.14.0
+beautifulsoup4==4.12.2
+tqdm==4.66.1
+pdfkit==1.0.0
+requests==2.31.0
\ No newline at end of file
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/bs.py b/src/bs.py
new file mode 100644
index 0000000..ec41b36
--- /dev/null
+++ b/src/bs.py
@@ -0,0 +1,44 @@
+import os
+
+import requests
+import pdfkit
+from bs4 import BeautifulSoup
+
+from common.utils import url_to_str
+
+
+def scrape_root_url(start_url, dir_name):
+    """Breadth-first crawl from start_url, saving each visited page as a PDF in dir_name."""
+    to_visit = [start_url]
+    touched_urls = {start_url}
+
+    while to_visit:
+        link_to_visit = to_visit.pop(0)
+        print(link_to_visit)
+
+        response = requests.get(link_to_visit)
+        source = BeautifulSoup(response.text, "html.parser")
+
+        # Only follow links found inside the page's <main> element (if there is one).
+        main = source.find("main")
+        links_found = main.find_all("a") if main else []
+
+        for link in links_found:
+            if not link.has_attr("href"):
+                continue
+            href = link["href"]
+            if not href:
+                continue
+
+            new_link = None
+            if href[0] in ("#", "?") and len(href) > 1:
+                # Fragment or query string: append it to the current page's URL.
+                new_link = link_to_visit + href
+            elif href[0] == "/":
+                # Site-relative path: resolve it against the start URL.
+                new_link = start_url + href
+
+            if new_link and new_link not in touched_urls:
+                to_visit.append(new_link)
+                touched_urls.add(new_link)
+
+        # Name the PDF after the path relative to the start URL.
+        end_point = link_to_visit.removeprefix(start_url)
+        pdf_name = os.path.join(dir_name, url_to_str(end_point) + ".pdf")
+        pdfkit.from_url(link_to_visit, pdf_name)
diff --git a/src/common/__init__.py b/src/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/common/configurations.ini b/src/common/configurations.ini
new file mode 100644
index 0000000..b3629af
--- /dev/null
+++ b/src/common/configurations.ini
@@ -0,0 +1,4 @@
+[LOGGING]
+log_file_path =
+log_level = info
+log_file_name = log_file.log
\ No newline at end of file
diff --git a/src/common/utils.py b/src/common/utils.py
new file mode 100644
index 0000000..88c6366
--- /dev/null
+++ b/src/common/utils.py
@@ -0,0 +1,103 @@
+import os
+import re
+import sys
+import logging
+import configparser
+
+
+# Directory containing this script; configurations.ini lives next to it.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+g_config = None
+
+
+def get_config_parser():
+    """Return the shared ConfigParser, loading configurations.ini on first use."""
+    global g_config
+
+    if g_config is not None:
+        return g_config
+
+    g_config = configparser.ConfigParser()
+    g_config.read(os.path.join(current_dir, "configurations.ini"))
+
+    return g_config
+
+
+def get_logger(module_name):
+    config = get_config_parser()
+
+    log_file_dir = ""
+    log_file_name = ""
+    log_level_str = ""
+
+    if config.has_section("LOGGING"):
+        if config.has_option("LOGGING", "log_file_path"):
+            log_file_dir = config.get("LOGGING", "log_file_path")
+        if config.has_option("LOGGING", "log_file_name"):
+            log_file_name = config.get("LOGGING", "log_file_name")
+        if config.has_option("LOGGING", "log_level"):
+            log_level_str = config.get("LOGGING", "log_level")
+
+    if log_file_dir:
+        if not os.path.exists(log_file_dir):
+            os.makedirs(log_file_dir)
+    else:
+        # Fall back to the directory containing this script
+        log_file_dir = os.path.dirname(os.path.abspath(__file__))
+
+    if not log_file_name:
+        log_file_name = "log_file.log"
+
+    formatter = logging.Formatter(
+        "%(asctime)s\t%(levelname)s\t[%(module)s:%(funcName)s]\t%(message)s",
+        "%Y-%m-%d %H:%M:%S",
+    )
+
+    # Map log level strings from the config file to logging levels
+    log_level_mapping = {
+        "debug": logging.DEBUG,
+        "info": logging.INFO,
+        "warning": logging.WARNING,
+        "error": logging.ERROR,
+        "critical": logging.CRITICAL,
+    }
+
+    # Default to INFO if the configured level is missing or invalid
+    log_level = log_level_mapping.get(log_level_str.lower(), logging.INFO)
+
+    # Create a file handler that appends to the log file
+    log_filename = os.path.join(log_file_dir, log_file_name)
+    file_handler = logging.FileHandler(log_filename, mode="a")
+    file_handler.setFormatter(formatter)
+
+    # Create a console handler with the same formatter
+    channel = logging.StreamHandler(sys.stdout)
+    channel.setFormatter(formatter)
+
+    logger = logging.getLogger(module_name)
+    logger.setLevel(log_level)
+
+    # Add the handlers only once, even if get_logger is called repeatedly
+    if not logger.handlers:
+        logger.addHandler(file_handler)
+        logger.addHandler(channel)
+
+    return logger
+
+
+def create_directory(directory):
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+
+def url_to_str(url):
+    # Characters that are unsafe in file and directory names
+    pattern = r"[:\.\/\?!@#\$%^&*]"
+
+    # Drop the URL scheme and replace the remaining special characters with underscores
+    url = url.removeprefix("https://").removeprefix("http://")
+    directory_name = re.sub(pattern, "_", url)
+
+    return directory_name
diff --git a/src/scraper.py b/src/scraper.py
new file mode 100644
index 0000000..0e0a409
--- /dev/null
+++ b/src/scraper.py
@@ -0,0 +1,26 @@
+import sys
+
+from common.utils import url_to_str
+from common.utils import create_directory
+from bs import scrape_root_url
+
+
+def main():
+    # sys.argv[0] is the script name; the first argument is the start URL
+    if len(sys.argv) < 2:
+        print("Usage: python src/scraper.py <start_url>")
+        return
+
+    start_url = sys.argv[1]
+
+    print(f"Start url: {start_url}")
+
+    # Create an output directory named after the start URL
+    dir_name = url_to_str(start_url)
+    create_directory(dir_name)
+
+    scrape_root_url(start_url, dir_name)
+
+
+if __name__ == "__main__":
+    main()