First cut of Web Scraper App
JMkrish committed Oct 17, 2023
1 parent 4e778b8 commit 099549e
Showing 8 changed files with 229 additions and 2 deletions.
49 changes: 47 additions & 2 deletions README.md
@@ -1,2 +1,47 @@
# Web Scraper

A tool to recursively scrape web content.

## Introduction

This README provides essential information for setting up and running the Web Scraper application. Before you begin, make sure you have the necessary prerequisites in place.

## Prerequisites

Before you can run the Web Scraper application, you need to ensure you have the following prerequisites installed on your system:

- Python (3.10 or higher)
- pip (Python package manager)

To install the required Python packages, navigate to the project directory and run the following command:

```shell
pip install -r requirements.txt
```

Additionally, Web Scraper relies on the `wkhtmltopdf` tool for generating PDFs. If you are using Ubuntu, you can install it using the following command:

```shell
sudo apt-get install wkhtmltopdf
```
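
If `wkhtmltopdf` is installed somewhere outside your `PATH`, `pdfkit` can be pointed at the binary explicitly. A minimal sketch, with an assumed install location:

```python
import pdfkit

# The path below is an assumption; adjust it to wherever wkhtmltopdf is installed.
config = pdfkit.configuration(wkhtmltopdf="/usr/local/bin/wkhtmltopdf")
pdfkit.from_url("https://example.com", "example.pdf", configuration=config)
```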

## Usage

Once you have met all the prerequisites, you can run the Web Scraper application from the project directory:

```shell
python src/scraper.py <base_url>
```

Replace `<base_url>` with the URL you want to use as the starting point for your scraping task.

## Examples

Here are a few example commands for running the Web Scraper application:

```shell
python src/scraper.py https://example.com
python src/scraper.py https://anotherwebsite.com
```
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
selenium==4.14.0
beautifulsoup4==4.12.2
tqdm==4.66.1
pdfkit==1.0.0
requests==2.31.0
Empty file added src/__init__.py
Empty file.
44 changes: 44 additions & 0 deletions src/bs.py
@@ -0,0 +1,44 @@
import os
import requests
import pdfkit
from bs4 import BeautifulSoup
from common.utils import url_to_str


def scrape_root_url(start_url, dir_name):
    # Breadth-first crawl: visit every same-site page once, rendering each to a PDF.
    to_visit = [start_url]
    touched_urls = {start_url}

    while to_visit:
        link_to_visit = to_visit.pop(0)
        print(link_to_visit)
        response = requests.get(link_to_visit)
        source = BeautifulSoup(response.text, "html.parser")

        # Only follow links inside <main>; pages without one yield no new links.
        main = source.find("main")
        if main is not None:
            for link in main.find_all("a"):
                if not link.has_attr("href"):
                    continue
                href = link["href"]
                if len(href) > 1 and href[0] in ("#", "?"):
                    # Fragment or query string: a variant of the current page.
                    new_link = link_to_visit + href
                elif href.startswith("/"):
                    # Site-relative path: resolve against the base URL.
                    new_link = start_url + href
                else:
                    # Absolute and external links are intentionally skipped.
                    continue
                if new_link not in touched_urls:
                    to_visit.append(new_link)
                    touched_urls.add(new_link)

        # The start page itself maps to an empty endpoint; call it "index".
        end_point = link_to_visit.removeprefix(start_url) or "index"
        pdf_name = os.path.join(dir_name, url_to_str(end_point) + ".pdf")
        pdfkit.from_url(link_to_visit, pdf_name)
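
A quick sketch of how the loop above classifies `href` values, assuming (illustratively) that the crawl started at `https://example.com` and the current page is `https://example.com/docs`:

```python
# Illustrative href handling, mirroring the branches in scrape_root_url:
#   "#install"             -> "https://example.com/docs#install"  (fragment on the current page)
#   "?page=2"              -> "https://example.com/docs?page=2"   (query variant of the current page)
#   "/about"               -> "https://example.com/about"         (site-relative, joined to the base URL)
#   "https://other.site/x" -> skipped; absolute and external links are not followed
```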
Empty file added src/common/__init__.py
Empty file.
4 changes: 4 additions & 0 deletions src/common/configurations.ini
@@ -0,0 +1,4 @@
[LOGGING]
log_file_path =
log_level = info
log_file_name = log_file.log
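
The defaults above leave `log_file_path` empty, in which case `get_logger` (in `utils.py`, next) writes the log file next to the script. A filled-in variant, with an illustrative path, might look like:

```ini
[LOGGING]
log_file_path = /var/log/web_scraper
log_level = debug
log_file_name = scraper.log
```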
103 changes: 103 additions & 0 deletions src/common/utils.py
@@ -0,0 +1,103 @@
import os
import re
import sys
import logging
import configparser


# Directory containing this script; configurations.ini lives alongside it
current_dir = os.path.dirname(os.path.abspath(__file__))
g_config = None


def get_config_parser():
    # Load configurations.ini once and cache the parser for later calls
    global g_config

    if g_config is not None:
        return g_config

    g_config = configparser.ConfigParser()
    g_config.read(os.path.join(current_dir, "configurations.ini"))

    return g_config


def get_logger(module_name):
    config = get_config_parser()

    log_file_dir = ""
    log_file_name = ""
    log_level_str = ""

    # ConfigParser lowercases option names, so these lookups match the .ini keys
    if config.has_section("LOGGING"):
        if config.has_option("LOGGING", "log_file_path"):
            log_file_dir = config.get("LOGGING", "log_file_path")
        if config.has_option("LOGGING", "log_file_name"):
            log_file_name = config.get("LOGGING", "log_file_name")
        if config.has_option("LOGGING", "log_level"):
            log_level_str = config.get("LOGGING", "log_level")

    if log_file_dir:
        if not os.path.exists(log_file_dir):
            os.makedirs(log_file_dir)
    else:
        # Fall back to the directory containing this script
        log_file_dir = current_dir

    if not log_file_name:
        log_file_name = "log_file.log"

    formatter = logging.Formatter(
        "%(asctime)s\t%(levelname)s\t[%(module)s:%(funcName)s]\t%(message)s",
        "%Y-%m-%d %H:%M:%S",
    )

    # Map log level strings to actual logging levels
    log_level_mapping = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warning": logging.WARNING,
        "error": logging.ERROR,
        "critical": logging.CRITICAL,
    }

    # Default to INFO if the configured string is missing or invalid
    log_level = log_level_mapping.get(log_level_str.lower(), logging.INFO)

    logger = logging.getLogger(module_name)
    logger.setLevel(log_level)

    # Attach handlers only once; repeated get_logger() calls for the same
    # module would otherwise emit every record multiple times
    if not logger.handlers:
        # Create a log file handler
        log_filename = os.path.join(log_file_dir, log_file_name)
        file_handler = logging.FileHandler(log_filename, mode="a")
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

        # Create a console handler and set the formatter
        channel = logging.StreamHandler(sys.stdout)
        channel.setFormatter(formatter)
        logger.addHandler(channel)

    return logger


def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


def url_to_str(url):
    # Strip the scheme, then replace special characters with underscores
    url = url.removeprefix("https://").removeprefix("http://")
    pattern = r"[./?!@#$%^&*]"

    return re.sub(pattern, "_", url)
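
For a sense of what `url_to_str` yields, a quick sketch (the values follow directly from the regex above):

```python
from common.utils import url_to_str

print(url_to_str("https://example.com"))      # example_com
print(url_to_str("/docs/getting-started"))    # _docs_getting-started
print(url_to_str("/search?q=web+scraping"))   # _search_q=web+scraping
```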
26 changes: 26 additions & 0 deletions src/scraper.py
@@ -0,0 +1,26 @@
import sys

from common.utils import url_to_str, create_directory
from bs import scrape_root_url


def main():
    # sys.argv[0] is the script name; subsequent elements are arguments
    if len(sys.argv) < 2:
        print("Usage: python scraper.py <base_url>")
        return

    start_url = sys.argv[1]
    print(f"Start url: {start_url}")

    # Name the output directory after the base URL, then crawl into it
    dir_name = url_to_str(start_url)
    create_directory(dir_name)

    scrape_root_url(start_url, dir_name)


if __name__ == "__main__":
    main()
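
Run from the project root, a session begins like this (everything past the first lines depends on the links the target site actually exposes):

```shell
$ python src/scraper.py https://example.com
Start url: https://example.com
https://example.com
```

Each page is printed as it is fetched, and its PDF lands in a directory named after the base URL (here `example_com/`).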
