forked from Siva-Venigalla/web_scraping
Showing 8 changed files with 229 additions and 2 deletions.
README.md
@@ -1,2 +1,47 @@
# web_scraping
A tool to recursively scrape web content
# README.md

## Project Title

Web Scraper

## Introduction

This README provides essential information for setting up and running the Web Scraper application. Before you begin, make sure you have the necessary prerequisites in place.

## Prerequisites

Before you can run the Web Scraper application, you need to ensure you have the following prerequisites installed on your system:

- Python (3.10 or higher)
- pip (Python package manager)

To install the required Python packages, navigate to the project directory and run the following command:

```shell
pip install -r requirements.txt
```

Additionally, Web Scraper relies on the `wkhtmltopdf` tool for generating PDFs. If you are using Ubuntu, you can install it using the following command:

```shell
sudo apt-get install wkhtmltopdf
```
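If `wkhtmltopdf` is installed somewhere outside your `PATH`, `pdfkit` can be pointed at the binary explicitly. A minimal sketch, assuming a hypothetical install location of `/usr/local/bin/wkhtmltopdf`:

```python
import pdfkit

# Hypothetical location; adjust to wherever wkhtmltopdf actually lives on your system.
config = pdfkit.configuration(wkhtmltopdf="/usr/local/bin/wkhtmltopdf")
pdfkit.from_url("https://example.com", "example.pdf", configuration=config)
```
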
## Usage

Once you have met all the prerequisites, you can run the Web Scraper application with the following command:

```shell
python scraper.py <base_url>
```

Replace `<base_url>` with the URL you want to use as the starting point for your scraping task.

## Examples

Here are a few example commands for running the Web Scraper application:

```shell
python scraper.py https://example.com
python scraper.py https://anotherwebsite.com
```
requirements.txt
@@ -0,0 +1,5 @@
selenium==4.14.0
beautifulsoup4==4.12.2
tqdm==4.66.1
pdfkit==1.0.0
requests==2.31.0
Empty file.
bs.py
@@ -0,0 +1,44 @@
import os

import pdfkit
import requests
from bs4 import BeautifulSoup
from common.utils import url_to_str


def scrape_root_url(start_url, dir_name):
    # Breadth-first crawl starting from start_url; each visited page is
    # saved as a PDF inside dir_name.
    to_visit = [start_url]
    touched_urls = {start_url}

    while to_visit:
        link_to_visit = to_visit.pop(0)
        print(link_to_visit)
        response = requests.get(link_to_visit)
        source = BeautifulSoup(response.text, "html.parser")

        # Only follow links found inside the page's <main> element; pages
        # without one contribute no new links.
        main = source.find("main")
        links_found = main.find_all("a") if main else []
        for link in links_found:
            if not link.has_attr("href"):
                continue
            href = link["href"]
            # Fragment (#...) and query (?...) hrefs extend the current page's
            # URL; absolute paths (/...) extend the start URL.
            if href.startswith(("#", "?")) and len(href) > 1:
                new_link = link_to_visit + href
            elif href.startswith("/"):
                new_link = start_url + href
            else:
                continue
            if new_link not in touched_urls:
                to_visit.append(new_link)
                touched_urls.add(new_link)

        # Name the PDF after the part of the URL that follows the start URL
        end_point = link_to_visit.removeprefix(start_url)
        pdf_name = os.path.join(dir_name, url_to_str(end_point) + ".pdf")
        pdfkit.from_url(link_to_visit, pdf_name)
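`scrape_root_url` can also be exercised on its own. A minimal sketch using a stand-in URL (any site whose internal links sit inside a `<main>` element):

```python
from common.utils import url_to_str, create_directory
from bs import scrape_root_url

base_url = "https://example.com"    # stand-in starting point
out_dir = url_to_str(base_url)      # e.g. "example_com"
create_directory(out_dir)
scrape_root_url(base_url, out_dir)  # writes one PDF per crawled page into out_dir
```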
Empty file.
configurations.ini
@@ -0,0 +1,4 @@
[LOGGING]
log_file_path =
log_level = info
log_file_name = log_file.log
common/utils.py
@@ -0,0 +1,103 @@
import os
import re
import sys
import logging
import configparser


# Get the parent directory of the current script
current_dir = os.path.dirname(os.path.abspath(__file__))
g_config = None


def get_ConfigParser():
    global g_config

    if g_config is not None:
        return g_config

    g_config = configparser.ConfigParser()
    g_config.read(os.path.join(current_dir, "configurations.ini"))

    return g_config


def get_logger(module_name):
    config = get_ConfigParser()

    log_file_dir = ""
    log_file_name = ""
    log_level_str = ""

    if config.has_section("LOGGING"):
        if config.has_option("LOGGING", "LOG_FILE_PATH"):
            log_file_dir = config.get("LOGGING", "LOG_FILE_PATH")
        if config.has_option("LOGGING", "LOG_FILE_NAME"):
            log_file_name = config.get("LOGGING", "LOG_FILE_NAME")
        if config.has_option("LOGGING", "LOG_LEVEL"):
            log_level_str = config.get("LOGGING", "LOG_LEVEL")

    if log_file_dir:
        if not os.path.exists(log_file_dir):
            os.makedirs(log_file_dir)
    else:
        # Fall back to the directory containing this script
        log_file_dir = os.path.dirname(os.path.abspath(__file__))

    if not log_file_name:
        log_file_name = "log_file.log"

    formatter = logging.Formatter(
        "%(asctime)s\t%(levelname)s\t[%(module)s:%(funcName)s]\t%(message)s",
        "%Y-%m-%d %H:%M:%S",
    )

    # Map log level strings from the config file to actual logging levels
    log_level_mapping = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warning": logging.WARNING,
        "error": logging.ERROR,
        "critical": logging.CRITICAL,
    }

    # Default to INFO if the configured string is missing or invalid
    log_level = log_level_mapping.get(log_level_str.lower(), logging.INFO)

    # Create a log file handler
    log_filename = os.path.join(log_file_dir, log_file_name)
    file_handler = logging.FileHandler(log_filename, mode="a")
    file_handler.setFormatter(formatter)

    # Create a console handler and set the formatter
    channel = logging.StreamHandler(sys.stdout)
    channel.setFormatter(formatter)

    logger = logging.getLogger(module_name)
    logger.setLevel(log_level)

    # Add the handlers only once, so repeated get_logger calls for the
    # same module do not produce duplicate log lines
    if not logger.handlers:
        logger.addHandler(file_handler)
        logger.addHandler(channel)

    return logger


def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


def url_to_str(url):
    # Define a regular expression pattern to match special characters
    pattern = r"[\.\/\?!@#\$%^&*]"

    # Strip the scheme, then replace special characters with underscores
    url = url.removeprefix("https://")
    directory_name = re.sub(pattern, "_", url)

    return directory_name
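A quick illustration of the helpers above (the URL and log message are made-up; the commented log line assumes the snippet lives in a file named `demo.py`, uses the `[LOGGING]` section shown earlier, and has an illustrative timestamp):

```python
from common.utils import url_to_str, get_logger

# url_to_str strips the https:// scheme and replaces special characters with "_"
print(url_to_str("https://example.com/docs/intro"))  # -> example_com_docs_intro

# get_logger reads log_level and log_file_name from configurations.ini and
# writes to both log_file.log and stdout, producing a line like:
# 2023-10-15 12:00:00	INFO	[demo:<module>]	hello
logger = get_logger("demo")
logger.info("hello")
```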
scraper.py
@@ -0,0 +1,26 @@
import sys

from common.utils import url_to_str, create_directory
from bs import scrape_root_url


def main():
    # Access command-line arguments using sys.argv:
    # sys.argv[0] is the script name, and subsequent elements are arguments
    if len(sys.argv) < 2:
        print("Usage: python scraper.py <base_url>")
        return

    start_url = sys.argv[1]

    print(f"Start url: {start_url}")

    # Name the output directory after the sanitized base URL
    dir_name = url_to_str(start_url)
    create_directory(dir_name)

    scrape_root_url(start_url, dir_name)


if __name__ == "__main__":
    main()
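Putting it all together, a run against a stand-in site looks like this (the output names follow `url_to_str`'s sanitization scheme and are illustrative):

```shell
python scraper.py https://example.com
# Creates ./example_com/ and writes one PDF per crawled page into it,
# e.g. example_com/_docs_intro.pdf for the /docs/intro endpoint.
```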