fix: disallow mailto: (#861)
PeriniM committed Jan 6, 2025
1 parent 5be7c49 commit 8d9c909
Showing 1 changed file with 31 additions and 5 deletions.
36 changes: 31 additions & 5 deletions scrapegraphai/nodes/fetch_node_level_k.py
@@ -160,20 +160,42 @@ def extract_links(self, html_content: str) -> list:
     def get_full_links(self, base_url: str, links: list) -> list:
         """
         Converts relative URLs to full URLs based on the base URL.
+        Filters out non-web links (mailto:, tel:, javascript:, etc.).
         Args:
             base_url (str): The base URL for resolving relative links.
             links (list): A list of links to convert.
         Returns:
-            list: A list of full URLs.
+            list: A list of valid full URLs.
         """
+        # List of invalid URL schemes to filter out
+        invalid_schemes = {
+            'mailto:', 'tel:', 'fax:', 'sms:', 'callto:', 'wtai:', 'javascript:',
+            'data:', 'file:', 'ftp:', 'irc:', 'news:', 'nntp:', 'feed:', 'webcal:',
+            'skype:', 'im:', 'mtps:', 'spotify:', 'steam:', 'teamspeak:', 'udp:',
+            'unreal:', 'ut2004:', 'ventrilo:', 'view-source:', 'ws:', 'wss:'
+        }
+
         full_links = []
         for link in links:
-            if self.only_inside_links and link.startswith("http"):
+            # Skip if link starts with any invalid scheme
+            if any(link.lower().startswith(scheme) for scheme in invalid_schemes):
                 continue
-            full_link = link if link.startswith("http") else urljoin(base_url, link)
-            full_links.append(full_link)
+
+            # Skip if it's an external link and only_inside_links is True
+            if self.only_inside_links and link.startswith(('http://', 'https://')):
+                continue
+
+            # Convert relative URLs to absolute URLs
+            try:
+                full_link = link if link.startswith(('http://', 'https://')) else urljoin(base_url, link)
+                # Ensure the final URL starts with http:// or https://
+                if full_link.startswith(('http://', 'https://')):
+                    full_links.append(full_link)
+            except Exception as e:
+                self.logger.warning(f"Failed to process link {link}: {str(e)}")
+
         return full_links
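
For context, a minimal standalone sketch of the filtering behavior this hunk introduces. The module-level names here (INVALID_SCHEMES, the free function, the sample URLs) are illustrative stand-ins, not the repo's API; the real method lives on the node class and uses self.only_inside_links.

from urllib.parse import urljoin

# Illustrative subset of the scheme blocklist in the commit.
INVALID_SCHEMES = {'mailto:', 'tel:', 'javascript:', 'data:', 'file:'}

def get_full_links(base_url, links, only_inside_links=False):
    full_links = []
    for link in links:
        # Drop non-web schemes such as mailto: outright.
        if any(link.lower().startswith(s) for s in INVALID_SCHEMES):
            continue
        # With only_inside_links, absolute URLs are treated as external and skipped.
        if only_inside_links and link.startswith(('http://', 'https://')):
            continue
        # Resolve relative links against the page URL; keep only web URLs.
        full_link = link if link.startswith(('http://', 'https://')) else urljoin(base_url, link)
        if full_link.startswith(('http://', 'https://')):
            full_links.append(full_link)
    return full_links

print(get_full_links("https://example.com/docs/", [
    "mailto:team@example.com",  # filtered: blocked scheme
    "guide.html",               # kept: resolves to https://example.com/docs/guide.html
    "https://other.site/page",  # kept (would be skipped if only_inside_links=True)
]))
# ['https://example.com/docs/guide.html', 'https://other.site/page']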

@@ -191,7 +213,11 @@ def obtain_content(self, documents: List, loader_kwargs) -> List:
         for doc in documents:
             source = doc["source"]
             if "document" not in doc:
-                document = self.fetch_content(source, loader_kwargs)
+                try:
+                    document = self.fetch_content(source, loader_kwargs)
+                except Exception as e:
+                    self.logger.warning(f"Failed to fetch content for {source}: {str(e)}")
+                    continue
 
                 if not document or not document[0].page_content.strip():
                     self.logger.warning(f"Failed to fetch content for {source}")
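
The second hunk applies the same defensive pattern to fetching: a single failing source logs a warning and is skipped instead of aborting the whole crawl. A minimal sketch of that pattern, with hypothetical fetch_all/fetch names in place of the node's method:

import logging

logger = logging.getLogger(__name__)

def fetch_all(sources, fetch):
    """Fetch every source, skipping (and logging) the ones that raise."""
    documents = []
    for source in sources:
        try:
            documents.append(fetch(source))
        except Exception as e:  # mirrors the broad except in the commit
            logger.warning(f"Failed to fetch content for {source}: {str(e)}")
            continue
    return documents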
