feat: serper api search
PeriniM committed Jan 6, 2025
1 parent 4380afb commit 1c0141f
Showing 2 changed files with 146 additions and 56 deletions.
11 changes: 8 additions & 3 deletions scrapegraphai/nodes/search_internet_node.py
@@ -44,11 +44,17 @@ def __init__(
        self.verbose = (
            False if node_config is None else node_config.get("verbose", False)
        )
        self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
        self.search_engine = (
            node_config["search_engine"]
            if node_config.get("search_engine")
            else "google"
        )

        self.serper_api_key = (
            node_config["serper_api_key"] if node_config.get("serper_api_key") else None
        )

        self.max_results = node_config.get("max_results", 3)

    def execute(self, state: dict) -> dict:
@@ -100,13 +106,12 @@ def execute(self, state: dict) -> dict:
            query=search_query,
            max_results=self.max_results,
            search_engine=self.search_engine,
            proxy=self.proxy,
            serper_api_key=self.serper_api_key,
        )

        if len(answer) == 0:
            raise ValueError("Zero results found for the search query.")

        # Store both the URLs and considered_urls in the state
        state.update({self.output[0]: answer})
        state["considered_urls"] = answer  # Add this as a backup

        return state
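
With this change the node reads everything it needs for Serper from its node_config. A minimal sketch of a config dict that would route SearchInternetNode through the Serper engine, assuming the key is supplied via an environment variable (the values below are placeholders, not part of the commit):

import os

# Hypothetical node_config illustrating the keys read in __init__ above.
node_config = {
    "verbose": True,
    "search_engine": "serper",                       # falls back to "google" when omitted
    "serper_api_key": os.getenv("SERPER_API_KEY"),   # only needed for the serper engine
    "max_results": 5,                                # defaults to 3
}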
191 changes: 138 additions & 53 deletions scrapegraphai/utils/research_web.py
@@ -1,5 +1,5 @@
"""
Research_web module
research_web module
"""

import re
@@ -12,68 +12,153 @@

Removed:

def search_on_web(
    query: str, search_engine: str = "Google", max_results: int = 10, port: int = 8080
) -> List[str]:
    """
    Searches the web for a given query using specified search engine options.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use,
        options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.
        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.

    Returns:
        List[str]: A list of URLs as strings that are the search results.

    Raises:
        ValueError: If the search engine specified is not supported.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """

    if search_engine.lower() == "google":
        res = []
        for url in google_search(query, num_results=max_results):
            res.append(url)
        return res

    elif search_engine.lower() == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        links = re.findall(r"https?://[^\s,\]]+", res)
        return links[:max_results]

    elif search_engine.lower() == "bing":
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        search_url = f"https://www.bing.com/search?q={query}"
        response = requests.get(search_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        search_results = []
        for result in soup.find_all("li", class_="b_algo", limit=max_results):
            link = result.find("a")["href"]
            search_results.append(link)
        return search_results

    elif search_engine.lower() == "searxng":
        url = f"http://localhost:{port}"
        params = {"q": query, "format": "json"}

        # Send the GET request to the server
        response = requests.get(url, params=params)

        data = response.json()
        limited_results = [result["url"] for result in data["results"][:max_results]]
        return limited_results

    else:
        raise ValueError(
            "The only search engines available are DuckDuckGo, Google, Bing, or SearXNG"
        )

Added:

def search_on_web(
    query: str,
    search_engine: str = "Google",
    max_results: int = 10,
    port: int = 8080,
    timeout: int = 10,
    proxy: str | dict = None,
    serper_api_key: str = None,
) -> List[str]:
    """Search web function with improved error handling and validation"""

    # Input validation
    if not query or not isinstance(query, str):
        raise ValueError("Query must be a non-empty string")

    search_engine = search_engine.lower()
    valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"}
    if search_engine not in valid_engines:
        raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}")

    # Format proxy once
    formatted_proxy = None
    if proxy:
        formatted_proxy = format_proxy(proxy)

    try:
        results = []
        if search_engine == "google":
            results = list(
                google_search(query, num_results=max_results, proxy=formatted_proxy)
            )

        elif search_engine == "duckduckgo":
            research = DuckDuckGoSearchResults(max_results=max_results)
            res = research.run(query)
            results = re.findall(r"https?://[^\s,\]]+", res)

        elif search_engine == "bing":
            results = _search_bing(query, max_results, timeout, formatted_proxy)

        elif search_engine == "searxng":
            results = _search_searxng(query, max_results, port, timeout)

        elif search_engine.lower() == "serper":
            results = _search_serper(query, max_results, serper_api_key, timeout)

        return filter_pdf_links(results)

    except requests.Timeout:
        raise TimeoutError(f"Search request timed out after {timeout} seconds")
    except requests.RequestException as e:
        raise RuntimeError(f"Search request failed: {str(e)}")


def _search_bing(
    query: str, max_results: int, timeout: int, proxy: str = None
) -> List[str]:
    """Helper function for Bing search"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    search_url = f"https://www.bing.com/search?q={query}"

    proxies = {"http": proxy, "https": proxy} if proxy else None
    response = requests.get(
        search_url, headers=headers, timeout=timeout, proxies=proxies
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    return [
        result.find("a")["href"]
        for result in soup.find_all("li", class_="b_algo", limit=max_results)
    ]


def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
    """Helper function for SearXNG search"""
    url = f"http://localhost:{port}/search"
    params = {
        "q": query,
        "format": "json",
        "engines": "google,duckduckgo,brave,qwant,bing",
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return [
        result["url"] for result in response.json().get("results", [])[:max_results]
    ]


def _search_serper(
    query: str, max_results: int, serper_api_key: str, timeout: int
) -> List[str]:
    """Helper function for Serper API to get Google search results"""
    if not serper_api_key:
        raise ValueError("API key is required for Serper API")

    url = "https://google.serper.dev/search"
    payload = {"q": query, "num": max_results}

    headers = {"X-API-KEY": serper_api_key, "Content-Type": "application/json"}

    try:
        response = requests.post(
            url,
            headers=headers,
            json=payload,  # requests will handle JSON serialization
            timeout=timeout,
        )
        response.raise_for_status()

        # Extract only the organic search results
        results = response.json()
        organic_results = results.get("organic", [])
        urls = [result.get("link") for result in organic_results if result.get("link")]

        return urls[:max_results]

    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Serper API request failed: {str(e)}")


def format_proxy(proxy):
    if isinstance(proxy, dict):
        server = proxy.get("server")
        username = proxy.get("username")
        password = proxy.get("password")

        if all([username, password, server]):
            proxy_url = f"http://{username}:{password}@{server}"
            return proxy_url
        else:
            raise ValueError("Proxy dictionary is missing required fields.")
    elif isinstance(proxy, str):
        return proxy  # "https://username:password@ip:port"
    else:
        raise TypeError("Proxy should be a dictionary or a string.")


def filter_pdf_links(links: List[str]) -> List[str]:
    """
    Filters out any links that point to PDF files.

    Args:
        links (List[str]): A list of URLs as strings.

    Returns:
        List[str]: A list of URLs excluding any that end with '.pdf'.
    """
    return [link for link in links if not link.lower().endswith(".pdf")]
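
Taken together, the rewritten search_on_web can be called directly with the new engine. A minimal usage sketch, assuming the package is installed and a Serper key is exported in the environment (query, key, and proxy values are placeholders):

import os

from scrapegraphai.utils.research_web import search_on_web

# Google results fetched through the Serper API; raises ValueError if no key is supplied.
urls = search_on_web(
    "web scraping with llms",
    search_engine="serper",
    max_results=5,
    serper_api_key=os.getenv("SERPER_API_KEY"),
)

# Bing search routed through an authenticated proxy; the dict form is normalised
# by format_proxy into "http://username:password@server".
proxied_urls = search_on_web(
    "web scraping with llms",
    search_engine="bing",
    proxy={"server": "127.0.0.1:8080", "username": "user", "password": "pass"},
)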
