main.py
import random
import time
from typing import Optional

import cloudscraper
import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

app = FastAPI()

# Global settings
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
REQUEST_TIMEOUT = 25
SELENIUM_TIMEOUT = 45
BASE_URL = "https://adnews.com.br"
@app.get("/scrape")
async def scrape(url: Optional[str] = None):
"""Endpoint principal para scraping"""
if not url:
raise HTTPException(status_code=400, detail="URL é obrigatório")
if "adnews.com.br" not in url:
raise HTTPException(status_code=400, detail="Domínio não permitido")
# Tentativa 1: Requests + Cloudscraper
results = scrape_adnews_requests(url)
# Tentativa 2: Fallback para Selenium
if not results:
results = scrape_adnews_selenium(url)
return JSONResponse(content=results or [], status_code=200)
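
# Example request (illustrative; assumes the app is served locally on port 8000):
#
#   curl "http://localhost:8000/scrape?url=https://adnews.com.br/"
#
# Responses: 400 if `url` is missing or outside adnews.com.br; otherwise a JSON
# list of up to 15 article objects (empty if both scraping methods fail).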


def scrape_adnews_requests(url: str):
    """Plain-requests method with anti-bot handling."""
    try:
        headers = {"User-Agent": USER_AGENT}
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)

        # If blocked, retry with cloudscraper (solves Cloudflare-style challenges)
        if response.status_code in (403, 429, 503):
            scraper = cloudscraper.create_scraper()
            response = scraper.get(url, headers=headers, timeout=REQUEST_TIMEOUT * 2)

        return parse_articles(BeautifulSoup(response.text, "html.parser")) if response.ok else []
    except Exception as e:
        print(f"Requests error: {e}")
        return []


def scrape_adnews_selenium(url: str):
    """Selenium method with anti-detection settings."""
    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--headless=new")
    options.add_argument("--remote-debugging-port=9222")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument(f"user-agent={USER_AGENT}")
    options.binary_location = "/usr/bin/chromium"

    # Advanced settings: hide the usual automation fingerprints
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    service = Service(executable_path="/usr/bin/chromedriver")
    driver = None
    try:
        driver = webdriver.Chrome(service=service, options=options)
        driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": USER_AGENT})
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        })
        driver.set_page_load_timeout(SELENIUM_TIMEOUT)
        driver.get(url)

        # Explicit wait for results, then scroll to trigger lazy loading
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "article.result"))
        )
        for _ in range(2):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(random.uniform(1.2, 2.5))

        return parse_articles(BeautifulSoup(driver.page_source, "html.parser"))
    except Exception as e:
        print(f"Selenium error: {e}")
        return []
    finally:
        if driver:
            driver.quit()


def parse_articles(soup: BeautifulSoup):
    """Parser tuned for AdNews' 2025 page structure."""
    results = []
    articles = soup.select("article.result")

    for article in articles:
        try:
            # Robust extraction: tolerate missing tags and attributes
            title_tag = article.select_one("h2.title a")
            img_tag = article.select_one("img.attachment-full")
            category_tag = article.select_one(".meta-category")

            if not title_tag:
                continue

            url = (title_tag.get("href") or "").strip()
            if not url:
                continue
            if not url.startswith(("http", "//")):
                url = f"{BASE_URL}{url}"

            results.append({
                "title": title_tag.get_text(strip=True),
                "url": url,
                "description": category_tag.get_text(strip=True) if category_tag else "",
                "image": img_tag.get("src", "") if img_tag else "",
            })
        except Exception as e:
            print(f"Parsing error: {e}")
            continue

    return results[:15]  # Cap the number of results
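

# Minimal local entry point (a sketch; assumes `uvicorn` is available in the
# environment — it is the typical FastAPI server but not a dependency declared here):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)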