Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace *.deepwisdom.ai with a local server for scraping #1081

Merged
merged 1 commit into from
Mar 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion metagpt/tools/web_browser_engine_selenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def __init__(self, proxy: str = None):

def get(self, url, **kwargs):
if "proxies" not in kwargs and self.proxy:
kwargs["proxies"] = {"all_proxy": self.proxy}
kwargs["proxies"] = {"all": self.proxy}
return super().get(url, **kwargs)


Expand Down
9 changes: 5 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,13 @@ async def pipe(reader, writer):
while not reader.at_eof():
writer.write(await reader.read(2048))
writer.close()
await writer.wait_closed()

async def handle_client(reader, writer):
data = await reader.readuntil(b"\r\n\r\n")
print(f"Proxy: {data}") # checking with capfd fixture
infos = pattern.match(data)
host, port = infos.group("host"), infos.group("port")
print(f"Proxy: {host}") # checking with capfd fixture
port = int(port) if port else 80
remote_reader, remote_writer = await asyncio.open_connection(host, port)
if data.startswith(b"CONNECT"):
Expand Down Expand Up @@ -257,10 +258,10 @@ async def start():
server = aiohttp.web.Server(handler)
runner = aiohttp.web.ServerRunner(server)
await runner.setup()
site = aiohttp.web.TCPSite(runner, "localhost", 0)
site = aiohttp.web.TCPSite(runner, "127.0.0.1", 0)
await site.start()
host, port = site._server.sockets[0].getsockname()
return site, f"http://{host}:{port}"
_, port, *_ = site._server.sockets[0].getsockname()
return site, f"http://127.0.0.1:{port}"

return start

Expand Down
5 changes: 3 additions & 2 deletions tests/metagpt/tools/libs/test_web_scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@


@pytest.mark.asyncio
async def test_scrape_web_playwright():
test_url = "https://www.deepwisdom.ai"
async def test_scrape_web_playwright(http_server):
server, test_url = await http_server()

result = await scrape_web_playwright(test_url)

Expand All @@ -21,3 +21,4 @@ async def test_scrape_web_playwright():
assert not result["inner_text"].endswith(" ")
assert not result["html"].startswith(" ")
assert not result["html"].endswith(" ")
await server.stop()
11 changes: 7 additions & 4 deletions tests/metagpt/tools/test_web_browser_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@

@pytest.mark.asyncio
@pytest.mark.parametrize(
"browser_type, url, urls",
"browser_type",
[
(WebBrowserEngineType.PLAYWRIGHT, "https://deepwisdom.ai", ("https://deepwisdom.ai",)),
(WebBrowserEngineType.SELENIUM, "https://deepwisdom.ai", ("https://deepwisdom.ai",)),
WebBrowserEngineType.PLAYWRIGHT,
WebBrowserEngineType.SELENIUM,
],
ids=["playwright", "selenium"],
)
async def test_scrape_web_page(browser_type, url, urls):
async def test_scrape_web_page(browser_type, http_server):
server, url = await http_server()
urls = [url, url, url]
browser = web_browser_engine.WebBrowserEngine(engine=browser_type)
result = await browser.run(url)
assert isinstance(result, WebPage)
Expand All @@ -27,6 +29,7 @@ async def test_scrape_web_page(browser_type, url, urls):
assert isinstance(results, list)
assert len(results) == len(urls) + 1
assert all(("MetaGPT" in i.inner_text) for i in results)
await server.stop()


if __name__ == "__main__":
Expand Down
26 changes: 19 additions & 7 deletions tests/metagpt/tools/test_web_browser_engine_playwright.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,28 @@

@pytest.mark.asyncio
@pytest.mark.parametrize(
"browser_type, use_proxy, kwagrs, url, urls",
"browser_type, use_proxy, kwagrs,",
[
("chromium", {"proxy": True}, {}, "https://www.deepwisdom.ai", ("https://www.deepwisdom.ai",)),
("firefox", {}, {"ignore_https_errors": True}, "https://www.deepwisdom.ai", ("https://www.deepwisdom.ai",)),
("webkit", {}, {"ignore_https_errors": True}, "https://www.deepwisdom.ai", ("https://www.deepwisdom.ai",)),
("chromium", {"proxy": True}, {}),
(
"firefox",
{},
{"ignore_https_errors": True},
),
(
"webkit",
{},
{"ignore_https_errors": True},
),
],
ids=["chromium-normal", "firefox-normal", "webkit-normal"],
)
async def test_scrape_web_page(browser_type, use_proxy, kwagrs, url, urls, proxy, capfd):
async def test_scrape_web_page(browser_type, use_proxy, kwagrs, proxy, capfd, http_server):
server, url = await http_server()
urls = [url, url, url]
proxy_url = None
if use_proxy:
server, proxy_url = await proxy()
proxy_server, proxy_url = await proxy()
browser = web_browser_engine_playwright.PlaywrightWrapper(browser_type=browser_type, proxy=proxy_url, **kwagrs)
result = await browser.run(url)
assert isinstance(result, WebPage)
Expand All @@ -32,8 +42,10 @@ async def test_scrape_web_page(browser_type, use_proxy, kwagrs, url, urls, proxy
assert len(results) == len(urls) + 1
assert all(("MetaGPT" in i.inner_text) for i in results)
if use_proxy:
server.close()
proxy_server.close()
await proxy_server.wait_closed()
assert "Proxy:" in capfd.readouterr().out
await server.stop()


if __name__ == "__main__":
Expand Down
32 changes: 15 additions & 17 deletions tests/metagpt/tools/test_web_browser_engine_selenium.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


import browsers
import pytest

Expand All @@ -10,51 +11,48 @@

@pytest.mark.asyncio
@pytest.mark.parametrize(
"browser_type, use_proxy, url, urls",
"browser_type, use_proxy,",
[
pytest.param(
"chrome",
True,
"https://deepwisdom.ai",
("https://deepwisdom.ai",),
False,
marks=pytest.mark.skipif(not browsers.get("chrome"), reason="chrome browser not found"),
),
pytest.param(
"firefox",
False,
"https://deepwisdom.ai",
("https://deepwisdom.ai",),
marks=pytest.mark.skipif(not browsers.get("firefox"), reason="firefox browser not found"),
),
pytest.param(
"edge",
False,
"https://deepwisdom.ai",
("https://deepwisdom.ai",),
marks=pytest.mark.skipif(not browsers.get("msedge"), reason="edge browser not found"),
),
],
ids=["chrome-normal", "firefox-normal", "edge-normal"],
)
async def test_scrape_web_page(browser_type, use_proxy, url, urls, proxy, capfd):
async def test_scrape_web_page(browser_type, use_proxy, proxy, capfd, http_server):
# Prerequisites
# firefox, chrome, Microsoft Edge
server, url = await http_server()
urls = [url, url, url]
proxy_url = None
if use_proxy:
server, proxy_url = await proxy()
proxy_server, proxy_url = await proxy()
browser = web_browser_engine_selenium.SeleniumWrapper(browser_type=browser_type, proxy=proxy_url)
result = await browser.run(url)
assert isinstance(result, WebPage)
assert "MetaGPT" in result.inner_text

if urls:
results = await browser.run(url, *urls)
assert isinstance(results, list)
assert len(results) == len(urls) + 1
assert all(("MetaGPT" in i.inner_text) for i in results)
results = await browser.run(url, *urls)
assert isinstance(results, list)
assert len(results) == len(urls) + 1
assert all(("MetaGPT" in i.inner_text) for i in results)
if use_proxy:
server.close()
assert "Proxy:" in capfd.readouterr().out
proxy_server.close()
await proxy_server.wait_closed()
assert "Proxy: localhost" in capfd.readouterr().out
await server.stop()


if __name__ == "__main__":
Expand Down
Loading