Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
hxse committed Sep 22, 2024
1 parent 74b1f8e commit 9b234b6
Show file tree
Hide file tree
Showing 8 changed files with 1,046 additions and 429 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
extensions/
error_logs/
output/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
1 change: 1 addition & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"乙アリス",
"斎藤あみり",
"桜空もも",
"日向なつ",
"桃乃木かな",
"三宮つばき",
"宮島めい",
Expand Down
88 changes: 88 additions & 0 deletions get_botasaurus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from botasaurus.request import request, Request
from botasaurus.soupify import soupify
from botasaurus.browser import browser, Driver
from chrome_extension_python import Extension
import re

# Module-level headless flag.
# NOTE(review): nothing in this file appears to read it; run_browser passes
# headless=True to @browser directly. Confirm before relying on it.
headless = True


def operate_jable_playwright(url, mode="browser_"):
    """Scrape one video page and return the dict produced by ``callback``.

    Args:
        url: Address of the video page to scrape.
        mode: ``"browser"`` dispatches to ``run_browser``; every other value,
            including the default ``"browser_"``, dispatches to
            ``run_request``.

    Returns:
        Whatever the selected runner returns for ``url``.
    """
    # Only the exact string "browser" selects the browser-driven path.
    runner = run_browser if mode == "browser" else run_request
    return runner(url)


def jable_favourite_playwright(url):
    """Placeholder: accepts ``url``, does nothing, and returns ``None``."""
    return None


def callback(url, soup, drive=None, response=None):
    """Extract video metadata from a parsed video page.

    Args:
        url: Page URL. The video id is taken from the fifth ``/``-separated
            component of the URL with any query string removed.
        soup: Parsed page exposing ``find`` / ``select`` (the callers in this
            file pass the result of ``soupify``).
        drive: Optional browser driver. When truthy, the stream URL is read
            from the page's JavaScript variable ``hlsUrl`` via ``run_js``;
            otherwise it is extracted from an inline ``<script>`` with a
            regular expression.
        response: Unused; kept so existing callers remain compatible.

    Returns:
        dict with the keys ``title``, ``av_id``, ``url``, ``hsl``, ``count``,
        ``view``, ``models`` (list of ``{"title", "href"}``) and ``tags``
        (list of ``{"tag", "href"}``).

    Raises:
        IndexError, KeyError, AttributeError: If the page does not contain
            the expected elements or attributes.
    """
    print(url)
    title = soup.find("h4").get_text()
    av_id = url.split("?")[0].split("/")[4]
    count = soup.select(".count")[0].text
    view = soup.select("span.mr-3")[1].text.replace(" ", "")

    models = []
    for model in soup.select(".models .model"):
        # A dedicated local name is used here: reusing ``title`` would
        # overwrite the video title with the last model's name.
        try:
            model_title = model.select("span")[0]["data-original-title"]
        except KeyError:
            # The span exists but lacks the tooltip attribute.
            model_title = model.select("span")[0]["title"]
        except IndexError:
            # No span at all: fall back to the image's title.
            model_title = model.select("img")[0]["title"]
        models.append({"title": model_title, "href": model["href"]})

    tags = [{"tag": tag.text, "href": tag["href"]} for tag in soup.select(".tags a")]

    if drive:
        hsl = drive.run_js("return hlsUrl")
    else:
        # Without a live page, read the assignment out of the second script
        # element under #site-content.
        script = soup.select("#site-content")[0].find_all("script")[1].contents[0]
        hsl = re.findall(r"hlsUrl = '(.*)';", script)[0]

    return {
        "title": title,
        "av_id": av_id,
        "url": url,
        "hsl": hsl,
        "count": count,
        "view": view,
        "models": models,
        "tags": tags,
    }


@request(output=None)
def run_request(request: Request, url):
    """Fetch ``url`` with the injected request client and parse the page.

    The ``request`` argument is supplied by the ``@request`` decorator;
    callers in this file pass only ``url``.

    Returns:
        The metadata dict built by ``callback`` (no browser driver is passed,
        so the stream URL is extracted from the page's script text).
    """
    page = soupify(request.get(url))
    return callback(url, page)


@browser(
    extensions=[
        Extension(
            # Alternative extension (AdBlock), kept for reference:
            # "https://chromewebstore.google.com/detail/adblock-%E2%80%94-best-ad-blocker/gighmmpiobklfepjocnamgkkbiglidom"
            "https://chromewebstore.google.com/detail/ublock-origin/cjpalhdlnbpafiamejdnhcphjbkeiagm"
        )
    ],
    output=None,
    headless=True,
)
def run_browser(driver: Driver, url):
    """Open ``url`` in the injected browser driver and parse the page.

    The ``driver`` argument is supplied by the ``@browser`` decorator, which
    is configured here with the uBlock Origin extension and ``headless=True``;
    callers in this file pass only ``url``.

    Returns:
        The metadata dict built by ``callback``. The driver is forwarded so
        the stream URL is read from the live page's ``hlsUrl`` variable.
    """
    driver.get(url)
    return callback(url, soupify(driver), drive=driver)


if __name__ == "__main__":
    # Manual smoke run against a single sample page. The "browser_" mode
    # value does not equal "browser", so this goes through run_request.
    sample_url = "https://jable.tv/videos/cawd-240/"
    print(operate_jable_playwright(sample_url, mode="browser_"))
30 changes: 21 additions & 9 deletions get_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
from pathlib import Path
import fire
import json
from get_playwright import operate_jable_playwright, jable_favourite_playwright

# from get_playwright import operate_jable_playwright, jable_favourite_playwright
from get_botasaurus import operate_jable_playwright, jable_favourite_playwright
from config import sort_list


Expand Down Expand Up @@ -67,9 +69,11 @@ def operate_jable(r):
raise Exception(f"请求失败 {r.status_code} {r.url}")


async def get_jable_one(url, get_mode="playwright"):
async def get_jable_one(url, get_mode="botasaurus"):
if get_mode == "playwright":
return await operate_jable_playwright(url, headless=True)
elif get_mode == "botasaurus":
return operate_jable_playwright(url)
else:
tasks = (grequests.get(u, proxies=proxies, headers=headers) for u in [url])
for r in grequests.imap(tasks, size=6):
Expand Down Expand Up @@ -99,7 +103,7 @@ async def loop_download_info(
playlist=True,
message=False,
playlsit_message=False,
get_mode="playwright",
get_mode="botasaurus",
):
allUrls = []
urls = []
Expand Down Expand Up @@ -132,6 +136,10 @@ def write_json(data):
for i in urls:
data = await operate_jable_playwright(i)
write_json(data)
elif get_mode == "botasaurus":
for i in urls:
data = operate_jable_playwright(i)
write_json(data)
else:
tasks = (grequests.get(u, proxies=proxies, headers=headers) for u in urls)
for resp in grequests.imap(tasks, size=6):
Expand Down Expand Up @@ -385,6 +393,7 @@ async def create_playlist(
mode="jable",
message=True,
update=True,
enable_favourite=False,
):
if update:
create_playlist_tag(
Expand All @@ -397,12 +406,15 @@ async def create_playlist(

empty_file_arr = check_m3u8_file(dirPath, playlistPath, clean_empty_file=True)

await create_playlist_favourite(
dirPath=dirPath,
playlistPath=playlistPath,
mode=mode,
message=message,
)
if not enable_favourite:
print("skip favourite")
else:
await create_playlist_favourite(
dirPath=dirPath,
playlistPath=playlistPath,
mode=mode,
message=message,
)

empty_file_arr = [
*empty_file_arr,
Expand Down
1 change: 0 additions & 1 deletion get_m3u8.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import cloudscraper
from config import headers, proxies
import re
import os
Expand Down
23 changes: 19 additions & 4 deletions get_playwright.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ async def jable_favourite_playwright(
url="https://jable.tv/members/297827/", localCount=None, headless=True
):
async with async_playwright() as p:
for browser_type in [p.firefox]: # p.chromium, 用chromium会被检测到, firefox不会
for browser_type in [
p.firefox
]: # p.chromium, 用chromium会被检测到, firefox不会
browser = await browser_type.launch(headless=headless)
page = await browser.new_page()
await page.goto(url, timeout=timeout)
Expand All @@ -63,20 +65,27 @@ async def jable_favourite_playwright(
return

data = await recursion_find_button(page)
assert count == len(data), f"数目不对,请检查,预期数目:{count}, 实际数目:{len(data)}"
assert count == len(
data
), f"数目不对,请检查,预期数目:{count}, 实际数目:{len(data)}"
return data


async def operate_jable_playwright(url, headless=True):
async def operate_jable_playwright(url, headless=False):
async with async_playwright() as p:
for browser_type in [p.firefox]: # p.chromium, 用chromium会被检测到, firefox不会
for browser_type in [
p.firefox
]: # p.chromium, 用chromium会被检测到, firefox不会
browser = await browser_type.launch(
headless=headless,
# executablePath="C:\\Users\\hxse\\AppData\\Local\\ms-playwright\\firefox-1335\\firefox\\firefox.exe",
)
page = await browser.new_page()
await page.goto(url, timeout=timeout)
el = await page.query_selector("div.info-header")
import pdb

pdb.set_trace()
titleEl = await el.query_selector(".header-left h4")
countEl = await page.query_selector(".count")
viewEl = await el.query_selector_all("span.mr-3")
Expand Down Expand Up @@ -124,6 +133,12 @@ async def operate_jable_playwright(url, headless=True):


if __name__ == "__main__":
"""
"pycryptodome",
"cloudscraper==1.2.58",
"playwright-stealth==1.0.5",
"playwright==1.32.1",
"""
url = "https://jable.tv/videos/ipx-252-c/"
obj = asyncio.run(operate_jable_playwright(url, headless=True))
print(obj)
Loading

0 comments on commit 9b234b6

Please sign in to comment.