-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathjjwxc.py
98 lines (70 loc) · 3.17 KB
/
jjwxc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# -*- coding:utf8 -*-
import requests
import lxml.html
from itertools import product
from time import sleep
import random
import sys
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Configure a headless Chrome instance that is shared by the whole script.
chrome_options = Options()
chrome_options.add_argument('--headless')  # no visible browser window
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# On Windows the chromedriver binary is expected next to the script;
# elsewhere the service is created with its default arguments.
svc = (
    webdriver.ChromeService(executable_path='./chromedriver.exe')
    if sys.platform == 'win32'
    else webdriver.ChromeService()
)
driver = webdriver.Chrome(options=chrome_options, service=svc)
def jj_Download(chapters_url, chapters_title, novel_name):
    """Download every chapter and append it to the novel's text file.

    Each chapter page is loaded with the module-level Selenium ``driver``.
    The chapter body is taken from the text nodes that are preceding
    siblings of the first ``<div align="right">`` found after the first
    ``<div style="clear:both;">`` on the page.

    Args:
        chapters_url: Sequence of chapter page URLs, in reading order.
        chapters_title: Sequence of chapter titles, parallel to
            ``chapters_url``.
        novel_name: Path of the UTF-8 text file the chapters are appended to.
    """
    for index, chapter_url in enumerate(chapters_url):
        # Pair URL and title by position. If the title list is shorter than
        # the URL list, fall back to a numbered heading instead of reusing
        # a title that belongs to another chapter.
        if index < len(chapters_title):
            title = chapters_title[index]
        else:
            title = f"Chapter {index + 1}"
        print(title + " Downloading......")

        driver.get(chapter_url)
        selector = lxml.html.fromstring(driver.page_source)

        # Locate the two marker divs without assuming they exist, so a page
        # with a different layout yields an empty body rather than an
        # IndexError that aborts the whole download.
        clear_divs = selector.xpath('//div[@style="clear:both;"]')
        right_divs = (
            clear_divs[0].xpath('following::div[@align="right"]')
            if clear_divs else []
        )
        text_nodes = (
            right_divs[0].xpath('preceding-sibling::text()')
            if right_divs else []
        )

        # Always produce a string (possibly empty) so the concatenation
        # below cannot fail on an empty node list.
        content_text = '\n'.join(text_nodes).strip()
        content = '\n' + "##" + title + '\n' + content_text

        # The context manager closes the file; no explicit close() needed.
        with open(novel_name, 'a', encoding="utf-8") as f:
            f.write(content)
            f.write('\n')

        # Random pause between requests.
        sleep(random.random()*3+random.random())
# Script entry: ask for a novel id, read the novel's index page, save the
# cover image (best effort) and download all chapters. The shared browser is
# always shut down, even when one of the steps fails.
try:
    novel_id = input("Enter novelid:").strip()
    url = "http://www.jjwxc.net/onebook.php?novelid=" + novel_id
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(url, timeout=30).content
    tree = lxml.html.fromstring(res)

    chapters_url = tree.xpath('//tr[@itemprop="chapter"]//a/@href | //tr[@itemprop="chapter newestChapter"]//a/@href')
    chapters_title = tree.xpath('//tr[@itemprop="chapter"]//a/text() | //tr[@itemprop="chapter newestChapter"]//a/text()')

    novel_matches = tree.xpath('//span[@itemprop="articleSection"]/text()')
    author_matches = tree.xpath('//span[@itemprop="author"]/text()')
    if not novel_matches or not author_matches:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("Could not find the novel title or author; check the novelid.")
    novel = novel_matches[0]
    author = author_matches[0]
    print(f"Download {novel} by {author}")
    novel_name = novel + " " + author + ".txt"

    # Download the cover image. The cover is optional: a missing or
    # unreachable image must not prevent the chapters from being downloaded.
    image_elements = tree.xpath('//img[@class="noveldefaultimage"]')
    image_url = image_elements[0].get('src') if image_elements else None
    if image_url:
        try:
            response = requests.get(image_url, timeout=30)
        except requests.RequestException as exc:
            print(f"Cover image download failed: {exc}")
        else:
            if response.ok:
                filename = novel + '.jpg'
                with open(filename, 'wb') as file:
                    file.write(response.content)
                print('Cover image downloaded successfully!')
            else:
                print(f"Cover image download failed: HTTP {response.status_code}")
    else:
        print('No cover image found, skipping.')

    jj_Download(chapters_url, chapters_title, novel_name)
finally:
    # Always terminate the headless Chrome process started at module load.
    driver.quit()