-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path爬取腾讯招聘.py
103 lines (83 loc) · 3.06 KB
/
爬取腾讯招聘.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import requests
from datetime import datetime
from lxml import html
import csv
import time
from selenium import webdriver
# Build the function that fetches the detail-page links
def fetch_links(page_index, timeout=10):
    """Return the detail-page URLs listed on one page of the job query API.

    Args:
        page_index: Page number sent as the ``pageIndex`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list holding the ``PostURL`` value of every post on the page. The
        list is empty when the request fails, the status code is not 200,
        the body is not valid JSON, or the response carries no posts.
    """
    base_url = "https://careers.tencent.com/tencentcareer/api/post/Query"
    params = {
        # Current time as a millisecond timestamp.
        'timestamp': int(datetime.now().timestamp() * 1000),
        'countryId': '',
        'cityId': '',
        'bgIds': '',
        'productId': '',
        'categoryId': '40001001,40001002,40001003,40001004,40001005,40001006',
        'parentCategoryId': '',
        'attrId': '1',
        'keyword': '',
        'pageIndex': page_index,
        'pageSize': 10,
        'language': 'zh-cn',
        'area': 'cn',
    }

    try:
        # Without an explicit timeout requests can wait forever on a
        # server that accepts the connection but never answers.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch data. Request error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return []

    try:
        data = response.json()
    except ValueError as exc:
        print(f"Failed to fetch data. Invalid JSON: {exc}")
        return []

    # A key that is present with a null value bypasses a .get() default,
    # so fall back with ``or`` instead of relying on the default argument.
    payload = data.get('Data') if isinstance(data, dict) else None
    posts = (payload or {}).get('Posts') or []
    return [post['PostURL'] for post in posts if post.get('PostURL')]
def get_detail_info(link, wait_seconds=2):
    """Load a job detail page in headless Chrome and extract its fields.

    Args:
        link: URL of the detail page to open.
        wait_seconds: Fixed delay after navigation that gives the page time
            to finish loading before its source is read.

    Returns:
        A dict with the keys ``title``, ``duty``, ``requirements``,
        ``add_points`` and ``link``. Each of the first four holds the first
        text node matched by its XPath query, or ``None`` when nothing
        matched; ``link`` echoes the input URL.
    """
    chrome_options = webdriver.ChromeOptions()
    # Headless mode: no visible browser window is opened.
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(link)
        time.sleep(wait_seconds)
        page_source = driver.page_source
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window. The finally clause guarantees the browser is
        # released even when navigation raises.
        driver.quit()

    tree = html.fromstring(page_source)

    def first_text(xpath_query):
        """Return the first result of the XPath query, or None if empty."""
        matches = tree.xpath(xpath_query)
        return matches[0] if matches else None

    # NOTE(review): only the first matched text node of each section is
    # kept, as in the original code - confirm that is the intent.
    return {
        'title': first_text("//div[@class='job-text-wrapper']/span/text()"),
        'duty': first_text("//div[@class='duty work-module']//li/text()"),
        'requirements': first_text(
            "//div[@class='requirement work-module']//li/text()"
        ),
        'add_points': first_text("//div[@class='work-module']//li/text()"),
        "link": link,
    }
# Accumulates one detail dict per scraped job posting.
all_details = []

# Walk listing pages 1 and 2 and scrape every detail page they link to.
for page_index in range(1, 3):
    links = fetch_links(page_index)
    # An empty result simply yields no iterations.
    for link in links or ():
        detail_info = get_detail_info(link)
        print(detail_info)
        all_details.append(detail_info)
    print("正在打印第{}页".format(page_index))

# Column order of the output file; matches the keys of each detail dict.
field_names = ['title', 'duty', 'requirements', 'add_points', 'link']

# Persist everything that was collected as a CSV file with a header row.
# newline='' leaves line-ending handling to the csv module.
with open('details.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=field_names)
    writer.writeheader()
    for row in all_details:
        writer.writerow(row)

print("Details saved to 'details.csv'")