-
Notifications
You must be signed in to change notification settings - Fork 130
/
Copy pathstep3.py
96 lines (86 loc) · 3.53 KB
/
step3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# encoding=utf-8
# ----------------------------------------------------------------------
# 作用:抓取商店信息
# 日期:2016-12-12
# 作者:九茶<http://blog.csdn.net/bone_ace>
# ----------------------------------------------------------------------
import pymongo
import requests
import re
from lxml import etree
from multiprocessing import Pool, cpu_count
# Handles to the local MongoDB instance shared by every function in this file.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['1212']
# Shop pages to crawl, item ids found on them, and widget URLs still to fetch.
collection_shops, collection_items, collection_items_temp = (
    db[name] for name in ('Tmall_shops', 'Tmall_items', 'Tmall_items_temp')
)
def _store(collection, document):
    """Insert *document* into *collection* on a best-effort basis; never raises."""
    try:
        # NOTE: Collection.insert is the legacy PyMongo call this script was
        # written against; it is kept so the file runs on the same driver.
        collection.insert(document)
    except pymongo.errors.DuplicateKeyError:
        # The same _id was already stored by an earlier run or another worker.
        pass
    except Exception as e:
        # Anything else is worth seeing, but must not abort the current page.
        print(e)


def parse(content, sourceURL, routine):
    """Extract widget URLs and item ids from one shop page and store them.

    Args:
        content: Decoded HTML text of the shop page.
        sourceURL: URL the page was fetched from. Its host is reused to build
            the widget URLs and it is saved as ``ShopURL`` on every record.
        routine: Shop record being processed; only ``routine['Type']`` is read
            and it must be present.

    Returns:
        ``[widget_url_count, distinct_item_id_count]``. ``[0, 0]`` is returned
        (after printing the error) when an unexpected exception interrupts
        parsing.
    """
    try:
        shop_type = routine['Type']
        # Undo the entity escaping of '/', '"' and '&' so the regexes below can
        # match. NOTE(review): the reviewed listing showed these replacements
        # as no-ops (pattern identical to replacement), which is what entity
        # decoding of the listing would produce; the entity names are restored
        # here - confirm them against the repository copy.
        text = content.replace('&#47;', '/').replace('&quot;', '"').replace('&amp;', '&')

        # Collect what is needed to build the widget (JSON) URLs.
        tree = etree.HTML(text)
        site_instance_id = re.findall(r'site_instance_id=(\d+)', text)
        # Guard against a page that yields no element tree at all, so the
        # item-id extraction further down still runs.
        if tree is None:
            widget_ids = []
        else:
            widget_ids = tree.xpath('//div[@class="J_TModule J_TAsyncModule"]/@data-widgetid')

        flag = 0
        if site_instance_id:
            instance = site_instance_id[0]
            # Each entry is (widget id used in the callback name, "ids" value).
            if (instance + '-/p/shj.htm') in text:
                # The "shj" layout: only odd widget ids are requested, each
                # together with its successor id.
                path = '%2Fp%2Fshj.htm'
                widgets = []
                for elem in widget_ids:
                    try:
                        number = int(elem)
                    except ValueError:
                        # A non-numeric id cannot be paired; skip it instead of
                        # discarding the whole page.
                        continue
                    if number % 2:
                        widgets.append((elem, elem + '%2C' + str(number + 1)))
            else:
                path = '%2Fshop%2Fview_shop.htm'
                widgets = [(elem, elem) for elem in widget_ids]

            # The host is the same for every widget, so extract it once. '?'
            # is excluded in both layouts so a query string on a path-less URL
            # never leaks into the host.
            host = re.findall(r'//([^/?]*)', sourceURL)
            if host:
                for elem, ids in widgets:
                    url = ('https://' + host[0] + '/widgetAsync.htm?ids=' + ids +
                           '&path=' + path + '&callback=callbackGetMods' + elem +
                           '&site_instance_id=' + instance)
                    # Counts URLs generated, whether or not they were new.
                    flag += 1
                    _store(collection_items_temp,
                           {'_id': url, 'ShopURL': sourceURL, 'Type': shop_type})
            elif widgets:
                print('No host')

        # Extract item ids. The pattern accepts any parameter whose name ends
        # in "id=" and, being greedy, captures the last one before the closing
        # quote - NOTE(review): confirm that is always the item id.
        items = set(re.findall(r'com/item\.htm[^"]*id=(\d+)', text))
        for elem in items:
            _store(collection_items,
                   {'_id': elem, 'ShopURL': sourceURL, 'Type': shop_type})
        return [flag, len(items)]
    except Exception as e:
        print(e)
        return [0, 0]
def run(routine):
    """Download one shop page and pass it to parse(), retrying on errors.

    ``routine['_id']`` is the shop URL; a protocol-relative URL ("//host/...")
    is fetched over HTTPS. The request is attempted at most 10 times with a
    10-second timeout each. The response body is decoded as GBK, ignoring
    undecodable bytes, before parsing. Progress is reported with print; nothing
    is returned.
    """
    target = routine['_id']
    if target.startswith('//'):
        target = 'https:' + target

    for _ in range(10):
        try:
            response = requests.get(target, timeout=10)
        except Exception as e:
            # Report the error and move on to the next attempt.
            print(e)
            continue
        html = response.content.decode('gbk', 'ignore')
        temp, items = parse(html, target, routine)
        print('Successful: %s (Temp:%s; Items:%s)' % (routine['_id'], temp, items))
        break
    else:
        # Every attempt raised, so the page was never parsed.
        print('Failed: %s' % target)
if __name__ == '__main__':
    # Fan the stored shop records out to one worker process per CPU core.
    # NOTE(review): the module-level MongoClient is created before the pool
    # starts its workers; check PyMongo's fork-safety guidance for this setup.
    workers = Pool(processes=cpu_count())
    shop_records = collection_shops.find()
    workers.map(run, shop_records)
    # Stop accepting work, then wait for the workers to exit.
    workers.close()
    workers.join()