-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDigikeyCrawler.py
63 lines (47 loc) · 1.74 KB
/
DigikeyCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import argparse
import csv
import re
from urllib.parse import quote
import httplib2
from bs4 import BeautifulSoup
from labelannotator import *
URL_PREFIX = 'http://search.digikey.com/scripts/DkSearch/dksus.dll?Detail?name='
h = httplib2.Http('.cache')
PARAMETRICS_BLACKLIST = ['Quantity Available']
HEADER_CONTINUE = ['Categories']
def parse_digikey_table(soup_table):
    """Extract header/value text pairs from a Digi-Key product table.

    Args:
        soup_table: a BeautifulSoup ``<table>`` Tag, or None (e.g. when
            ``soup.find(...)`` did not locate the table on the page).

    Returns:
        dict mapping header text -> cell text. Rows whose header is in
        PARAMETRICS_BLACKLIST are skipped, as are rows with no
        identifiable header.
    """
    elements = {}
    # Guard: callers pass soup.find(...) results directly, which are None
    # when the table is absent; previously this raised AttributeError.
    if soup_table is None:
        return elements
    last_header = None
    for row in soup_table.findChildren('tr'):
        header = row.find('th')
        value = row.find('td')
        if value is None:
            continue
        if header is None and last_header in HEADER_CONTINUE:
            # Continuation row (e.g. a multi-row 'Categories' cell):
            # attribute the value to the previous header.
            header = last_header
        elif header is not None:
            header = header.get_text().strip()
            last_header = header
        else:
            # No header and not a continuation row: nothing to key on.
            # (The original code stored these under the key None.)
            continue
        if header not in PARAMETRICS_BLACKLIST:
            elements[header] = value.get_text().strip()
    return elements
def DigikeyCrawl(row_dict):
    """Fetch a part's Digi-Key product page and scrape its parametric tables.

    Args:
        row_dict: mapping for one input row; only the 'digikey_pn' key
            (Digi-Key part number) is read.

    Returns:
        {} when the row has no part number, otherwise a dict with a
        single 'parametrics' key whose value is the str() of the scraped
        header->value dict.
    """
    if 'digikey_pn' not in row_dict or not row_dict['digikey_pn']:
        return {}
    url = URL_PREFIX + quote(row_dict['digikey_pn'])
    print("Fetch digikey_pn='%s' from %s" % (row_dict['digikey_pn'], url))
    # Responses are cached on disk via httplib2.Http('.cache').
    _, content = h.request(url, headers={'user-agent': '=)'})
    content = content.decode('utf-8')
    # The part attributes table has a hanging </a> tag. Fail...
    # Strip all anchor tags so the parser doesn't choke on the mismatch.
    content = re.sub(r'</a>', '', content)
    content = re.sub(r'<a[^>]*>', '', content)
    # NOTE(review): the next replace looks intended to strip '\xa0'
    # (non-breaking space) but appears here as a plain space, which would
    # delete every space on the page -- confirm against the original file.
    content = content.replace(' ', '')
    content = content.replace('\n', '')
    content = content.replace('\t', '')
    soup = BeautifulSoup(content, 'html.parser')
    parametrics = {}
    # Guard each lookup: soup.find() returns None when the page layout
    # changes or the part number was not found, which previously crashed
    # inside parse_digikey_table.
    for table_id in ('product-overview', 'product-attribute-table'):
        table = soup.find('table', id=table_id)
        if table is not None:
            parametrics.update(parse_digikey_table(table))
    return {'parametrics': str(parametrics)}
# Script entry point: load the input rows via labelannotator, append the
# scraped 'parametrics' column to each row, and write the result back out.
# NOTE(review): runs at import time (no __main__ guard) -- presumably this
# file is only ever executed as a script; confirm before importing it.
load().map_append(DigikeyCrawl) \
.write()