-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweb_scrape.py
57 lines (46 loc) · 1.9 KB
/
web_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
__author__ = 'Max Rosett'
from bs4 import BeautifulSoup
import requests
import random
import time
import pandas as pd
'''
def clean_housing(housing_str):
TODO: Add method for parsing this
return housing_str'''
# Crawl the Craigslist SF apartment search results page by page, collect one
# dict per listing row, and write everything to ``sf_scrape.csv``.

START_URL = 'https://sfbay.craigslist.org/search/sfc/apa'
# Seconds to wait for a response; without a timeout a stalled connection
# would block the script forever.
REQUEST_TIMEOUT = 30


def _parse_listing(listing):
    """Extract the fields of one ``<p class="row">`` search result.

    Args:
        listing: Beautiful Soup tag for a single result row.

    Returns:
        dict with ``data-pid`` always set, plus ``data-repost-of``, ``href``,
        ``description``, ``price``, ``housing``, ``neighborhood`` and ``time``
        for whichever of those are present in the row. Absent fields are
        simply left out of the dict.

    Raises:
        KeyError: if the row has no ``data-pid`` attribute, the title link
            has no ``href``, or the ``<time>`` tag has no ``datetime``.
    """
    record = {'data-pid': listing['data-pid']}
    # ``has_attr`` is the supported replacement for the deprecated ``has_key``.
    if listing.has_attr('data-repost-of'):
        record['data-repost-of'] = listing['data-repost-of']

    # The title link carries both the URL and the human-readable description.
    title_link = listing.find("a", "hdrlnk")
    if title_link is not None:
        record['href'] = title_link['href']
        record['description'] = title_link.get_text().strip()

    # For these two fields the span's CSS class doubles as the output key.
    for field in ('price', 'housing'):
        span = listing.find("span", field)
        if span is not None:
            record[field] = span.get_text().strip()

    # The neighborhood is the <small> nested inside <span class="pnr">.
    pnr = listing.find("span", "pnr")
    small = pnr.find("small") if pnr is not None else None
    if small is not None:
        record['neighborhood'] = small.get_text().strip()

    timestamp = listing.find("time")
    if timestamp is not None:
        record['time'] = timestamp['datetime']
    return record


next_url = START_URL
listings = []
while next_url is not None:
    try:
        result = requests.get(next_url, timeout=REQUEST_TIMEOUT)
        result.raise_for_status()
    except requests.RequestException as err:
        # Stop crawling but fall through to the CSV export so that the rows
        # gathered from earlier pages are not lost.
        print("Request for {} failed: {}".format(next_url, err))
        break
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(result.content, 'html.parser')
    # Offline debugging alternative: parse a saved copy instead of fetching.
    # result = open("sf_apts.html", "r").read()
    # soup = BeautifulSoup(result, 'html.parser')

    # Pagination follows <link rel="next">; its absence marks the last page.
    next_link = soup.find('link', rel='next')
    next_url = next_link['href'] if next_link is not None else None
    print(next_url)

    listings.extend(_parse_listing(row) for row in soup.find_all("p", "row"))

    # Random pause between page fetches; pointless once there is no next page.
    if next_url is not None:
        time.sleep(random.randint(5, 10))

df = pd.DataFrame(listings)
df.to_csv('sf_scrape.csv')
'''saved_html= open("sf_apts.html","w")
saved_html.write(str(soup.prettify()))
saved_html.close()'''