# scrapeVehicles.py loops through all cities listed on craigslist, scrapes every vehicle for sale, and adds each to the vehicles table
# this program takes a while to run; don't start it unless you're willing to let it sit for at least 2 hours
# the database is committed after each city finishes scraping, so results are preserved even if you terminate the program early
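# usage: run directly with `python scrapeVehicles.py`; requires the third-party packages
# psycopg2, lxml, and requests_html (connect and crawlCities are local modules in this repo)
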
import os
import psycopg2
from json import loads
from lxml import html
from datetime import datetime
from requests_html import HTMLSession
from connect import connect
from crawlCities import storeCities


def runScraper():
    storeCities()
    conn = connect()
    # the cities table contains around 480 cities, all of the craigslist pages in North America
    curs = conn.cursor()
    curs.execute("SELECT * FROM cities")
    citiesList = []
    for city in curs.fetchall():
        citiesList.append(city)
    curs.execute('''CREATE TABLE IF NOT EXISTS vehicles(id BIGINT PRIMARY KEY, url TEXT, region TEXT, region_url TEXT,
                    price BIGINT, year BIGINT, manufacturer TEXT, model TEXT, condition TEXT, cylinders TEXT, fuel TEXT,
                    odometer BIGINT, title_status TEXT, transmission TEXT, VIN TEXT, drive TEXT, size TEXT, type TEXT, paint_color TEXT, image_url TEXT,
                    description TEXT, state TEXT, lat REAL, long REAL, posting_date TEXT)''')
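    # note: the column list here must stay in sync with the INSERT statement near the bottom of runScraper()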
    session = HTMLSession()
    # scraped counts all entries gathered
    scraped = 0
    # carBrands dictates what qualifies as a brand so we can pull that data from the 'model' attribute
    carBrands = ["ford", "toyota", "chevrolet", "chev", "chevy", "honda", "jeep", "hyundai", "subaru",
                 "kia", "gmc", "ram", "dodge", "mercedes-benz", "mercedes", "mercedesbenz",
                 "volkswagen", "vw", "bmw", "saturn", "land rover", "landrover", "pontiac",
                 "mitsubishi", "lincoln", "volvo", "mercury", "harley-davidson", "harley",
                 "rover", "buick", "cadillac", "infiniti", "infinity", "audi", "mazda", "chrysler",
                 "acura", "lexus", "nissan", "datsun", "jaguar", "alfa", "alfa-romeo", "aston", "aston-martin",
                 "ferrari", "fiat", "hennessey", "porsche", "noble", "morgan", "mini", "tesla"]
    # if the car year is beyond next year, we toss it out; this variable is used later
    nextYear = datetime.now().year + 1

    # simple txt file mechanism to track scraping progress
    fileName = os.path.dirname(os.path.abspath(__file__)) + "/static/trackVehicleScraping.txt"
    exists = os.path.isfile(fileName)
    if not exists:
        tracker = open(fileName, "w")
        tracker.write("0")
        tracker.close()
    with open(fileName, "r") as tracker:
        cities = int(tracker.readlines()[0])
    citiesCount = len(citiesList)
    citiesList = citiesList[cities:]
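    # the tracker file holds a single integer: how many cities have been fully scraped.
    # slicing citiesList above lets an interrupted run resume where it left off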

    for city in citiesList:
        scrapedInCity = 0
        cities += 1
        print(f"Scraping vehicles from {city[2]}, {citiesCount - cities} cities remain")
        empty = False
        # scrapedIds stores every vehicle id scraped from the current city, so we can delete database
        # records whose id is no longer in scrapedIds, on the assumption that the listing has been removed from craigslist
        scrapedIds = set()
        # track items skipped because they are already in the database
        skipped = 0
        # this loop executes until we run out of search results; craigslist caps results at 3000
        # and cities often contain the full 3000 records (but not always)
        while not empty:
            print(f"Gathering entries {scrapedInCity} through {scrapedInCity + 120}")
            # now we scrape
            searchUrl = f"{city[1]}/d/cars-trucks/search/cta?s={scrapedInCity}"
            try:
                page = session.get(searchUrl)
            except Exception as e:
                # catch any exception and continue the loop if we cannot reach the page for whatever reason
                print(f"Failed to reach {searchUrl}, entries have been dropped: {e}")
                scrapedInCity += 120
                continue
            # each search page contains 120 entries
            scrapedInCity += 120
            tree = html.fromstring(page.content)
            # the following line returns the list of vehicle links on the results page
            vehicles = tree.xpath('//a[@class="result-image gallery"]')
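            # note: this xpath matches craigslist's gallery markup at the time of writing;
            # if craigslist changes its page layout, this selector will need updating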
            if len(vehicles) == 0:
                # if we no longer have entries, continue to the next city
                empty = True
                continue
            vehiclesList = []
            for item in vehicles:
                vehicleDetails = []
                vehicleDetails.append(item.attrib["href"])
                try:
                    # attempt to grab the price of the vehicle; some listings don't have prices
                    # (which throws an exception here) and we don't want those, so we toss them
                    vehicleDetails.append(item[0].text)
                except Exception:
                    continue
                vehiclesList.append(vehicleDetails)
            # loop through each vehicle
            for item in vehiclesList:
                url = item[0]
                try:
                    idpk = int(url.split("/")[-1].strip(".html"))
                except ValueError as e:
                    print(f"{url} does not have a valid id: {e}")
                    continue
                # add the id to scrapedIds for database cleaning purposes
                scrapedIds.add(idpk)
                # vehicle id is the primary key in this table so we can't have repeats; if a record
                # with the same id is found we skip it, as the vehicle has already been stored
                curs.execute("SELECT 1 FROM vehicles WHERE id = %s", (idpk,))
                if len(curs.fetchall()) != 0:
                    skipped += 1
                    continue
                vehicleDict = {}
                # prices are listed like "$4,500", so strip the symbols before converting to int
                vehicleDict["price"] = int(item[1].replace(",", "").strip("$"))
                try:
                    # grab each individual vehicle page
                    page = session.get(url)
                    tree = html.fromstring(page.content)
                except Exception:
                    print(f"Failed to reach {url}, entry has been dropped")
                    continue
                attrs = tree.xpath('//span//b')
                # this fetches a list of attributes about a given vehicle. not every vehicle has every
                # attribute listed on craigslist, so this code gets a little messy as we handle the missing ones
                for item in attrs:
                    try:
                        # model is the only attribute without a label on craigslist, so if this lookup
                        # fails it means we've grabbed the model of the vehicle
                        k = item.getparent().text.strip()
                        k = k.strip(":")
                    except Exception:
                        k = "model"
                    try:
                        # this fails if item.text is None, so we skip the attribute in that case
                        vehicleDict[k] = item.text.strip()
                    except Exception:
                        continue
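                # at this point vehicleDict holds whatever attributes the listing provided, e.g. a
                # hypothetical {"price": 4500, "model": "2010 ford f-150", "condition": "good", "odometer": "98000"}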
                # we assume each of these fields is None until we hear otherwise;
                # that way, try/except clauses can simply pass and leave the values as None
                price = None
                year = None
                manufacturer = None
                model = None
                condition = None
                cylinders = None
                fuel = None
                odometer = None
                title_status = None
                transmission = None
                VIN = None
                drive = None
                size = None
                vehicle_type = None
                paint_color = None
                image_url = None
                lat = None
                long = None
                description = None
                posting_date = None
                # now the code gets repetitive: if we picked up a specific attr in vehicleDict, we change
                # that variable from None. the integer attributes (price/odometer) are guarded in case
                # int() fails, but i have never seen that be the case
                if "price" in vehicleDict:
                    try:
                        price = int(vehicleDict["price"])
                    except Exception as e:
                        print(f"Could not parse price: {e}")
                if "odometer" in vehicleDict:
                    try:
                        odometer = int(vehicleDict["odometer"])
                    except Exception as e:
                        print(f"Could not parse odometer: {e}")
                if "condition" in vehicleDict:
                    condition = vehicleDict["condition"]
                if "model" in vehicleDict:
                    # the model attribute actually contains 3 fields we want: year, manufacturer, and model
                    try:
                        year = int(vehicleDict["model"][:4])
                        if year > nextYear:
                            year = None
                    except Exception:
                        year = None
                    model = vehicleDict["model"][5:]
                    foundManufacturer = False
                    # we check each word of the model string for a match in carBrands (at the top of the program).
                    # if a match is found we have our manufacturer; otherwise model keeps the entire string and manufacturer stays None
                    for word in model.split():
                        if word.lower() in carBrands:
                            foundManufacturer = True
                            model = ""
                            # resolve conflicting manufacturer titles
                            manufacturer = word.lower()
                            if manufacturer == "chev" or manufacturer == "chevy":
                                manufacturer = "chevrolet"
                            if manufacturer == "mercedes" or manufacturer == "mercedesbenz":
                                manufacturer = "mercedes-benz"
                            if manufacturer == "vw":
                                manufacturer = "volkswagen"
                            if manufacturer == "landrover":
                                manufacturer = "land rover"
                            if manufacturer == "harley":
                                manufacturer = "harley-davidson"
                            if manufacturer == "infinity":
                                manufacturer = "infiniti"
                            if manufacturer == "alfa":
                                manufacturer = "alfa-romeo"
                            if manufacturer == "aston":
                                manufacturer = "aston-martin"
                            continue
                        if foundManufacturer:
                            model = model + word.lower() + " "
                    model = model.strip()
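                    # example (hypothetical): "2010 ford f-150 xlt" -> year=2010, manufacturer="ford", model="f-150 xlt"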
if "cylinders" in vehicleDict:
cylinders = vehicleDict["cylinders"]
if "fuel" in vehicleDict:
fuel = vehicleDict["fuel"]
if "odometer" in vehicleDict:
odometer = vehicleDict["odometer"]
if "title status" in vehicleDict:
title_status = vehicleDict["title status"]
if "transmission" in vehicleDict:
transmission = vehicleDict["transmission"]
if "VIN" in vehicleDict:
VIN = vehicleDict["VIN"]
if "drive" in vehicleDict:
drive = vehicleDict["drive"]
if "size" in vehicleDict:
size = vehicleDict["size"]
if "type" in vehicleDict:
vehicle_type = vehicleDict["type"]
if "paint color" in vehicleDict:
paint_color = vehicleDict["paint color"]
                # now let's fetch the image url if one exists
                try:
                    img = tree.xpath('//div[@class="slide first visible"]//img')
                    image_url = img[0].attrib["src"]
                except Exception:
                    pass
                # try to fetch lat/long; they remain None if they do not exist
                try:
                    location = tree.xpath("//div[@id='map']")
                    lat = float(location[0].attrib["data-latitude"])
                    long = float(location[0].attrib["data-longitude"])
                except Exception:
                    pass
                # try to fetch a vehicle description; it remains None if it does not exist
                try:
                    location = tree.xpath("//section[@id='postingbody']")
                    description = location[0].text_content().replace("\n", " ").replace("QR Code Link to This Post", "").strip()
                except Exception:
                    pass
                # grab the posting date from the page's <time> element
                try:
                    posting_date = tree.xpath("//div[@class='postinginfos']//p[@class='postinginfo reveal']//time")[0].get("datetime")
                except Exception as e:
                    print(e)
                # finally we insert the entry into the database
                # note: psycopg2 uses %s placeholders, not the ? style used by sqlite3
                curs.execute('''INSERT INTO vehicles(id, url, region, region_url, price, year, manufacturer, model, condition,
                                cylinders, fuel, odometer, title_status, transmission, VIN, drive, size, type,
                                paint_color, image_url, description, lat, long, state, posting_date)
                                VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
                             (idpk, url, city[2], city[1], price, year, manufacturer, model, condition, cylinders,
                              fuel, odometer, title_status, transmission, VIN, drive,
                              size, vehicle_type, paint_color, image_url, description, lat, long, city[3], posting_date))
                scraped += 1
            # these lines execute every time we finish a results page (after 120 entries)
            print(f"{scraped} vehicles scraped")
        # now, to clean the database, we grab all ids already logged for this city
        curs.execute("SELECT id FROM vehicles WHERE region_url = %s", (city[1],))
        deleted = 0
        # if a given id is not in scrapedIds (the ids we just scraped), the listing no longer
        # exists on craigslist and we remove it from the database
        for oldId in curs.fetchall():
            if int(oldId[0]) not in scrapedIds:
                curs.execute("DELETE FROM vehicles WHERE id = %s", (oldId[0],))
                deleted += 1
        print(f"Deleted {deleted} old records, {skipped} records skipped as they are already stored")
        conn.commit()
        # update the progress file
        with open(fileName, "w") as tracker:
            tracker.write(str(cities))

    # delete the tracker file once every city has been scraped
    os.remove(fileName)
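    # removing the tracker means the next full run starts again from city 0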
    curs.execute("SELECT Count(*) FROM vehicles")
    print("Table vehicles successfully updated, {} entries exist".format(curs.fetchall()[0][0]))
    conn.close()


def main():
    runScraper()


if __name__ == "__main__":
    main()