-
Notifications
You must be signed in to change notification settings - Fork 0
/
CarousellClicker.py
120 lines (100 loc) · 4.76 KB
/
CarousellClicker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import shlex
import time
from pathlib import Path

import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
#####################################################
#################### USER INPUTS ####################
#####################################################
# Source: http://image-net.org/challenges/LSVRC/2014/browse-synsets
# search_terms is a dictionary of search_term: list of illegal_entities
# Each key is typed into the Carousell search box; each value is the set of
# classifier labels that mark a listing's image as illegal (see is_illegal).
search_terms = {
'tiger': {'tiger cat', 'tiger'}
}
#####################################################
################### DEFAULT INPUTS ##################
#####################################################
# Do not change unless you know what you are doing
# Directory (relative to the working directory) where images are saved.
download_directory = 'CarousellClicker'
# XPath for Carousell's "Load more" button on the search-results page.
load_more_button_xpath = '//button[text()="Load more"]'
# Seconds to wait for elements to appear / pages to settle.
wait_in_seconds = 10
#####################################################
############## GENERAL SPIDER HELPERS ###############
#####################################################
# TODO: Move this section to a helper file
def download_image(image_url, download_file_path):
    """Download *image_url* and write its raw bytes to *download_file_path*.

    Raises requests.HTTPError on a non-2xx response so an HTML error page
    is never silently saved as an image, and requests.Timeout if the
    server stops responding.
    """
    # A timeout prevents one hung connection from stalling the whole crawl.
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
    with open(download_file_path, 'wb') as f:
        f.write(response.content)
def is_illegal(image_file_path, illegal_entities):
    """Return True if the classifier's top-1 label for the image at
    *image_file_path* is in the set *illegal_entities*.

    Runs the external `predict.sh` script and parses its stdout, which is
    expected to emit `score: label` lines, best prediction first.
    """
    # shlex.quote guards against shell injection through a crafted file name.
    stream = os.popen(f'. predict.sh {shlex.quote(image_file_path)}')
    predictions = [line.split(':')[-1].strip() for line in stream.readlines()]
    # TODO: Add confidence threshold here
    if not predictions:
        # Classifier produced no output (missing script or unreadable image);
        # treat as not illegal rather than crashing with an IndexError.
        return False
    return predictions[0] in illegal_entities  # Top 1 only
#####################################################
############# CAROUSELL SPIDER HELPERS ##############
#####################################################
def get_carousell_search_url(search_term):
    """Build the Carousell (SG) search-results URL for *search_term*."""
    base = 'https://sg.carousell.com/search/'
    return base + search_term  # ?sort_by=time_created,descending'
def is_initial_state(line):
    """Return True when *line* is the inline script tag that assigns
    the page's window.initialState blob."""
    stripped = line.lstrip()
    return stripped.startswith('<script>window.initialState=')
def is_product_img(image_url):
    """Return True when *image_url* points at a Carousell product photo
    (as opposed to avatars, icons, or other page imagery)."""
    product_prefix = 'https://media.karousell.com/media/photos/products/'
    return image_url.startswith(product_prefix)
# https://www.hackerearth.com/practice/notes/praveen97uma/crawling-a-website-that-loads-content-using-javascript-with-selenium-webdriver-in-python
def page_down(browser, page_downs):
    """Press PAGE_DOWN *page_downs* times on the page body, pausing 1s
    between presses so lazy-loaded content has time to appear.

    Returns the browser so the call can be chained/reassigned.
    """
    # find_element(By.TAG_NAME, ...) replaces find_element_by_tag_name,
    # which was removed in Selenium 4 (the file already imports By).
    body = browser.find_element(By.TAG_NAME, 'body')
    for _ in range(page_downs):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(1)  # So that I can see that stuff really happened
    return browser
# NOT USING THIS FOR NOW.
# The website I'm crawling does not seem to react well to this scroll-down program. Perhaps it is
# too aggressive. Refactored from:
# https://stackoverflow.com/questions/22702277/crawl-site-that-has-infinite-scrolling-using-python
def scroll_down(browser):
    """Scroll the page in fixed 200px steps, re-measuring the document
    height after each step, until the next target would pass the bottom."""
    per_scroll = 200
    target = per_scroll
    total_height = browser.execute_script('return document.body.scrollHeight')
    while total_height > target:
        browser.execute_script(f'window.scrollTo(0, {target})')
        time.sleep(1)  # So that I can see that stuff really happened
        total_height = browser.execute_script('return document.body.scrollHeight')
        target += per_scroll
#####################################################
################# CAROUSELL SPIDER ##################
#####################################################
# Main crawl: for each search term, load the results page, click "Load more",
# collect product image URLs, download them, and classify each download.
Path(download_directory).mkdir(parents=True, exist_ok=True)
illegal_items = []
browser = webdriver.Chrome()
try:
    for search_term in search_terms:
        images_to_download = []
        try:
            browser.get(get_carousell_search_url(search_term))
            browser = page_down(browser, 4)  # Arbitrary
            is_element_present = EC.presence_of_element_located((By.XPATH, load_more_button_xpath))
            WebDriverWait(browser, wait_in_seconds).until(is_element_present)
            # find_element(By.XPATH, ...): the find_element_by_* helpers
            # were removed in Selenium 4.
            browser.find_element(By.XPATH, load_more_button_xpath).click()
            time.sleep(wait_in_seconds)
            browser = page_down(browser, 6)  # Arbitrary
            for image in browser.find_elements(By.TAG_NAME, 'img'):
                image_url = image.get_attribute('src')
                # get_attribute can return None for src-less <img> tags;
                # guard before calling str.startswith inside is_product_img.
                if image_url and is_product_img(image_url):
                    images_to_download.append(image_url)
        except TimeoutException:
            print('Timed out.')
        print(images_to_download)  # Remove when not needed
        # NOTE(review): file names restart at 0 for every search term, so a
        # later term's downloads overwrite an earlier term's files on disk.
        for i, image_url in enumerate(images_to_download):
            download_file_path = f'{download_directory}/{i}.jpg'
            download_image(image_url, download_file_path)
            if is_illegal(download_file_path, search_terms[search_term]):
                illegal_items.append(image_url)
finally:
    # Always release the browser process, even if a search term raises
    # something other than TimeoutException.
    browser.quit()
print(illegal_items)