Merge pull request #2 from ieguiguren/samair
Samair
pgaref authored Jul 13, 2016
2 parents dc09188 + d618713 commit 9cc1278
Showing 2 changed files with 32 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -25,6 +25,7 @@ The project code in this repository is crawling three different public proxy web
* http://proxyfor.eu/geo.php
* http://free-proxy-list.net
* http://rebro.weebly.com/proxy-list.html
* http://www.samair.ru/proxy/time-01.htm

After collecting the proxy data and filtering out the slowest proxies, it randomly selects one of them to query the target URL.
The request timeout is configured at 30 seconds, and if a proxy fails to return a response it is deleted from the application's proxy list.
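
A minimal usage sketch of the flow described above, assuming the project's packages are importable, that the parser class defined in project/http/requests/proxy/requestProxy.py is named RequestProxy, and that generate_proxied_request returns the requests response object (or None when the proxied call fails); none of these names or return values are confirmed by this diff:

# Hypothetical usage sketch; the class name RequestProxy is assumed, not shown in this diff.
from project.http.requests.proxy.requestProxy import RequestProxy

req_proxy = RequestProxy()  # crawls the four proxy sites listed above
print "Collected " + str(len(req_proxy.get_proxy_list())) + " proxies"

# Picks a random proxy with a 30-second timeout; per the README, a proxy
# that fails to respond is removed from the internal list.
response = req_proxy.generate_proxied_request("http://example.com")
if response is not None:
    print response.status_code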
31 changes: 31 additions & 0 deletions project/http/requests/proxy/requestProxy.py
@@ -22,6 +22,8 @@ def __init__(self, web_proxy_list=[]):
        self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php', 100.0)
        self.proxy_list += self.freeProxy_url_parser('http://free-proxy-list.net')
        self.proxy_list += self.weebly_url_parser('http://rebro.weebly.com/proxy-list.html')
        self.proxy_list += self.samair_url_parser('http://www.samair.ru/proxy/time-01.htm')


    def get_proxy_list(self):
        return self.proxy_list
@@ -127,6 +129,34 @@ def weebly_url_parser(self, web_url):
            curr_proxy_list.append(proxy.__str__())
        return curr_proxy_list

    def samair_url_parser(self, web_url, speed_in_KBs=100.0):
        curr_proxy_list = []
        content = requests.get(web_url).content
        soup = BeautifulSoup(content, "html.parser")
        # The page hides the port numbers behind CSS classes, so fetch the
        # linked stylesheet and build a class-name -> port mapping from it.
        for href in soup.findAll('link'):
            if '/styles/' in href.get('href'):
                style = "http://www.samair.ru" + href.get('href')
                break
        css = requests.get(style).content.split('\n')
        css.pop()  # drop the trailing empty entry left by the final newline
        ports = {}
        for l in css:
            # each rule maps a class name to the quoted port string
            p = l.split(' ')
            key = p[0].split(':')[0][1:]
            value = p[1].split('\"')[1]
            ports[key] = value

        table = soup.find("table", attrs={"id": "proxylist"})

        # The first tr contains the field names.
        headings = [th.get_text() for th in table.find("tr").find_all("th")]

        # Each proxy row carries its IP in a span whose class encodes the port.
        for row in table.find_all("span")[1:]:
            curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])

        return curr_proxy_list
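
To make the class-to-port mapping above concrete, here is a worked example on a single hypothetical stylesheet rule (the real rules on samair.ru may differ in detail, but for the ip + content concatenation in the loop above to yield ip:port, the quoted value has to carry the leading colon already):

# Hypothetical CSS rule in the shape the parser expects: one space between
# the class selector and the braced, quoted content value.
l = '.abc123:after {content:":3128"}'

p = l.split(' ')              # ['.abc123:after', '{content:":3128"}']
key = p[0].split(':')[0][1:]  # 'abc123' -- the class name without the dot
value = p[1].split('"')[1]    # ':3128'  -- the port, colon included
ports = {key: value}          # a span with class 'abc123' and text
                              # '93.184.216.34' then becomes
                              # 'http://93.184.216.34:3128'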

    def generate_proxied_request(self, url, params={}, req_timeout=30):
        #if len(self.proxy_list) < 2:
        #   self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php')
@@ -137,6 +167,7 @@ def generate_proxied_request(self, url, params={}, req_timeout=30):
        request = None
        try:
            rand_proxy = random.choice(self.proxy_list)
            print "Next proxy: " + str(rand_proxy)
            request = requests.get(test_url, proxies={"http": rand_proxy},
                                   headers=req_headers, timeout=req_timeout)
        except ConnectionError:
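
The except clause above presumably relies on requests' ConnectionError being imported elsewhere in the module (the import is outside this hunk; in Python 2 there is no built-in ConnectionError). A stripped-down sketch of the same pattern, with a placeholder proxy address:

# Assumed import, not visible in this hunk.
from requests.exceptions import ConnectionError
import requests

try:
    response = requests.get("http://example.com",
                            proxies={"http": "http://93.184.216.34:3128"},
                            timeout=30)
except ConnectionError:
    response = None  # the caller can then discard this proxy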
