-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_recursively.py
executable file
·50 lines (46 loc) · 1.87 KB
/
fetch_recursively.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env python
from sys import argv
import random
import urllib
def fetch(src_url, level):
## first fetch the source HTML page
num = 100000 * random.random()
filename = str(num)
urllib.urlretrieve(src_url, filename=filename)
txt = open(filename, "r")
for line in txt:
index = line.find(".pdf")
index_ppt = line.find(".ppt")
if index != -1 or index_ppt != -1:
lists = line.split("\"")
rets = [list for list in lists if list.find(".pdf") != -1 or list.find(".ppt") != -1]
for ret in rets:
# for relative path, combine it with previous path
if line.find("http:") == -1:
fetch_url = url + str(ret)
name = str(ret).split("/")[-1]
if fetch_url.find("<") == -1 or fetch_url.find(">") == -1:
print fetch_url + " ---> " + name
urllib.urlretrieve(fetch_url, filename=name)
# for absolute path, just use it
else:
name = str(ret).split("/")[-1]
# fix bugs: ret may not contain "http", just with "ppt/pdf"(line has but ret not)
if ret.find("http:") != -1:
print ret + " ---> " + name
urllib.urlretrieve(ret, filename=name)
else:
if line.find("http:") != -1 and level != 0: # there is an url
lists = line.split("\"")
rets = [list for list in lists if list.find(".htm") != -1 or list.find(".html") != -1]
for ret in rets:
if ret.find("http:") != -1:
print "Now there is a new url, fetch it:%s"%ret
fetch(ret, 0)
### This needs to make sure again and again!
url="http://cs224d.stanford.edu/"
print "NOTE: add url webpage and make sure the fetch url is correct, especially the base url!"
src_url = argv[1]
level = argv[2]
print "source webpage: " + src_url
fetch(src_url, level)