-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgraburls.pl
executable file
·137 lines (127 loc) · 3.62 KB
/
graburls.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/perl
#use strict;
use Getopt::Std;
use LWP::Simple;
use HTML::Parser;
# catch_zap()
# SIGINT (Ctrl-C) handler. The body is intentionally a no-op: the original
# "finish the current fetch, then quit" logic is kept below, commented out.
# NOTE(review): while this handler is installed, Ctrl-C no longer stops the
# script, because the handler does nothing and simply returns.
sub catch_zap {
    # print "Caught a ctrl-c will quit after next fetching.\n";
    # $done = 1;
    return;
}

# Route interrupt signals to the handler above.
$SIG{'INT'} = \&catch_zap;
# validateUrl($strUrl)
# Decide whether a string looks like an http, https or ftp URL.
#
# Accepted shapes (scheme match is case-insensitive, checked from the start
# of the string):
#   scheme://user:password@host.name...
#   scheme://host.name...
#   scheme://N.N.N.N...          (dotted quad, 1-3 digits per part)
#
# Parameters: $strUrl - the candidate string; undef is treated as invalid.
# Returns:    1 when one of the shapes matches, 0 otherwise.
sub validateUrl {
    my ($strUrl) = @_;
    return 0 unless defined $strUrl;

    # Letters are spelled [A-Za-z]: the range A-z would also admit the
    # punctuation characters that sit between 'Z' and 'a' in ASCII.
    # Every alternative is anchored so that a URL merely embedded in some
    # other text (e.g. inside a javascript: link) is not accepted.
    return 1
        if $strUrl =~ m!\A(?:https?|ftp)://[A-Za-z\d]+:[A-Za-z\d]+\@(?:[A-Za-z\d\-.]+\.)+[A-Za-z]!i;
    return 1
        if $strUrl =~ m!\A(?:https?|ftp)://(?:[A-Za-z\d\-.]+\.)+[A-Za-z]!i;
    return 1
        if $strUrl =~ m!\A(?:https?|ftp)://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}!i;

    return 0;
}
#
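# Grab all links from a local or remote HTML file.
# (Perl HTML munging.)
#
# Option -a keeps only absolute URLs; option -r keeps only relative URLs.
# Option -d takes a depth value that is written next to every saved URL.
# Get the options and the argument.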
#
# Command-line handling: -a and -r are plain flags, -d takes a value.
my %opts;
my $usage = "Usage: $0 [-d depth] [-a | -r] filename [| URL]\n";

# getopts returns false for an unknown switch; stop instead of carrying on
# with a half-parsed command line.
getopts('ard:', \%opts)
    or die $usage;

my $arg = shift;
die $usage
    if (not defined $arg or $opts{a} && $opts{r});    # -a and -r are mutually exclusive

# Get the page, either over the network or from a local file.
#
my $page;
if ($arg =~ m!^https?://!i) {
    # https arguments are fetched too, rather than being mistaken for a
    # local file name.
    $page = get($arg);

    # A failed get() yields undef. An empty document is still a valid page,
    # and $! carries nothing meaningful here, so it is left out of the message.
    defined $page
        or die "Couldn't get $arg\n";
}
else {
    open my $in_fh, '<', $arg
        or die "Couldn't open $arg: $!\n";
    $page = do { local $/; <$in_fh> };    # slurp the whole file
    close $in_fh;
}
# Collected href/src values; start() appends to this list while parsing.
my @links;

# Build the parser: every start tag is handed to start() together with the
# tag name and its attributes.
my $parser = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&start, 'tagname, attr' ],
);
# start($tag, $attr)
# Start-tag handler registered with the parser. Collects the link target of
# <a href="..."> and <embed src="..."> tags into @links, honouring the
# command-line filters held in %opts:
#   -r  drop absolute URLs (keep relative ones)
#   -a  drop relative URLs (keep absolute ones)
#
# Parameters: $tag  - tag name
#             $attr - hash reference of the tag's attributes
# Returns:    nothing useful; the result is the side effect on @links.
sub start {
    my ($tag, $attr) = @_;

    # Which attribute carries the link for each tag we care about.
    my %link_attr_for = (a => 'href', embed => 'src');

    my $attr_name = $link_attr_for{$tag}
        or return;
    my $url = $attr->{$attr_name};
    return unless defined $url;

    # A URL is absolute when it *starts* with http:// or https://. Both
    # filters use this one test so they always agree with each other
    # (an "http://" buried in a query string does not make a link absolute).
    my $is_absolute = $url =~ m!^https?://!i;

    return if $is_absolute  and $opts{r};    # exclude absolute url when -r
    return if !$is_absolute and $opts{a};    # exclude relative url when -a

    push @links, $url;
    return;
}
# Run the parser over the whole document; start() fills @links as it goes.
$parser->parse($page);
$parser->eof;

# Depth value written next to every URL; empty when -d was not given.
my $depth = defined $opts{d} ? $opts{d} : '';

# File extensions that are never written to urls.txt.
my $skipped_ext = qr/\.(?:zip|wmv|mov|pdf|dmg|mp3|mp4|flv|rar|000|xml|svg|wav|iso|exe|dir|dcr|jpg|gif|png|avi)$/i;

# output
#
# Disabled database output, kept for reference:
#$dbh = Mysql->connect($host,$user,$passwd,$dbname);
#map {$sth = $dbh->query("insert into crawlurl values ('','$_','$depth','2001-01-01 00:00:00','','','')")} @links;
#     $sth = $dbh->query("insert into applog values (now(),'skipped $mylink')");

open my $out_fh, '>>', 'urls.txt'
    or die "Couldn't open urls.txt: $!\n";

# swf.txt is opened on first use, so it is only created when a .swf link
# is actually found, and it is opened once instead of once per link.
my $swf_fh;

foreach my $found (@links) {
    next unless validateUrl($found);

    # Normalise a copy, leaving @links untouched: drop the fragment first
    # (from the first '#'), then one trailing slash. Doing it in this order
    # makes "http://x/#top" and "http://x/" both come out as "http://x".
    my $url = $found;
    $url =~ s/#.*\z//s;
    $url =~ s{/\z}{};

    # Flash files are logged separately and never queued.
    if ($url =~ /\.swf$/i) {
        if (!$swf_fh) {
            open $swf_fh, '>>', 'swf.txt'
                or die "Couldn't open swf.txt: $!\n";
        }
        print {$swf_fh} "$url\n";
        next;
    }

    # Everything below is silently skipped. Requiring an http prefix also
    # covers javascript: and mailto: links.
    next if $url !~ /^http/i;
    next if $url =~ /download\.(?:jsp|php)/i;
    next if $url =~ /\.collegeclown\.com/i;
    next if $url =~ $skipped_ext;
    next if $url =~ m!/1\.htm$!i;

    print {$out_fh} "$url|$depth\n";
}

# Buffered write errors only surface at close, so check it.
if ($swf_fh) {
    close $swf_fh
        or die "Couldn't close swf.txt: $!\n";
}
close $out_fh
    or die "Couldn't close urls.txt: $!\n";