-
Notifications
You must be signed in to change notification settings - Fork 2
/
parseSearchResultsExternal.pl
executable file
·103 lines (76 loc) · 2.15 KB
/
parseSearchResultsExternal.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/perl -w
#require 5;
use strict;
use warnings;
use HTML::Tree;
use HTML::TreeBuilder;
# ---------------------------------------------------------------------------
# Main script.
#
# Takes one argument: the directory holding the saved search-result pages.
# Result pages are named "<docId>.<page>.html" and the page saved for the
# Nth link of a result page is expected at "<docId>.<page>.<N>.html".
# For every link found on such a saved page that points outside the domain
# of the corresponding result link, one tab-separated row is written to
# STDOUT: ID, Result Page, Result Number, Link.
# ---------------------------------------------------------------------------

# Wrong invocation is an error: report it on STDERR with a non-zero status
# so that a pipeline consuming the TSV does not mistake the usage text for
# data.
if (scalar(@ARGV) != 1) {
    print STDERR "Usage: parseSearchResultsExternal.pl outdir\n";
    print STDERR "\tGets all of the external links from doctor search results by output directory of grabSearchResults. Prints them to STDOUT\n";
    exit 1;
}

my $infolder = $ARGV[0];

# Fail loudly when the directory cannot be read instead of silently
# producing a header-only report.
opendir(my $dir, $infolder) or die "Couldn't open directory $infolder: $!";
my @files = readdir($dir);
closedir($dir);

print "ID\tResult Page\tResult Number\tLink\n";

# Pick out the result pages. The pattern is anchored at both ends so that
# names such as "x-1.2.html.bak" are not mistaken for result pages.
my @resultPages;
foreach my $file (@files) {
    if ($file =~ m/\A(\w+-\d+)\.(\d+)\.html\z/) {
        push(@resultPages, { file => $file, docId => $1, page => $2 });
    }
}

# readdir returns entries in no particular order; sort by ID and then by
# numeric page so that the report is reproducible between runs.
@resultPages = sort {
    $a->{docId} cmp $b->{docId} || $a->{page} <=> $b->{page}
} @resultPages;

foreach my $resultPage (@resultPages) {
    my $docId = $resultPage->{docId};
    my $page  = $resultPage->{page};

    my @googleLinks = getSubLinks("$infolder/$resultPage->{file}");
    for my $i (0 .. $#googleLinks) {
        # Result numbers (and the saved file names) are 1-based.
        my $cnt    = $i + 1;
        my $domain = getDomain($googleLinks[$i]);
        my @links  = getExternalLinks("$infolder/$docId.$page.$cnt.html", $domain);
        foreach my $link (@links) {
            print "$docId\t$page\t$cnt\t$link\n";
        }
    }
}
# getDomain($url)
#
# Returns the last two dot-separated labels of the host part of $url,
# e.g. "http://www.example.com/page" -> "example.com".
#
# The scheme ("http://" or "https://", in any letter case, or a bare "//"),
# any "user@" prefix, a ":port" suffix, a trailing dot, and everything from
# the first "/", "?" or "#" onwards are discarded before the host is
# examined. A URL without a scheme ("example.com/page") is accepted too.
#
# Limitation: only the last two labels are kept, so a host such as
# "www.bbc.co.uk" yields "co.uk".
#
# Dies with "Couldn't get domain from <url>" when no host containing at
# least one dot can be found (for example for a relative link).
sub getDomain {
    my $url = shift;

    # Keep the caller's value intact for the error message.
    my $original = defined $url ? $url : '';
    my $host     = $original;

    $host =~ s{\A(?:https?:)?//}{}i;    # scheme, or protocol-relative "//"
    $host =~ s{[/?#].*\z}{}s;           # path, query string, fragment
    $host =~ s{\A.*\@}{};               # user:password@
    $host =~ s{:\d*\z}{};               # :port
    $host =~ s{\.\z}{};                 # trailing dot of a fully qualified name

    if ($host =~ m/([^\.]+\.[^\.]+)\z/) {
        return $1;
    }

    die "Couldn't get domain from $original";
}
# getSubLinks($filename)
#
# Parses the saved search-result page $filename and returns, in document
# order of the headers, the href of every <a> element found inside the
# elements with class "r" that sit under the element with id "res".
#
# The caller derives the result number, and from it the name of the saved
# page for each link, from the position of the link in the returned list.
# For that reason nothing is filtered out here: an anchor without an href
# contributes an undef entry.
#
# Returns an empty list (and warns) when the page has no id="res" element.
# A progress line with the number of result headers goes to STDERR so that
# it does not end up inside the tab-separated report on STDOUT.
sub getSubLinks {
    my $filename = shift;
    my $tree = HTML::Tree->new_from_file($filename);
    my @res;

    # In scalar context look_down yields the first match or undef.
    my $googleResults = $tree->look_down('id', 'res');
    if (!defined $googleResults) {
        warn "No element with id 'res' in $filename\n";
        $tree->delete;
        return @res;
    }

    my @resultHeaders = $googleResults->look_down('class', 'r');
    print STDERR "result headers: " . scalar(@resultHeaders) . "\n";

    foreach my $resHeader (@resultHeaders) {
        foreach my $link ($resHeader->look_down('_tag', 'a')) {
            push(@res, $link->attr('href'));
        }
    }

    # Only plain strings are returned, so the tree can be released now;
    # this sub runs once per result page.
    $tree->delete;
    return @res;
}
# getExternalLinks($filename, $domain)
#
# Parses the saved page $filename and returns the href of every <a>
# element whose href contains "http" (in any letter case) and does not
# contain $domain.
#
# $domain is compared as a literal, case-insensitive substring. It is not
# used as a regular expression, so the dot in "example.com" matches only a
# dot and host names differing only in letter case count as the same site.
#
# Returns an empty list (and warns) when $filename does not exist.
sub getExternalLinks {
    my $filename = shift;
    my $domain = shift;
    my @res;

    # A result whose page was never saved has no links to report; skip it
    # with a warning rather than handing a nonexistent path to the parser.
    if (!-e $filename) {
        warn "Skipping missing page $filename\n";
        return @res;
    }

    my $tree = HTML::Tree->new_from_file($filename);
    my $needle = lc $domain;

    foreach my $link ($tree->look_down('_tag', 'a')) {
        my $url = $link->attr('href');
        next unless $url && $url =~ m/http/i;
        next if index(lc $url, $needle) >= 0;    # same site
        push(@res, $url);
    }

    # Only plain strings are returned, so the tree can be released now;
    # this sub runs once per search result.
    $tree->delete;
    return @res;
}
__END__