File indexing completed on 2024-05-12 16:28:27
0001 #! /usr/bin/env perl 0002 use warnings; 0003 use strict; 0004 use File::Path; 0005 use LWP::UserAgent; 0006 use HTML::LinkExtor; 0007 use URI; 0008 use URI::URL; 0009 use URI::Escape; 0010 0011 # This script downloads MS Office files for you. You should provide a search 0012 # term which is used to get a list of document. 0013 # To download 20 presentations about pears do: 0014 # downloadMSOfficeDocuments.pl 20 pear ppt 0015 # 0016 # Copyright 2009 Jos van den Oever <jos@vandenoever.info> 0017 0018 if ($#ARGV != 2 || $ARGV[0] !~ m/^\d+$/ || $ARGV[1] !~ m/^\w+$/) { 0019 die "Usage: $0 number term type\n"; 0020 } 0021 0022 my $maxresults = $ARGV[0]; 0023 my $term = $ARGV[1]; 0024 my $type = $ARGV[2]; 0025 my $maxjobs = 10; 0026 0027 my %mimetypes = ( 0028 "ppt", "application/vnd.ms-powerpoint", 0029 "pdf", "application/pdf", 0030 "pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation" , 0031 "xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 0032 "docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 0033 "doc", "application/msword", 0034 "xls", "application/vnd.ms-excel", 0035 "rtf", "application/rtf", 0036 "ods", "application/vnd.oasis.opendocument.spreadsheet", 0037 "odt", "application/vnd.oasis.opendocument.text", 0038 "odp", "application/vnd.oasis.opendocument.presentation" 0039 ); 0040 0041 if (!defined $mimetypes{$type}) { 0042 die "Unknown type '$type'.\n"; 0043 } 0044 my $mimetype = $mimetypes{$type}; 0045 0046 #used to dispatch web requests 0047 my $ua = LWP::UserAgent->new; 0048 $ua->timeout(10); # seconds 0049 $ua->env_proxy; 0050 my $agentstring = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'; 0051 $ua->agent($agentstring); 0052 my @pages; 0053 sub callback { 0054 my($tag, %attr) = @_; 0055 # we only look closer at <a ...> 0056 return if $tag ne 'a'; 0057 push(@pages, values %attr); 0058 } 0059 #extracts links from an HTML document 0060 my $p = HTML::LinkExtor->new(\&callback); 0061 0062 my $runningjobs = 0; 0063 sub startJob { 0064 0065 #Shifts the first value of the array off and returns it, shortening the array by 1 and moving everything down. 0066 my $uri = shift; 0067 if ($runningjobs >= $maxjobs) { 0068 wait; 0069 $runningjobs--; 0070 } 0071 $runningjobs++; 0072 if (!fork()) { 0073 my $localuri = $uri; 0074 my $localua = LWP::UserAgent->new; 0075 $localua->timeout(10); # seconds 0076 $localua->env_proxy; 0077 $localua->agent($agentstring); 0078 my $res = $localua->request(HTTP::Request->new(HEAD => $localuri)); 0079 if ($res->content_type() eq $mimetype) { 0080 my $filename = uri_unescape($localuri); 0081 $filename =~ s#^http://##; 0082 $filename = uri_escape($filename, '/:\!&*$?;:= \'"'); 0083 print $localuri."\n"; 0084 $ua->get($localuri, ':content_file' => $filename); 0085 } 0086 exit; 0087 } 0088 } 0089 0090 my @jobs; 0091 my %done; 0092 sub addJob { 0093 #Shifts the first value of the array off and returns it, shortening the array by 1 and moving everything down. 0094 my $uri = shift; 0095 my $scheme = $uri->scheme; 0096 if (exists $done{$uri} || ($scheme ne "http" && $scheme ne "https")) { 0097 return; 0098 } 0099 $done{$uri} = 1; 0100 push @jobs, $uri; 0101 while (@jobs && $runningjobs < $maxjobs) { 0102 startJob(pop @jobs); 0103 #sleep 1; 0104 } 0105 } 0106 0107 for (my $start=0; $start < $maxresults; $start = $start + 100) { 0108 if ($start > 0) { 0109 sleep 3; # do not query search engine too often 0110 } 0111 @pages = (); 0112 my $base = "http://www.google.com/"; 0113 my $url = $base . "search?q=$term+filetype:$type&start=$start&num=100"; 0114 my $res = $ua->request(HTTP::Request->new(GET => $url), sub {$p->parse($_[0])}); 0115 foreach (@pages) { 0116 my $uri = URI->new_abs($_, $base); 0117 if ($uri->host =~ m/google/) { 0118 my @q = $uri->query_form; 0119 if (!@q) { 0120 next; 0121 } 0122 for (my $i = 0; $i <= @q; $i++) { 0123 $uri = URI->new_abs($q[$i], $base); 0124 if (($uri->scheme eq "http" || 0125 $uri->scheme eq "https") && 0126 $uri->host !~ m/google/) { 0127 addJob($uri); 0128 } 0129 } 0130 } else { 0131 addJob($uri); 0132 } 0133 } 0134 } 0135 0136 # keep the queue filled 0137 while (@jobs) { 0138 startJob(pop @jobs); 0139 } 0140 0141 # wait for jobs to finish 0142 my $pid; 0143 do { 0144 $pid = wait; 0145 } while ($pid != -1);