File indexing completed on 2024-05-12 16:28:27

0001 #! /usr/bin/env perl
0002 use warnings;
0003 use strict;
0004 use File::Path;
0005 use LWP::UserAgent;
0006 use HTML::LinkExtor;
0007 use URI;
0008 use URI::URL;
0009 use URI::Escape;
0010 
0011 # This script downloads MS Office files for you. You should provide a search
0012 # term which is used to get a list of document.
0013 # To download 20 presentations about pears do:
0014 #    downloadMSOfficeDocuments.pl 20 pear ppt
0015 #
0016 # Copyright 2009 Jos van den Oever <jos@vandenoever.info>
0017 
# Validate the command line: exactly three arguments, the first a numeric
# result count, the second a single search word. The document type is
# validated later against the table of known MIME types.
die "Usage: $0 number term type\n"
    unless @ARGV == 3
        && $ARGV[0] =~ m/^\d+$/
        && $ARGV[1] =~ m/^\w+$/;

my ($maxresults, $term, $type) = @ARGV;
my $maxjobs = 10;    # maximum number of concurrent download processes
0026 
# Map a file extension given on the command line to the MIME type the
# server is expected to report for that kind of document.
my %mimetypes = (
    ppt  => 'application/vnd.ms-powerpoint',
    pdf  => 'application/pdf',
    pptx => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    xlsx => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    docx => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    doc  => 'application/msword',
    xls  => 'application/vnd.ms-excel',
    rtf  => 'application/rtf',
    ods  => 'application/vnd.oasis.opendocument.spreadsheet',
    odt  => 'application/vnd.oasis.opendocument.text',
    odp  => 'application/vnd.oasis.opendocument.presentation',
);

die "Unknown type '$type'.\n" unless defined $mimetypes{$type};
my $mimetype = $mimetypes{$type};
0045 
# Shared user agent used to dispatch the search-engine queries; pretend
# to be a common browser so regular result pages are served.
my $agentstring = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';
my $ua = LWP::UserAgent->new;
$ua->timeout(10);    # seconds
$ua->env_proxy;
$ua->agent($agentstring);

# Links harvested from the current search result page.
my @pages;

# HTML::LinkExtor callback: record the attribute values (link targets)
# of every <a ...> tag seen while parsing a result page.
sub callback {
    my ($tag, %attr) = @_;
    return if $tag ne 'a';    # only anchors carry result links
    push @pages, values %attr;
}

# Parser that extracts links from an HTML document.
my $p = HTML::LinkExtor->new(\&callback);
0061 
my $runningjobs = 0;    # number of download children currently alive

# Download one document in a forked child process.
#
# Takes the URI to fetch. If the maximum number of parallel jobs is
# already running, blocks until one child finishes. The child issues a
# HEAD request first and only downloads the body when the server reports
# the expected MIME type; the file name is derived from the escaped URI.
sub startJob {
    my $uri = shift;

    # Throttle: wait for a free slot before spawning another child.
    if ($runningjobs >= $maxjobs) {
        wait;
        $runningjobs--;
    }

    my $pid = fork();
    if (!defined $pid) {
        # BUG FIX: the original tested !fork(), so a failed fork (undef)
        # made the PARENT run the child code and exit. Skip the URI instead.
        warn "fork failed for $uri: $!\n";
        return;
    }
    if ($pid) {
        # Parent: the new child occupies one job slot.
        $runningjobs++;
        return;
    }

    # Child: use a private user agent (the parent's must not be shared).
    my $localua = LWP::UserAgent->new;
    $localua->timeout(10); # seconds
    $localua->env_proxy;
    $localua->agent($agentstring);

    # Verify the reported content type before downloading the body.
    my $res = $localua->request(HTTP::Request->new(HEAD => $uri));
    if ($res->content_type() eq $mimetype) {
        # Derive a safe local file name from the unescaped URI.
        my $filename = uri_unescape($uri);
        $filename =~ s#^http://##;
        $filename = uri_escape($filename, '/:\!&*$?;:= \'"');
        print $uri."\n";
        # BUG FIX: the original fetched with the shared $ua here although
        # $localua was just configured for exactly this request.
        $localua->get($uri, ':content_file' => $filename);
    }
    exit;
}
0089 
my @jobs;    # URIs queued and waiting for a free download slot
my %done;    # URIs already seen, to avoid queueing duplicates

# Queue a URI for download and start queued jobs while slots are free.
# Duplicate URIs and schemes other than http/https are ignored.
sub addJob {
    my $uri = shift;

    my $scheme = $uri->scheme;
    return if exists $done{$uri};
    return unless $scheme eq "http" || $scheme eq "https";

    $done{$uri} = 1;
    push @jobs, $uri;
    startJob(pop @jobs) while @jobs && $runningjobs < $maxjobs;
}
0106 
# Query the search engine in pages of 100 results until enough links
# have been collected, queueing every candidate document URI.
for (my $start = 0; $start < $maxresults; $start += 100) {
    if ($start > 0) {
        sleep 3; # do not query search engine too often
    }
    @pages = ();
    my $base = "http://www.google.com/";
    my $url = $base . "search?q=$term+filetype:$type&start=$start&num=100";
    my $res = $ua->request(HTTP::Request->new(GET => $url), sub {$p->parse($_[0])});
    foreach (@pages) {
        my $uri = URI->new_abs($_, $base);
        if ($uri->host =~ m/google/) {
            # Result links point back at google; the real target URL is
            # hidden in a query parameter, so scan every query-form entry.
            my @q = $uri->query_form;
            if (!@q) {
                next;
            }
            # BUG FIX: the original condition was $i <= @q, which read one
            # element past the end of @q and passed undef to URI->new_abs.
            for (my $i = 0; $i < @q; $i++) {
                $uri = URI->new_abs($q[$i], $base);
                if (($uri->scheme eq "http" ||
                        $uri->scheme eq "https") &&
                        $uri->host !~ m/google/) {
                    addJob($uri);
                }
            }
        } else {
            addJob($uri);
        }
    }
}
0135 
# Start whatever is still queued once crawling is finished.
startJob(pop @jobs) while @jobs;

# Reap remaining children; wait returns -1 once no children are left.
1 while wait != -1;