#!/usr/bin/perl
# Usage: perl surfElt.pl delicious.stdout delicious
# Generates a delicious.elementToFetch file describing a cache-free surf that follows the path recorded in delicious.stdout
use threads;
use threads::shared;
use Thread::Queue;
use Time::HiRes ();
use HTML::TreeBuilder;
use LWP::UserAgent;
use URI;
# Work queue of URLs to crawl, and a shared scalar used only as a lock
# guarding writes to the results file.
my $urlqueue = Thread::Queue->new;
my $resultlock : shared;
# Output prefix: the second CLI argument (removed from @ARGV so later code
# only sees the first), falling back to the PID when absent or empty.
my $outprefix = (delete $ARGV[1] or $$);
# Input: the recorded surf path produced by a previous run.
# Three-arg open with an error check (the original two-arg, unchecked open
# allowed mode injection via the filename and silently ignored failure).
open my $userpathfile, "<", $ARGV[0] or die "cannot open $ARGV[0]: $!";
open my $results, ">", "$outprefix.elementToFetch" or die $!;
# Enable autoflush on $results so lines from concurrent writers appear
# promptly, then restore the previously selected handle.
my $stdout = select $results; $| = 1; select $stdout;
# record a line to the $results file, locking the access to $results
# Append every argument to the shared results file, one per line.
# Writers serialise on $resultlock so output from different threads
# never interleaves.
sub record {
    lock $resultlock;
    print {$results} map { "$_\n" } @_;
}
# canon($addr, $base) resolves $addr into an absolute URL against the base
# URL $base and strips any #fragment from the result.
# Resolve $addr against the base URL $base, drop any trailing #fragment,
# and return the canonical absolute URL as a plain string.
sub canon {
    my ($addr, $base) = @_;
    my $absolute = URI->new_abs($addr, $base);
    $absolute->fragment(undef);
    return $absolute->as_string;
}
# if the url can't be downloaded, return undef. otherwise, return the content
# of the page. meanwhile, writes in $results the time it took.
# Fetch $url with the LWP user agent $ua. Records (via record) a comment
# line naming the URL, an error line when the response is an HTTP error,
# and a "start elapsed" timing line. Returns the HTTP::Response object on
# success, undef when the page could not be fetched.
sub geturl {
    my ($ua, $url) = @_;
    # Time the request. The original pushed "$time $elapsed" but never
    # assigned either variable, so the promised timing line was empty.
    my $start   = Time::HiRes::time();
    my $resp    = $ua->get($url);
    my $elapsed = Time::HiRes::time() - $start;
    my @status = ("# $url");
    # LWP traps exceptions from its handlers and reports them as error
    # responses; it does not set $@, so the original $@ checks only ever
    # saw stale values from unrelated evals. Use the response status.
    if ($resp->is_error) {
        push @status, "# while getting $url: " . $resp->status_line;
        # NOTE(review): assumes %error_count is declared shared elsewhere
        # in the file -- lock() dies on a non-shared variable. Confirm.
        lock %error_count;
        $error_count{$resp->status_line}++;
    }
    push @status, sprintf("%.6f %.6f", $start, $elapsed);
    record @status;
    if ($resp->is_error) {
        print $resp->status_line, "\n" if $debug;
        return undef;    # per the contract above: undef on failed download
    }
    return $resp;
}
# download a page and all its elements (those referenced by tags such as <img>, <link>,
#