#!/usr/bin/perl
# Usage: perl surfElt.pl delicious.stdout delicious
# Generates a delicious.elementToFetch file describing a cache-free surf that follows the path recorded in delicious.stdout
use threads;
use threads::shared;
use Thread::Queue;
use Time::HiRes ();
use HTML::TreeBuilder;
use LWP::UserAgent;
use URI;
# Work queue of URLs to crawl, and a shared scalar used only as a lock
# guarding writes to the results file.
my $urlqueue = Thread::Queue->new;
my $resultlock : shared;
# Output prefix: the second CLI argument (removed from @ARGV so later code
# only sees the first), falling back to the PID when absent or empty.
my $outprefix = (delete $ARGV[1] or $$);
# Input: the recorded surf path produced by a previous run.
# Three-arg open with an error check (the original two-arg, unchecked open
# allowed mode injection via the filename and silently ignored failure).
open my $userpathfile, "<", $ARGV[0] or die "cannot open $ARGV[0]: $!";
open my $results, ">", "$outprefix.elementToFetch" or die $!;
# Enable autoflush on $results so lines from concurrent writers appear
# promptly, then restore the previously selected handle.
my $stdout = select $results; $| = 1; select $stdout;
# record a line to the $results file, locking the access to $results
# Append every argument to the shared results file, one per line.
# Writers serialise on $resultlock so output from different threads
# never interleaves.
sub record {
    lock $resultlock;
    print {$results} map { "$_\n" } @_;
}
# canon($addr, $base) resolves $addr into an absolute URL against the base
# URL $base and strips any #fragment from the result.
# Resolve $addr against the base URL $base, drop any trailing #fragment,
# and return the canonical absolute URL as a plain string.
sub canon {
    my ($addr, $base) = @_;
    my $absolute = URI->new_abs($addr, $base);
    $absolute->fragment(undef);
    return $absolute->as_string;
}
# if the url can't be downloaded, return undef. otherwise, return the content
# of the page. meanwhile, writes in $results the time it took.
# Fetch $url with the LWP user agent $ua. Records (via record) a comment
# line naming the URL, an error line when the response is an HTTP error,
# and a "start elapsed" timing line. Returns the HTTP::Response object on
# success, undef when the page could not be fetched.
sub geturl {
    my ($ua, $url) = @_;
    # Time the request. The original pushed "$time $elapsed" but never
    # assigned either variable, so the promised timing line was empty.
    my $start   = Time::HiRes::time();
    my $resp    = $ua->get($url);
    my $elapsed = Time::HiRes::time() - $start;
    my @status = ("# $url");
    # LWP traps exceptions from its handlers and reports them as error
    # responses; it does not set $@, so the original $@ checks only ever
    # saw stale values from unrelated evals. Use the response status.
    if ($resp->is_error) {
        push @status, "# while getting $url: " . $resp->status_line;
        # NOTE(review): assumes %error_count is declared shared elsewhere
        # in the file -- lock() dies on a non-shared variable. Confirm.
        lock %error_count;
        $error_count{$resp->status_line}++;
    }
    push @status, sprintf("%.6f %.6f", $start, $elapsed);
    record @status;
    if ($resp->is_error) {
        print $resp->status_line, "\n" if $debug;
        return undef;    # per the contract above: undef on failed download
    }
    return $resp;
}
# download a page and all its elements (those referenced by tags such as <img>, <link>,
#