#!/usr/bin/perl -w
use strict;

# The intention of this program is that it will be run on an ad hoc
# basis to retrieve any links that have not yet been followed.
# obviously, following a link may generate more links that have not
# been seen.

# usage: 

# set up a mangle proxy for this to run through. Most likely the main 
# one, but could be a different one too. run analyse on the results,
# then press Cntrl-C or send sigINT to cause analyse to dump its list of unseen
# urls. Filter that to taste, then pipe it to crawl.

use LWP::UserAgent;
use URI::URL;

my $ua = LWP::UserAgent->new;
$ua->env_proxy(); # pick up environment proxy settings (man LWP::UserAgent)

# read in a template to use for our requests.
# This could include cookies, proxy auth strings, user agents, etc.
my $templ='';
if (defined $ARGV[0] && -r $ARGV[0]) {
  open(TEMPL,"<$ARGV[0]") || die "Can't read template file $ARGV[0]: $!\n";
  while (my $line=<TEMPL>) { $templ.=$line; }
  close(TEMPL);
}
# Set up the request here, all we will change in the loop is the URL
my $request=getrequest($templ);
$request->method("GET");
$request->content('');
$request->remove_header("Content-Length");
$request->remove_header("Content-Type");

while (my $url=<STDIN>) {
  print "Fetching $url . . . "; 
  $request->uri($url);
  my $response=$ua->simple_request( $request );
  print $response->status_line,"\n";
}

exit;

# returns a Perl HTTP::Request object containing parsed string
sub getrequest {
  my $in=shift;

  my $request = new HTTP::Request;

  my $end=0;
  my $i=index($in,"\n");
  while ($i>-1 && !$end) { # We still have a line to read
    my $line=substr($in,0,$i+1,''); # extract the next line
    $line=~s/\n//g; # get rid of CR
    $line=~s/\r//g; # get rid of LF
    if (!defined $request->method) { # This is the first line
      # Split the request into method, URI and proto
      my ($method, $uri, $proto)=split(/\s+/,$line);
      $request->method($method);
      $request->uri($uri);
      $request->protocol($proto);
    } elsif ($line=~/([^:]+)\s*:\s+(.*)/) { # This is a header line
      my $header=lc($1);
      my $value=$2;
      $request->push_header($header,$value);
    } elsif ($line eq '') {
      # We have the blank line that signals the end of the headers
      $end=1;
      if (defined $request->content_length) {
        my $content=substr($in,0,$request->content_length,'');
        if (length($content)!=$request->content_length) {
          Debug("Incorrect content in ".$request->as_string("\r\n"));
          exit(1);
        } else {
          $request->content($content);
        }
      }
    } else { # This is something I don't understand
      Debug("Exiting with strange header content: $line");
      exit(1);
    }
    $i=index($in,"\n");
  }
  return $request;
}

