#! /usr/bin/perl -w
use HTTP::Request;
use HTTP::Response;
use Digest::MD5 qw(md5_hex);
use HTML::LinkExtor;
use HTML::TokeParser;
use URI;
use Data::Dumper;
use IO::Handle;

use strict;

# This program takes a mangle proxy created log directory, containing
# the log file, and the associated -fromclient and -fromserver files
# and looks for vulnerabilities. It extracts comments, scripts and 
# FORMs from received HTML, and writes them to new files in the logdir
# It is careful to only write "new" scripts, or fragments, etc, so as
# to reduce the review load. The intention is to limit the effort
# required to review a site. The output from this tool is then used
# by the GUI, to present the results, and assist in deciding where to
# investigate further.
#
# The output of analyse is also used by the VulnXML engine, to "recognise"
# new events on which to trigger tests.

# Usage is similar to:
#
# ./analyse.pl ../logdir/ &
# tail -f ../logdir/analyse # to see what is happening
# 
# FUTURE:
# ./vulnxml ../logdir http://target:80/
# to limit tests to only the target server.


my $logdir=$ARGV[0];

if (defined $logdir) {
  if ( substr($logdir,-1) ne '/') {
    $logdir.='/'; 
  }
} else {
  die "You must supply a directory to read the input data from!\n";
}
if ( ! -d $logdir ) {
  die "Directory '".$logdir."' does not exist\n"; 
}
if ( ! -f $logdir.'log' ) { 
  die "Logfile '".$logdir."log' does not exist\n"; 
}
my $inlog;
my $outlog;

open($inlog,"<${logdir}log") || die "Couldn't open log in $logdir: $!\n";

my %md5sums=();
my %urls=();
my %vulnxml=();
my %hostinfo=();

open($outlog,">${logdir}analyse") || die "Couldn't write to log in $logdir: $!\n";

$outlog->autoflush(1);

# Set up the signal handler to allow us to dump unfollowed URLs
$SIG{INT} = \&dumpunseen;

tail_log($inlog);

close($outlog);
close($inlog);
exit;




sub tail_log {
  my $fh=shift;
  my ($bytes,$i,$got)=(0,0,'');
  my $in='';

  my $rin = '';
  vec($rin,fileno($fh),1)=1;
  while (1) {
    my $r=select($rin, undef, undef, 1);
    $bytes=sysread($fh,$got,2048);
    if ($bytes) {
      $in.=$got;
      $got='';
      while ((my $i=index($in,"\n"))>-1) {
        my $line=substr($in,0,$i+1,'');
        addline($line);
      }
    }
    sleep(1);
  }
}


# Reads a line from the mangle log file, and processes it
sub addline {
  my $line=shift;
  my $methods='GET|HEAD|POST|PUT|DELETE|TRACE|CONNECT';

  if ($line=~/^\s*(\d+)\s+\d+\s+\#\s+Content\:\s+(.*)$/) {
    # Client sent POST data
    # We don't care, YET!
    return;
  }
  if ($line=~/^\s*(\d+)\s+\d+\s+\#\s+Cookie :\s+(.*)$/) {
    # Client sent a Cookie
    # We don't care
    return;
  }
  if ($line=~/^\s*(\d+)\s+\d+\s+\#\s+Set-Cookie:\s+(.*)$/) {
    # Server returned "Set-Cookie"
    # We don't care, yet!
    return;
  }
  if ($line=~/^\s*(\d+)\s+\d+\s+\#\s+MD5 ([[:xdigit:]]+)$/) {
    # proxy calculated MD5 sum
    # This is probably not really useful . . . 
    $md5sums{$2}="response ".$1;
    return;
  }
  if ($line=~/^\s*(\d+)\s+\d+\s+\#\s+(\d\d\d)\s+.*$/) {
    # Server sent response code, the files are there for reading
    my ($pid, $code) = ($1, $2);
    return if ($code == 304);
    my $response=read_conversation($pid);
    events($pid,$response->request->uri,$response);
    analyse($pid,$response);
    return;
  }
  if ($line=~/^\s*(\d+)\s+\d+\s+\#\s+($methods)\s+(.*)$/) {
    my $uri=URI->new($3);
    if (!defined $uri) { 
      print STDERR "Can't get a URI object from '$3'\n";
      return;
    }
    my $scheme=$uri->scheme;
    my $host=lc($uri->host);
    my $port=$uri->port;
    my $path=uri_path($uri,"query");
    if (!$path) { $path='/'; }
    # This shows which URLs have been requested, and which have not been
    # returned at this point.
    $urls{$scheme."://".$host.":".$port.$path}=1;
    return;
  }
  print STDERR "What to do with '$line'\n";
  return;
}

# Given a sequence number, reads in the relevant HTTP conversation
# and returns a Perl HTTP::Response object, with the Request embedded
sub read_conversation {
  local *IN;
  my $pid=shift;

  my $oldsep=$/;
  undef $/;

  open(IN,"<${logdir}log-$pid-fromclient") || 
    die "Couldn't read from $pid: $!\n";
  my $string=<IN>;
  my $request=getrequest($string);
  close(IN);

  open(IN,"<${logdir}log-$pid-fromserver") || 
    die "Couldn't read from $pid: $!\n";
  $string=<IN>;
  my $response=getresponse($string);
  close(IN);

  $response->request($request);
  $/=$oldsep;
  return $response;
}

# Given a sequence number, a URI, and an optional Response
# generates appropriate VulnXML events.
# The Response headers are used to supply information about the host
# in question, to refine the VulnXML processing
sub events {
  my $pid=shift;
  my $uri=shift;
  my $response=shift;
  
  my $url=$uri->scheme."://".lc($uri->host).":".$uri->port."/";
  # we try to get it in before the event, so that any VulnXML
  # queued can be compared against the target's hostinfo
  if (defined $response && !exists $hostinfo{$url}) {
    # FIXME I'm sure we can set more params like this?
    if ($response->header("server")) {
      $hostinfo{$url}=[ WebServer => $response->header("server") ];
      Log($pid,"INFO WebServer $url ".$response->header("server"));
    }
  }
  if (!exists $vulnxml{$url}) { 
    $vulnxml{$url}=0;
    Log($pid,"vulnxml scheme_host_port $url");
  }
  if (!exists $urls{$url}) { 
    $urls{$url}=0;
  }
  my $path=substr(uri_path($uri,"query"),1);
  if ($path ne '/' && $path ne '') {
    my @dirs=split('/',$path);
    my $file=pop @dirs;
    foreach my $dir (@dirs) {
      $url.=$dir.'/';
      if (!exists $vulnxml{$url}) {
        $vulnxml{$url}=0;
        Log($pid,"vulnxml path $url");
      }
      if (!exists $urls{$url}) {
        $urls{$url}=0;
      }
    }
    $url.=$file;
    if (!exists $vulnxml{$url}) {
      $vulnxml{$url}=0;
      Log($pid,"vulnxml file $url");
    }
    if (!exists $urls{$url}) {
      $urls{$url}=0;
    }
  }
}

# returns a Perl HTTP::Request object containing parsed string
sub getrequest { 
  my $in=shift;

  my $request = new HTTP::Request;

  my $end=0;
  my $i=index($in,"\n");
  while ($i>-1 && !$end) { # We still have a line to read
    my $line=substr($in,0,$i+1,''); # extract the next line
    $line=~s/\n//g; # get rid of CR
    $line=~s/\r//g; # get rid of LF
    if (!defined $request->method) { # This is the first line
      # Split the request into method, URI and proto
      my ($method, $uri, $proto)=split(/\s+/,$line);
      $request->method($method);
      $request->uri($uri);
      $request->protocol($proto);
    } elsif ($line=~/([^:]+)\s*:\s+(.*)/) { # This is a header line
      my $header=lc($1);
      my $value=$2;
      $request->push_header($header,$value);
    } elsif ($line eq '') { 
      # We have the blank line that signals the end of the headers
      $end=1;
      if (defined $request->content_length) {
        my $content=substr($in,0,$request->content_length,'');
        if (length($content)!=$request->content_length) {
          Debug("Incorrect content in ".$request->as_string("\r\n"));
          exit(1);
        } else {
          $request->content($content);
        }
      }
    } else { # This is something I don't understand
      Debug("Exiting with strange header content: $line");
      exit(1);
    }
    $i=index($in,"\n");
  }
  return $request;
}

# Returns a Perl HTTP::Response object, from a string representation
sub getresponse { 
  my $in=shift;

  my $response = new HTTP::Response;

  my $end=0;
  my $i=index($in,"\n");
  while ($i>-1 && !$end) { # We still have a line to read
    my $line=substr($in,0,$i+1,''); # extract the next line
    $line=~s/\n//g; # get rid of CR
    $line=~s/\r//g; # get rid of LF
    if (!$response->code) { # This is the first line
      # Split the request into method, URI and proto
      my ($version, $code, $message)=($line=~/(HTTP\S+)\s(\d\d\d)\s(.*)/);
      if (defined $code) { $response->code($code); } 
      else { $response->code(500); }
      if (defined $message) { $response->message($message); }
    } elsif ($line=~/([^:]+)\s*:\s+(.*)/) { # This is a header line
      my $header=lc($1);
      my $value=$2;
      $response->push_header($header,$value);
    } elsif ($line eq '') { 
      # We have the blank line that signals the end of the headers
      $end=1;
      if (defined $response->content_length) {
        my $content=substr($in,0,$response->content_length,'');
        if (length($content)!=$response->content_length) {
          Debug("Incorrect content in ".$response->as_string("\r\n"));
          exit(1);
        } else {
          $response->content($content);
        }
      } else {
        $response->content($in);
      }
    } else { # This is something I don't understand
      Debug("Exiting with strange header content: $line");
      exit(1);
    }
    $i=index($in,"\n");
  }
  return $response;
}

# This sub does the bulk of the work, in determining whether there is anything
# interesting about a particular conversation
sub analyse {
  my $pid=shift;
  my $response=shift;
  my @links=();
  my @forms=();
  my @xss=();
  my @scripts=();
  my @comments=();

  # We don't try to process image files
  return if ($response->content_type=~/image/);
  # We don't try to process style sheets
  return if ($response->content_type=~/text\/css/);
  # we don't try to analyse unchanged documents
  return if ($response->code==304);
  if ($response->content_type!~/javascript|html/) {
    print STDERR "Don't know how to analyse documents of type : '",$response->content_type,"'\n";
    return;
  }
  # report on any links in the document
  if ($response->content_type=~/html/) {
    @links=getlinks($response);
  }
  foreach my $link (@links) {
    if (! exists $urls{$link}) {
      $urls{$link}=0;
      # Generate events for all link components found.
      events($pid,URI->new($link));
    }
  }
  if ($response->content_type=~/html/) {
    @forms=getforms($response);
  }
  my $forms=1;
  foreach my $form (@forms) {
    # We try to only show results that we haven't seen before
    if ( ! exists $md5sums{md5_hex($form)}) {
      $md5sums{md5_hex($form)}=$pid."-form-".$forms;
      Record($pid."-form-".$forms,$form);
    }
    Log($pid,"Form: ".$md5sums{md5_hex($form)});
    $forms++;
  }
  if ($response->content_type=~/html/) {
    @comments=getcomments($response);
  }
  my $comments=1;
  foreach my $comment (@comments) {
    if ( ! exists $md5sums{md5_hex($comment)}) {
      $md5sums{md5_hex($comment)}=$pid."-comment-".$comments;
      Record($pid."-comment-".$comments,$comment);
    }
    Log($pid,"Comment: ".$md5sums{md5_hex($comment)});
    $comments++;
  }
  if ($response->content_type=~/html/) {
    @scripts=getscripts($response);
  } elsif ($response->content_type=~/javascript/) {
    push @scripts,$response->content;
  }
  my $scripts=1;
  foreach my $script (@scripts) {
    if ( ! exists $md5sums{md5_hex($script)}) {
      $md5sums{md5_hex($script)}=$pid."-script-".$scripts;
      Record($pid."-script-".$scripts,$script);
    }
    Log($pid,"Script: ".$md5sums{md5_hex($script)});
    $scripts++;
  }
  if ($response->content_type=~/html/) {
    @xss=getxss($response);
  }
  my $xss=1;
  foreach my $field (@xss) {
    if ( ! exists $md5sums{md5_hex($field)}) {
      $md5sums{md5_hex($field)}=$pid."-xss-".$xss;
      Record($pid."-xss-".$xss,$field);
    }
    Log($pid,"XSS: ".$md5sums{md5_hex($field)});
    $xss++;
  }
}

# returns an array containing any links in an HTML document
sub getlinks {
  my $response=shift;
  my %links=();

  my $p=HTML::LinkExtor->new();
  $p->parse($response->content);
  my $base=$response->base;
  foreach my $entry ($p->links) {
    my $tag=shift @$entry;
    shift @$entry; # skip the attr name
    foreach my $link (shift @$entry) {
      my $uri=URI->new_abs($link,$base);
      if ($uri->scheme=~/http/) {
        my $path=uri_path($uri,"query");
        if (!$path) { $path='/'; }
        $links{$uri->scheme."://".lc($uri->host).":".$uri->port.$path}=1;
      } else {
        print STDERR "Can't handle links like : ".$uri->as_string."\n";
      }
      shift @$entry; # skip the attr name
    }
  }
  return sort keys %links;
}

# returns an array of strings representing FORM password fields 
# that are non-empty
sub getpasswords {
  my $response=shift;
  my @passwords=();

  my $password='';
  my $content=$response->content;

  my $p=HTML::TokeParser->new(\$content);
  # these are the tags that can appear in a FORM
  my @tags=("form","/form" ,"input","/input", "select","/select","option","/option","textarea", "/textarea","keygen","isindex");

  while (my $token=$p->get_tag(@tags)) {
    if ($token->[0]=~/\//) {
      $password.=$token->[1]."\n";
      if ($token->[0] eq "/form") {
        push @passwords,$password;
        $password='';
      }
    } else {
      $password.=$token->[3]."\n";
    }
  }
  return @passwords;
}

# returns an array of strings representing the forms in an HTML document
sub getforms {
  my $response=shift;
  my @forms=();

  my $form='';
  my $content=$response->content;

  my $p=HTML::TokeParser->new(\$content);
  # these are the tags that can appear in a FORM
  my @tags=("form","/form" ,"input","/input", "select","/select","option","/option","textarea", "/textarea","keygen","isindex");

  while (my $token=$p->get_tag(@tags)) {
    if ($token->[0]=~/\//) {
      $form.=$token->[1]."\n";
      if ($token->[0] eq "/form") {
        push @forms,$form;
        $form='';
      }
    } else {
      $form.=$token->[3]."\n";
    }
  }
  return @forms;
}

# returns an array of comments found in an HTML document
sub getcomments {
  my $response=shift;
  my @comments=();

  my $content=$response->content;
  my $p=HTML::TokeParser->new(\$content);

  while (my $token=$p->get_token) {
    if ($token->[0] eq 'C') {
      push @comments,$token->[1];
    } 
  }
  return @comments;
}

# returns an array of scripts found in an HTML document
sub getscripts {
  my $response=shift;
  my @scripts=();

  my $content=$response->content;
  my $p=HTML::TokeParser->new(\$content);

  my $script='';

  my @tags=("script","/script");
  while (my $token=$p->get_tag(@tags)) {
    if ($token->[0]=~/\//) {
      $script.=$token->[1]."\n";
      push @scripts,$script;
      $script='';
    } else {
      $script.=$token->[3]."\n"; 
      $script.=$p->get_text("/script");
    }
  }
  return @scripts;
}

# Attempts to identify possible XSS opportunities, i.e. where input
# passed to a URL is immediately returned unchanged. Indicates where
# some tests should be done.
sub getxss {
  my $response=shift;
  my @xss=();
  my @get=();
  my @post=();
  my $uri=$response->request->uri;
  
  my $fragquery=uri_query($uri,"query");
  if ($fragquery) {
    my ($fragment,$query)=split(/\?/,$fragquery,2);
    if (!defined $query) { $query=$fragment; }
    else {
      # check for the fragment in the response
      if (length($fragment)>1 && $response->content=~/$fragment/) {
        # looks like we have a live one!
        push @xss,"FRAGMENT $fragment";
      }
    }
    # check for each variable in the request
    my @pairs=split('&',$query);
    foreach my $param (@pairs) {
      my ($variable, $value)=split('=',$param,2);
      push @get,$variable;
      if (length($value) > 1 && $response->content=~/$value/) {
        # we have a live one here
        push @xss,"PARAMETER $variable = $value";
      }
    }
  }
  # check for each variable in the body
  if ($response->request->method eq "POST") {
    my @pairs=split('&',$response->request->content);
    foreach my $param (@pairs) {
      my ($variable, $value)=split('=',$param,2);
      push @post,$variable;
      # we must quote the 'value' because it could be interpreted as a regex
      if (length($value) > 1 && $response->content=~/\Q$value/) {
        # we have a live one here
        push @xss,"PARAMETER $variable = $value";
      }
    }
  }
  my $form=$uri->scheme."://".$uri->host.":".$uri->port;
  $form.=uri_path($uri,"query")."\n";
  $form.="Query:\n";
  while (my $variable=shift @get) {
    $form.=$variable."\n";
  }
  $form.="\nPost:\n";
  while (my $variable=shift @post) {
    $form.=$variable."\n";
  }
  $form.="\n";
  for (my $i=0; $i<=$#xss; $i++) {
    $xss[$i]=$form.$xss[$i];
  }
  return @xss;
}

sub dumpunseen {
  local *OUT;

  print STDERR "Dumping list of unseen URLs to ${logdir}unseen\n";
  open(OUT,">${logdir}unseen") || die "Can't write to unseen: $!\n";
  foreach my $url (sort keys %urls) {
    print OUT $url."\n" if ($urls{$url} == 0);
    print STDERR $url."\n";
  }
  close(OUT);
  print STDERR "Done writing\nYou can press Ctrl-\\ to break out of this program";
}

# Convenience functions for handling fragments and 
# parameters i.e. /path;sessid=1?a=1

sub uri_path {#{{{
  my $uri=shift;
  my $param=shift;
  if (defined $param && $param ne "path") {
    return join("/",$uri->path_segments());
  } else {
    return $uri->path();
  }
}#}}}

sub uri_params {#{{{
  my $uri=shift;
  my $param=shift;
  if (defined $param && $param eq "param") {
    foreach my $segment ($uri->path_segments()) {
      if (ref $segment) {
        shift @$segment;
        return join(";",@$segment);
      }
    }
  } 
  return "";
}#}}}

sub uri_query {#{{{
  my $uri=shift;
  my $param=shift;
  my $query=$uri->query;
  if (defined $param && $param eq "query") {
    my $params=uri_params($uri,"param");
    if ($params) { $params.="?";} else { $params='';}
    return $params.(defined $query ? $query : '');
  } else {
    return $uri->query;
  }
}#}}}

sub Record {
  local *OUT;
  my $label=shift;
  my $string=shift;

  print STDERR "Recording to ${logdir}log-$label\n";
  open(OUT,">${logdir}log-$label") || die "Can't write to $label: $!\n";
  print OUT $string;
  close(OUT);
}

sub Log {
  my $pid=shift;
  my $message=shift;

  my $bytes=syswrite($outlog,"$pid 0 # ".$message."\n");
}


