#! /usr/bin/perl -w
# Network Forensics Puzzle Contest #3
# Alan Tu <alantu@as2.info>
# January 2, 2010

# http_analysis.pl v1.01
# Uses tshark to output the IP addresses, TCP ports, and key HTTP request and response headers from a PCAP file.
# Usage: http_analysis.pl [-d tcp.port] [-R display_filter] pcap_file
# Prints out HTTP information from the specified PCAP
# -R passes additional display filter arguments to tshark via the -R switch
# -d tells tshark (via -d tcp.port==PORT,http) to decode traffic to the specified TCP port as http

# Courtesy functionality:
# http_analysis.pl -w tcp.stream pcap_file output_pcap_file
# Outputs the TCP stream specified by the tcp.stream index to its own PCAP file

use strict;
use Getopt::Std;

# path to tshark executable, modify for specific environment
#my $TSHARK = 'c:\progra~1\wireshark\tshark.exe';
my $TSHARK = "/usr/bin/tshark";
my $delim = ", "; # delimiter for output, default is CSV compatible
my $usage = "Usage: $0 [-w tcp.stream] [-d tcp.port] [-R display_filter] pcap_file\n";

our($opt_d, $opt_R, $opt_w); # options
getopts("d:R:w:") or die $usage; # reject unknown switches instead of silently continuing
die "Cannot find tshark at $TSHARK, please check \$TSHARK variable in $0\n" unless -f $TSHARK;
die $usage unless @ARGV >= 1;
die "File $ARGV[0] does not exist\n" unless -f $ARGV[0];

# NOTE(review): newer tshark releases appear to expect -Y for single-pass display
# filters (-R then needs -2); confirm against the installed tshark before changing.

if (defined($opt_w)) # write a PCAP with the TCP session specified by tcp.stream index
{
    die "Usage: $0 -w tcp.stream pcap_file output_pcap_file\n" unless defined($ARGV[1]);
    # List form of system() bypasses the shell, so file names containing spaces or
    # shell metacharacters are passed to tshark verbatim.
    my @cmd = ($TSHARK, "-r", $ARGV[0], "-w", $ARGV[1], "-R", "tcp.stream == $opt_w");
    system(@cmd) == 0 or die "tshark failed (wait status $?) while writing $ARGV[1]\n";
}
else
{
    # Fields requested from tshark, in output order. The first three (http.request,
    # http.response, eth.src) are only used internally; the rest are printed as columns.
    my @tshark_fields = qw/http.request http.response eth.src frame.number frame.time tcp.stream ip.src tcp.srcport ip.dst tcp.dstport http.request.method http.request.uri http.host http.user_agent http.response.code http.content_type http.content_encoding http.content_length/;

    # basic display filter for HTTP processing; a user supplied -R filter is wrapped in
    # parentheses so an "or" inside it cannot escape the HTTP restriction
    my $filter = "(http.request == 1 or http.response == 1)";
    $filter .= " && ($opt_R)" if defined($opt_R);

    # construct the command as a list (no shell involved)
    my @cmd = ($TSHARK, "-R", $filter);
    push @cmd, "-d", "tcp.port==$opt_d,http" if defined($opt_d); # decode this port as http
    push @cmd, "-T", "fields", map { ("-e", $_) } @tshark_fields;
    push @cmd, "-r", $ARGV[0];

    open(my $tshark_out, "-|", @cmd) or die "Cannot run $TSHARK: $!\n";

    # column heading
    my @heading = @tshark_fields[3..$#tshark_fields];
    print join($delim, @heading) . "\n";

    my %sources; # tracks sources of HTTP requests
    while (my $line = <$tshark_out>) # for each HTTP request or response
    {
        chomp $line;
        next if $line eq ""; # nothing to parse

        # limit -1 keeps trailing empty fields (requests have no response columns),
        # padding guards against short lines so every index below is defined
        my @fields = split(/\t/, $line, -1);
        push @fields, ("") x (@tshark_fields - @fields) if @fields < @tshark_fields;

        $fields[4] = convert_tshark_time($fields[4]); # convert frame.time to nicer format
        print join($delim, @fields[3..$#fields]) . "\n"; # output

        # track request sources, keyed by eth.src, ip.src and http.user_agent
        $sources{join($delim, @fields[2, 6, 13])}++ if $fields[0] eq "1";
    }
    # report a tshark failure, but still print the summary of whatever was read
    close($tshark_out) or warn "tshark exited abnormally (wait status $?)\n";

    print "\nSummary of sources:\n";
    # descending order by number of requests, ties broken by key for stable output
    my @sources = sort {$sources{$b} <=> $sources{$a} or $a cmp $b} keys %sources;
    print join($delim, qw/eth.src ip.src http.user_agent http.request/) . "\n"; # heading
    print join($delim, $_, $sources{$_}) . "\n" for @sources;
}

# converts Wireshark's frame.time field (e.g. "Jan  2, 2010 12:34:56.789")
# into yyyy/mm/dd hh:mm:ss.ss
# Argument: the frame.time string to convert
# Returns: the reformatted string; an empty string if the argument is undefined;
#          the argument unchanged if it is not in the expected
#          "Mon d, yyyy time" layout or the month abbreviation is unknown
sub convert_tshark_time
{
    my $t = shift; # argument to convert
    return "" unless defined $t;

    my %months = (Jan => "01", Feb => "02", Mar => "03", Apr => "04", May => "05", Jun => "06", Jul => "07", Aug => "08", Sep => "09", Oct => "10", Nov => "11", Dec => "12");
    my($mon, $day, $year, $hhmmss) = $t =~ /^(...)\s+(\d+), (\d+) (.+)$/; # capture fields

    # a failed match leaves every capture undefined; pass the text through
    # rather than building a bogus date from undefined values
    return $t unless defined $hhmmss && exists $months{$mon};

    # %02d makes sure day is always two digits
    return sprintf("%s/%s/%02d %s", $year, $months{$mon}, $day, $hhmmss);
}
