#! perl -w
# Network Forensics Puzzle Contest #2
# Alan Tu <alantu@as2.info>
# November 4, 2009

# stream.pl v1.31
# Uses tshark to identify or reassemble TCP streams.
# Advantages:
# 1.  Only requires tshark 1.2.0 or later, and standard Perl modules.
# 2.  Reassembles streams with duplicate, missing, out of order, or overlapping TCP segments.

# Usage: stream.pl [-R display_filter] pcap_file
# Prints out information about each identified TCP stream from pcap_file, including stream number, time, IPs, ports, and application protocol.
# -R optionally specifies a tshark display filter

# stream.pl -s tcp.stream pcap_file
# Outputs the application-layer data of the TCP stream identified by stream index tcp.stream from pcap_file
# Each direction is output to a file dependent on the four-tupple

# stream.pl -w tcp.stream pcap_file output_pcap_file
# Outputs the TCP stream to its own PCAP file

use strict;
use Getopt::Std;
use Digest::MD5;

# path to tshark executable
my $TSHARK = 'c:\progra~1\wireshark\tshark.exe';
# path to disabled_protos, will be backed up and restored if it exists
my $WIRESHARK_DISABLED_PROTOS = $ENV{APPDATA} . '\wireshark\disabled_protos';
# on unix, typical location is $HOME/.wireshark/disabled_protos

our($opt_s, $opt_R, $opt_w);
getopts("s:R:w:");
die "Usage: $0 [-s|-w tcp.stream] [-R display_filter] pcap_file\n" unless @ARGV >= 1;
die "File $ARGV[0] does not exist\n" unless -f $ARGV[0];
die "Cannot find tshark\n" unless -f $TSHARK;

if (defined($opt_s)) # extract a specific TCP stream
{
    my(@packets, $subseq, $ren, $backup_file, %flows, $ip_a, $ip_b, $fn_a, $fn_b, %streams);

    # process packets
    RESTART: %flows = ();
    @packets = `$TSHARK -r $ARGV[0] -T fields -e frame.protocols -e ip.src -e tcp.srcport -e ip.dst -e tcp.dstport -e tcp.seq -e data.data -R \"tcp.stream == $opt_s && tcp.len > 0\"`;
    unless (@packets)
    {
        print STDERR "No data available in stream $opt_s\n";
        exit;
    }

    for (@packets)
    {
        chomp;
        my($proto, $src_ip, $src_port, $dst_ip, $dst_port, $seq, $data) = split("\t", $_);

        # determine what tshark thinks the application layer protocol is so we can disable that dissector
        ($proto) = $proto =~ /:tcp:([^:]+)/;
        if (defined($proto) && $proto ne "data")
        {
            # we disable the dissectors so tshark outputs data.data, the data of the TCP segment
            # if this is the first time and disabled_protos already exists, back it up
            if (!$subseq && -e $WIRESHARK_DISABLED_PROTOS)
            {
                $ren = 1;
                $subseq = 1;
                $backup_file = $WIRESHARK_DISABLED_PROTOS . "." . time();
                my $status = rename($WIRESHARK_DISABLED_PROTOS, $backup_file);
                die "Cannot backup disabled_protos\n" unless $status;
            }
            elsif (!$subseq)
            {
                $subseq = 1;
            }

            # append to disabled_protos file to disable the dissector
            open(OUT, ">>$WIRESHARK_DISABLED_PROTOS") or die "$!\n";
            print STDERR "Disabling dissector $proto \n";
            print OUT "$proto\n";
            close(OUT);
            goto RESTART;
        }

        unless (defined($ip_a) && defined($ip_b)) # on first run, determine IP addresses for this stream
        {
            $ip_a = $src_ip;
            $ip_b = $dst_ip;
            $fn_a = sprintf("%s.%s-%s.%s", $src_ip, $src_port, $dst_ip, $dst_port);
            $fn_b = sprintf("%s.%s-%s.%s", $dst_ip, $dst_port, $src_ip, $src_port);
        }

        # disregard packet if its a duplicate and contains no new TCP data
        next if exists($flows{$src_ip}->{$seq}) && (length($data)+1) / 3 < length($flows{$src_ip}->{$seq});
        $data =~ s/://g; # remove the colons from data.data
        $flows{$src_ip}->{$seq} = pack("H*", $data); # convert hex to binary
    }

    for my $k (keys %flows) # for each of the two directions
    {
        my(@seq, $expected);
        @seq = sort {$a <=> $b} keys %{$flows{$k}}; # sort data by sequence number
        $expected = $seq[0];
        $streams{$k} = ""; # the ordered TCP bytestream
        for (@seq)
        {
            my $missing = $_ - $expected; # are there missing/lost bytes?
            if ($missing > 0) # yes, print message for awareness
            {
                print STDERR "*** Missing packet flow from $k: offset " . ($expected - $seq[0]) . ", length $missing\n";
                $expected += $missing;
            }
            elsif ($missing < 0) # duplicate packet or overlapping data
            {
                $missing = $_ + length($flows{$k}->{$_}) - $expected; # bytes of new data
                next if $missing <= 0; # no new data, duplicate packet
                $flows{$k}->{$_} = substr($flows{$k}->{$_}, -$missing); # only keep the new data
            }

            $streams{$k} .= $flows{$k}->{$_}; # append the data to the TCP bytestream
            $expected += length($flows{$k}->{$_});
        }
    }

    if (length($streams{$ip_a}))
    {
    open(OUT, ">$fn_a") or die "$!\n"; # output bytestream in first direction
        printf(STDERR "Writing file %s, length %d bytes, MD5 hash %s\n", $fn_a, length($streams{$ip_a}), Digest::MD5::md5_hex($streams{$ip_a}));
        binmode(OUT);
        print OUT $streams{$ip_a};
        close(OUT);
    }

    if (length($streams{$ip_b}))
    {
        open(OUT, ">$fn_b") or die "$!\n"; # and the other direction
        printf(STDERR "Writing file %s, length %d bytes, MD5 hash %s\n", $fn_b, length($streams{$ip_b}), Digest::MD5::md5_hex($streams{$ip_b}));
        binmode(OUT);
        print OUT $streams{$ip_b};
        close(OUT);
    }

    unlink($WIRESHARK_DISABLED_PROTOS); # delete our copy of disabled_protos
    if ($ren) # if we backed up an existing copy, restore it
    {
        my $status = rename($backup_file, $WIRESHARK_DISABLED_PROTOS);
        die "Cannot restore disabled_protos\n" unless $status;
    }
}
elsif (defined($opt_w)) # write a specific TCP stream to its own PCAP file using tshark
{
    die "Usage: $0 -w tcp.stream pcap_file output_pcap_file\n" unless defined($ARGV[1]);
    `$TSHARK -r $ARGV[0] -w $ARGV[1] -R \"tcp.stream == $opt_w\"`;
}
else # display available TCP streams
{
    my(@packets, %streams);
    if ($opt_R) # additional display filter
    {
        $opt_R = "\"tcp && ($opt_R)\"";
    }
    else # no display filter specified
    {
        $opt_R = "\"tcp\"";
    }

    @packets = `$TSHARK -r $ARGV[0] -T fields -e tcp.stream -e frame.time -e ip.src -e tcp.srcport -e ip.dst -e tcp.dstport -e frame.protocols -R $opt_R`;
    for (@packets)
    {
        chomp;
        my($stream, $time, $src_ip, $src_port, $dst_ip, $dst_port, $proto) = split("\t", $_);
        next if $stream eq ""; # non-TCP

        # the application protocol tshark identifies
        ($proto) = $proto =~ /:tcp:([^:]+)/;
        $proto = "" unless defined($proto);
        # only note the stream if the stream ID is new or if the application protocol hasn't been determined or if the application is generic data
        $streams{$stream} = [join("\t", $time, $src_ip, $src_port, $dst_ip, $dst_port), $proto] unless exists($streams{$stream}) && $streams{$stream}->[1] ne "" && $streams{$stream}->[1] ne "data";
    }

    print "tcp.stream\tTIME\tIP\tPORT\tIP\tPORT\tAPP_PROTOCOL\n";
    for (sort {$a <=> $b} keys(%streams)) # print the information about each stream
    {
        print "$_\t$streams{$_}->[0]\t$streams{$_}->[1]\n";
    }
}
