#!/usr/bin/perl # httpparse - list and parse parts from http stream # Copyright 2009 Lou Arminio (lou arminio gmail com) # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . use strict; use Getopt::Long; use Pod::Usage; use File::Basename; use URI::Escape; use IO::Uncompress::Gunzip qw(gunzip $GunzipError); use Digest::MD5; # Vars for processing my $err; my @Range; my $range_digits = 0; # Vars for input options my $stream_in = ""; my $file_out = ""; my $dir = "./"; my $dump_range = ""; my $all = 0; my $md5 = 0; my $help = 0; my $debug = 0; Getopt::Long::Configure ('no_ignore_case'); GetOptions ("read|r=s" => \$stream_in, "write|w=s" => \$file_out, "dir|D=s" => \$dir, "dump|d=s" => \$dump_range, "all|a" => \$all, "md5|m" => \$md5, "help|h" => \$help) or shorthelp(2); if ($help) { pod2usage(-verbose => 2); } # Check inputs if ($dir) { if (! -d $dir) { print "\nCreating directory \"$dir\" for output\n"; mkdir ($dir) or die "Can't create directory $dir: $!\n"; } else { print "\nUsing directory $dir for output\n"; } # Make sure $dir is terminated with path separator $dir =~ /(\/|\\)$/ or $dir .= '/'; } if ($file_out) { $file_out =~ /^[a-zA-Z0-9-_:\.]+$/ or die "Only 'a-zA-Z0-9-_:.' characters allowed in write file name\n"; } else { # Make output file same as input (will add index and content type) $file_out = (fileparse($stream_in))[0]; } if ($dump_range) { $dump_range =~ /^[0-9\-,]+$/ or die "Only digits, '-' and ',' allowed in dump range\n"; } if (! $stream_in) { shorthelp(1); } if (! -r $stream_in) { print "File does not exist: $stream_in\n"; exit 1; } if ($all) { # Dump all contents process_stream($stream_in, 'extract-all', $file_out, $dir); } elsif ($dump_range) { # split range into a list @Range = split_range($dump_range); $range_digits = index_digits($Range[(scalar @Range) -1]); $range_digits = 2 if ($range_digits < 2); process_stream($stream_in, 'extract-range', $file_out, $dir, $range_digits, @Range); } else { # List index of http stream file process_stream($stream_in, 'index'); } exit; # Subroutines # process_stream - print index of tcp streams sub process_stream { my $stream_file = shift; my $task = shift; my $file_out; my $dir; my $range_digit; my @Range; my $next_index; if ($task eq 'extract-all') { $file_out = shift; $dir = shift; } elsif ($task eq 'extract-range') { $file_out = shift; $dir = shift; $range_digit = shift; @Range = @_; $next_index = shift @Range; } my $STREAM; my $index = 0; my $in_request = 0; my $in_reply = 0; my $get_req; my $user_agent; my $cookie; my $host; my $http_head; my $encoding; my $cont_type; my $cont_major; my $cont_minor; my $encode_opts; my $etag; my $cont_length; my $data; my $DATA; my $path_out; open($STREAM, "<$stream_file") or die "Can't open $stream_file: $!\n"; while (<$STREAM>) { /^\r\n$/ and do { # Blank line - end of last section if ($in_request and $task eq 'index') { print "\n", '[', $index, '] Request:', "\n"; print ' ', $get_req, "\n"; print ' UserAgent: ', $user_agent, "\n"; print ' Cookie: ', $cookie, "\n"; print ' Host: ', $host, "\n"; print "\n"; $in_request = 0; $get_req = $user_agent = $cookie = $host = ""; } elsif ($in_reply) { if ($task eq 'index') { print '[', $index, '] Reply:', "\n"; print ' ', $http_head, "\n"; print ' Content-Encoding: ', $encoding, "\n"; print ' Content-Type: ', $cont_major, '/', $cont_minor, "\n"; print ' Options: ', $encode_opts, "\n" if ($encode_opts); print ' ETag: ', $etag, "\n" if ($etag); print ' Content-Length: ', $cont_length, "\n"; # Skip content read($STREAM, $data, $cont_length) or die "can't read bytes: $!\n"; } elsif ($task eq 'extract-all') { $path_out = sprintf ("%s%s-%02d.%s", $dir, $file_out, $index, get_suffix($cont_minor)); dump_data ($STREAM, $path_out, $cont_length, $encoding); } elsif ($task eq 'extract-range') { if ($index == $next_index) { $path_out = sprintf ("%s%s-%02d.%s", $dir, $file_out, $index, get_suffix($cont_minor)); dump_data ($STREAM, $path_out, $cont_length, $encoding); $next_index = shift @Range; last if (! $next_index); } } $http_head = $encoding = $cont_major = $cont_minor = $etag = $cont_length = ""; $in_reply = 0; } else { # extra blank line??? if ($debug) { print "Info: encountered extra blank line in stream\n"; } } }; /^(GET.*\S)/ and do { # Start of request $get_req = uri_unescape($1); $in_request = 1; $index++; }; /^Cookie: (.*\S)/ and do { $cookie = $1; }; /^User-Agent: (.*\S)/ and do { $user_agent = $1; }; /^Host: (.*\S)/ and do { $host = $1; }; /^(HTTP.*\S)/ and do { $http_head = $1; $in_reply = 1; }; /^Content-Encoding: (.*\S)/ and do { $encoding = $1; }; /^Content-Type: (.*\S)/ and do { $cont_type = $1; # Parse out Content-Type $cont_type =~ m%([^/]+)/(\S*)\s*(.*)%; $cont_major = $1; $cont_minor = $2; $encode_opts = $3; $cont_minor =~ s/;$//; # strip possible semi-colon }; /^ETag: (.*\S)/ and do { $etag = $1; }; /^Content-Length:\s*(\d+)/ and do { $cont_length = $1; }; } } # split_range - take a range of packets (single digit, comma sep, x-y range # and break into list sub split_range { my ($range) = @_; my $index; my $index2; my %Range; # Hash being used to track indexes to ensure uniqueness while ($range) { $range =~ /^(\d+)(.*)/ and do { $index = $1; # save index before pushing to list $Range{$index}=1; $range = $2; next; }; $range =~ /^,(.*)/ and do { $range = $1; next; }; $range =~ /^-(\d+)(.*)/ and do { $index2 = $1; $range = $2; $index++; # $index already on the stack for (my $i=$index; $i<=$index2; $i++) { $Range{$i}=1; } next; }; } return (sort keys %Range); } sub get_suffix { my ($type) = @_; if ($type =~ /gzip/i) { return('gz'); } elsif ($type =~ /jpeg/i) { return('jpg'); } else { return($type); } } sub dump_data { my ($IN, $path_out, $bytes, $encoding) = @_; my $READBLOCK = 8096; my $countdown_bytes = $bytes; my $data; my $blocks_in; my $file; if ($encoding eq 'gzip') { # Need to dump contents of zip $file = $dir . $$ . '.' . get_suffix($encoding); } else { $file = $path_out; } print "Dumping file $path_out\n"; open (DATA, ">$file") or die "Can't create output file $file: $!\n"; binmode(DATA); while ($countdown_bytes > $READBLOCK) { $blocks_in = read($IN, $data, $READBLOCK) or die "can't read bytes: $!\n"; print DATA $data; $countdown_bytes -= $blocks_in; } if ($countdown_bytes) { $blocks_in = read($IN, $data, $countdown_bytes) or die "can't read bytes: $!\n"; print DATA $data; } close (DATA); if ($encoding eq 'gzip') { # Now need to unzip gunzip $file => $path_out, AutoClose => 1 or die "gunzip failed: $GunzipError\n"; unlink ($file); } if ($md5) { open(FILE, "<$path_out") or die "Can't open '$path_out': $!\n"; binmode(FILE); print "Writing md5 hash to ${path_out}.md5\n"; open (MD5, ">${path_out}.md5") or die "Can't open '${path_out}.md5': $!\n"; print MD5 Digest::MD5->new->addfile(*FILE)->hexdigest, " ", basename($path_out), "\n"; close (FILE); close (MD5); } } sub index_digits { my ($max_index) = @_; my $digits = 1; for (my $i=$max_index; $i>=10; $i /= 10){$digits++;}; return($digits); } sub shorthelp { my ($exit) = @_; print "Usage: $0 --read|-r \n"; print " $0 --read|-r (--dump|-d or --all|a)\n"; print " [--write|-w ] [--dir|-D ] "; print "[--md5|-m]\n"; print " $0 --help|-h\n"; exit($exit); } 0; __END__ =pod =head1 NAME B - index and extract email messages from an SMTP stream. =head1 SYNOPSIS B [-s|--smtp SMTPSTREAM] B -e|--extract INDEX [-p|--prefix MSGPREFIX] [-d|--dir MSGDIRECTORY] B -h|--help|-? =head1 DESCRIPTION B reads a captured SMTP session and indexes or extracts email messages contained in the session. The default behavior of the script is to list the messages contained in the session, showing the envelope from address, envelope to address, and size of the message. An index number is assigned to each message. The script can use this index number to extract that message from the session. See the B<--extract> option below for details. For details on the SMTP protocol, see RFC5321: http://tools.ietf.org/html/rfc5321 =head1 OPTIONS =over 8 =item B<-s|--smtp SMTPSTREAM> The SMTP dialogue between a client and SMTP server. With no other options, the script displays an index of email messages contained in the SMTP stream. A single SMTP session can contain one or more email messages. =item B<-e|--extract INDEX> Extract the message referenced by INDEX number. Two files will be created, an _envelope file, containing the envelope headers of the extracted message, and a _content file, containing the actual contents of the email. =item B<-p|--prefix MSGPREFIX> The filename prefix of the extracted files. The two filenames created are PREFIXFILE_INDEX_envelope and PREFIXFILE_INDEX_content. The default prefix is "msg". =item B<-d|--dir MSGDIRECTORY> The directory where extracted _envelope and _content files should be saved. If the directory does not exist it will be created. The default directory for extracted messages is the current working directory. =item B<-h|--help|-?> The -h and --help options print this help menu. The -? option will print a quick usage summary message. =back