Peter Stuifzand

Write your own Accesslog parser in Perl

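Writing an Apache access log parser isn’t that hard. Below is a parser, built with Parse::RecDescent, that handles lines in the combined log format (including the referrer and user agent fields). It prints a Data::Dumper dump of every line it reads. No warranty.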

use strict;
use warnings;

use Data::Dumper;
use Parse::RecDescent;

# The grammar matches whitespace explicitly through its own ws rule, so
# Parse::RecDescent's default skipping of whitespace in front of every
# terminal is switched off here, before the parser is built.
$Parse::RecDescent::skip = '';

# Parse::RecDescent grammar for one access log line ending in a newline:
#
#   ip - user [date:time zone] "method url protocol" status size "referrer" "useragent"
#
# The start rule is "line". On success it returns a hash reference with the
# keys ip, user, datetime, method, url, protocol, status, size, referrer and
# useragent. Inside an action, $item[N] is the value of the Nth item of the
# production, counting the ws separators and the literals as well.
my $grammar = q{
line: ip ws '-' ws user ws datetime ws request ws status ws responsesize
            ws referrer ws useragent "\n"
{ $return = {
            ip        => $item[1],
            user      => $item[5],
            datetime  => $item[7],
            method    => $item[9]->{method},
            url       => $item[9]->{url},
            protocol  => $item[9]->{protocol},
            status    => $item[11],
            size      => $item[13],
            referrer  => $item[15],
            useragent => $item[17],
        } }

# Any run of non-blank characters: covers the '-' placeholder as well as
# names containing dots, dashes or an at-sign.
user: /\S+/

request: '"' method ws url ws protocol '"'
    { $return = { method => $item[2], url => $item[4], protocol => $item[6] } }

# Returned as one string: dd/Mon/yyyy hh:mm:ss +zzzz
datetime: '[' date ':' time ws timezone ']'
    { $return = $item[2] . ' ' . $item[4] . ' ' . $item[6] }

status: /\d{3}/

# No action here, so the rule yields the value of its last item: only the
# version number (for example 1.1), without the HTTP/ prefix.
protocol: 'HTTP/' version

# Any upper-case verb, so HEAD, OPTIONS, PATCH and friends parse as well.
method: /[A-Z]+/

ws: /[ ]+/
url: /\S+/
referrer: quotedstring2
responsesize: '-' | /\d+/
useragent: quotedstring2

date: day '/' month '/' year
    { $return = join('/', $item[1], $item[3], $item[5]) }
day: /\d+/
month: 'Jan' | 'Feb' | 'Mar' | 'Apr' | 'May' | 'Jun' |
    'Jul' | 'Aug' | 'Sep' | 'Oct' | 'Nov' | 'Dec'
year: /\d{4}/
time: /\d{2}:\d{2}:\d{2}/
timezone: ('+'|'-') /\d{4}/  { $return = $item[1].$item[2] }

octet: /\d+/
ip: octet ('.' octet)(3) { $return = $item[1] . '.' . join('.', @{$item[2]}) }

# The dot is escaped so that only a literal '.' separates the two digits.
version: /\d\.\d/

# Contents of a double-quoted field, returned without the surrounding quotes.
# The field may be empty, and a backslash (written \x5c) escapes the character
# after it, so an embedded \" does not end the field. Escapes are returned
# as they appear in the log.
quotedstring2: '"' /(?:[^"\x5c]|\x5c.)*/ '"'   { $return = $item[2] }
};

# Build the parser once; stop immediately if the grammar is rejected.
my $parser = Parse::RecDescent->new($grammar) or die "Bad Grammar";

# Read every line from the files named on the command line (or from STDIN)
# and dump the parsed fields of each line that matches the grammar.
while (my $line = <>) {
    # The "line" rule requires exactly one trailing "\n". Normalise CRLF
    # endings and a missing newline on the final line so they still parse.
    $line =~ s/\r?\n?\z/\n/;

    my $ret = $parser->line($line);
    if (!defined $ret) {
        # Say which input line failed and skip it, rather than dumping undef.
        warn "Parse error at input line $.\n";
        next;
    }
    print Dumper($ret);
}
© 2023 Peter Stuifzand