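Writing an Apache access log parser isn’t that hard. Below is a Parse::RecDescent-based parser for the combined log format: it reads log lines from the files named on the command line (or from standard input) and prints a Data::Dumper dump of each parsed line. No warranty.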
use strict;
use warnings;

use Data::Dumper;
use Parse::RecDescent;

# Parse::RecDescent normally skips leading whitespace (default pattern '\s*')
# before every token.  The log grammar matches its field separators
# explicitly, so automatic skipping is switched off; otherwise the explicit
# whitespace rule would never find anything left to match.
$Parse::RecDescent::skip = '';
# Parse::RecDescent grammar for one line of an Apache "combined" access log:
#
#   host ident user [day/Mon/year:HH:MM:SS +zone] "METHOD url HTTP/x.y" status size "referrer" "user-agent"
#
# The start rule is `line`; on success it returns a hash reference with the
# keys ip, user, datetime, method, url, protocol, status, size, referrer and
# useragent.  Field separators are matched explicitly by the `ws` rule, so the
# grammar expects automatic whitespace skipping to be disabled
# ($Parse::RecDescent::skip = '').
#
# A single-quoted heredoc is used so that backslashes in the regexes reach
# Parse::RecDescent exactly as written.
my $grammar = <<'END_GRAMMAR';
# One complete log line.  The line terminator is optional so that a final
# line without a newline (or a CRLF-terminated line) still parses.
line: ip ws '-' ws user ws datetime ws request ws status ws responsesize
      ws referrer ws useragent /\r?\n?\z/
    {
        $return = {
            ip        => $item[1],
            user      => $item[5],
            datetime  => $item[7],
            method    => $item[9]->{method},
            url       => $item[9]->{url},
            protocol  => $item[9]->{protocol},
            status    => $item[11],
            size      => $item[13],
            referrer  => $item[15],
            useragent => $item[17],
        }
    }

# Authenticated user name, or a lone dash when there is none.  Any run of
# non-blank characters is accepted so names containing '.', '@' or '-' work.
user: /\S+/

# The quoted request line, split into its three parts.
request: '"' method ws url ws protocol '"'
    { $return = { method => $item[2], url => $item[4], protocol => $item[6] } }

# Bracketed timestamp, returned as "day/Mon/year HH:MM:SS +zone".
datetime: '[' date ':' time ws timezone ']'
    { $return = $item[2] . ' ' . $item[4] . ' ' . $item[6] }

status: /\d{3}/

# Without an explicit action a rule returns its last item, which would be the
# bare version number; rebuild the full token (e.g. "HTTP/1.1") instead.
protocol: 'HTTP/' version
    { $return = $item[1] . $item[2] }

# Standard HTTP request methods.
method: 'GET' | 'HEAD' | 'POST' | 'PUT' | 'PATCH' | 'DELETE' | 'OPTIONS' | 'CONNECT' | 'TRACE'

ws: /[ ]+/

url: /\S+/

referrer: quotedstring2

# Response size in bytes, or a dash when no body was sent.
responsesize: '-' | /\d+/

useragent: quotedstring2

date: day '/' month '/' year
    { $return = join('/', $item[1], $item[3], $item[5]) }

day: /\d+/

month: 'Jan' | 'Feb' | 'Mar' | 'Apr' | 'May' | 'Jun' |
       'Jul' | 'Aug' | 'Sep' | 'Oct' | 'Nov' | 'Dec'

year: /\d{4}/

time: /\d{2}:\d{2}:\d{2}/

timezone: ('+'|'-') /\d{4}/
    { $return = $item[1] . $item[2] }

octet: /\d+/

# Dotted-quad IPv4 address only; host names and IPv6 addresses do not match.
# The repeated subrule yields an array reference holding the last three octets.
ip: octet ('.' octet)(3)
    { $return = $item[1] . '.' . join('.', @{$item[2]}) }

# major.minor with a literal dot between the numbers.
version: /\d+\.\d+/

# Contents of a double-quoted field, returned without the surrounding quotes.
# The value may be empty, and backslash escapes (such as an escaped double
# quote) are consumed as pairs and returned verbatim, not unescaped.
quotedstring2: '"' /(?:[^"\\]|\\.)*/ '"'
    { $return = $item[2] }
END_GRAMMAR
# Compile the grammar once; new() yields a false value when the grammar is
# invalid, in which case there is nothing useful left to do.
my $parser = Parse::RecDescent->new($grammar) or die "Bad grammar";

# Read log lines from the files named on the command line (or STDIN) and dump
# the parsed structure of each one.  Lines that do not match the grammar are
# reported on STDERR with their input line number and skipped, so STDOUT only
# ever carries dumps of successfully parsed lines.
while (my $log_line = <>) {
    my $entry = $parser->line($log_line);
    unless ($entry) {
        warn "Parse error at input line $.\n";
        next;
    }
    print Dumper($entry);
}