Parsing nginx access- and error-logs
.py
ETL
This is a collection of several implementations of nginx access- and error-log parsing, with a plan to compare their performance someday.
- straightforward, regex-only – expected to be the most robust and the fastest,
- OOP-styled, with `dataclasses` – expected to be the nicest to work with,

and this one, just for comparison:
- OOP-styled, with `NamedTuple` – just for comparison.
OOP vs. non-OOP code
the gist itself
Non-OOP and `dataclasses`-based solutions
import dataclasses
import re
from datetime import datetime
from typing import ClassVar, Optional
@dataclasses.dataclass(frozen=True)
class NgxAccessLine:
str
remote_addr: str
remote_user: str | datetime
ts: str
request: str | int
status: str | int
body_bytes_sent: str
http_referer: str
http_user_agent:
bool] = False
DO_CAST: ClassVar[
# RegEx for default `combined` nginx access log format
# $remote_addr - $remote_user [$time_local]
# "$request" $status $body_bytes_sent
# "$http_referer" "$http_user_agent"
= re.compile(
access_line_pattern: ClassVar[re.Pattern] r'^'
r'([0-9.]+) - (-|\S+) \[([^]]+)\] '
r'"(.*)" (\d+) (\d+) '
r'"(.*)" "(.*)"'
r'$'
)
@classmethod
def from_string(cls, s):
if m := cls.access_line_pattern.match(s):
= dict(zip(
kwargs 'remote_addr', 'remote_user', 'ts',
('request', 'status', 'body_bytes_sent',
'http_referer', 'http_user_agent'),
m.groups()
))
if cls.DO_CAST:
kwargs.update({'ts': datetime.strptime(kwargs['ts'], '%d/%b/%Y:%H:%M:%S %z'),
'status': int(kwargs['status']),
'body_bytes_sent': int(kwargs['body_bytes_sent']),
})
return cls(**kwargs)
@classmethod
def fields(cls):
return tuple(f.name for f in dataclasses.fields(cls))
def as_tuple(self):
return dataclasses.astuple(self)
def as_dict(self):
return dataclasses.asdict(self)
@dataclasses.dataclass(frozen=True)
class NgxErrLine:
ts: datetimestr
level: int
pid: int
tid: int | None
cid: str
msg: # Additional attributes revealed from `msg`
str] = None
remote_addr: Optional[
bool] = False
DO_CAST: ClassVar[
# Nginx error log format
# YYYY/MM/DD HH:MM:SS [level] pid#tid: *conn_number message
# ref: nginx/src/core/ngx_log.c, nginx/src/core/ngx_connection.h
= re.compile(
error_line_pattern: ClassVar[re.Pattern] r'^'
r'([\d/]+ [\d:]+) \[(\w+)\] (\d+)#(\d+):(?: \*(\d+))? (.*)'
r'$'
)
= re.compile(r'\bclient: (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b')
msg_client_pattern
@classmethod
def from_string(cls, s):
if m := cls.error_line_pattern.match(s):
= dict(zip(
kwargs 'ts', 'level', 'pid', 'tid', 'cid', 'msg'),
(
m.groups()
))
# Additional attributes revealed from `msg`
if remote_addr := cls.msg_client_pattern.search(kwargs['msg']):
'remote_addr': remote_addr.group(1)})
kwargs.update({
if cls.DO_CAST:
kwargs.update({'ts': datetime.strptime(kwargs['ts'], '%Y/%m/%d %H:%M:%S'),
'pid': int(kwargs['pid']),
'tid': int(kwargs['tid']),
'cid': int(kwargs['cid']) if kwargs['cid'] else None,
})
return cls(**kwargs)
@classmethod
def fields(cls):
return tuple(f.name for f in dataclasses.fields(cls))
def as_tuple(self):
return dataclasses.astuple(self)
def as_dict(self):
return dataclasses.asdict(self)
Now we can compare the simple non-object-oriented solution with the `dataclasses`-based one (note: with or without the `DO_CAST` option).
import bz2
import csv
import gzip
from datetime import datetime
from pathlib import Path
# from ngxlogz_namdtup import NgxAccessLine, NgxErrLine # obsoleted; for comparison
from ngxlogz import NgxAccessLine, NgxErrLine
# Directory that is scanned for rotated nginx log files.
IN_LOGDIR = Path('~/tmp/ngx').expanduser()
# Output files for the parsed access-log and error-log records.
OUT_FILE_A = Path('~/tmp/ngx/az.csv.bz2').expanduser()
OUT_FILE_E = Path('~/tmp/ngx/ez.csv.bz2').expanduser()
# Whether the parsers convert timestamps and numeric fields from strings.
DO_CAST = False
def ngx_access_linebyline(in_logdir=Path('/var/log/nginx'), glob_pattern='access.log-*'):
    """Generate log lines from all the nginx access.log files.

    Files in `in_logdir` matching `glob_pattern` are visited in order of
    modification time, oldest first.  Files whose suffix is ``.gz`` are read
    through `gzip`; everything else is opened as plain text.

    Yields:
        Each line with trailing whitespace (including the newline) stripped.
    """
    log_files = sorted(Path(in_logdir).glob(glob_pattern),
                       key=lambda fp: fp.stat().st_mtime)
    for fpath in log_files:
        opener = gzip.open(fpath, 'rt') if fpath.suffix == '.gz' else open(fpath)
        with opener as fo:
            # Iterate the file object directly so lines are streamed instead
            # of loading the whole file into memory with readlines().
            yield from map(str.rstrip, fo)
def ngx_err_linebyline(in_logdir=Path('/var/log/nginx'), glob_pattern='error.log-*'):
    """Generate log lines from all the nginx error.log files.

    Files in `in_logdir` matching `glob_pattern` are visited in order of
    modification time, oldest first.  Files whose suffix is ``.gz`` are read
    through `gzip`; everything else is opened as plain text.

    Yields:
        Each line with trailing whitespace (including the newline) stripped.
    """
    log_files = sorted(Path(in_logdir).glob(glob_pattern),
                       key=lambda fp: fp.stat().st_mtime)
    for fpath in log_files:
        opener = gzip.open(fpath, 'rt') if fpath.suffix == '.gz' else open(fpath)
        with opener as fo:
            # Iterate the file object directly so lines are streamed instead
            # of loading the whole file into memory with readlines().
            yield from map(str.rstrip, fo)
def dump_csv_bz2(fpath, header, rows):
    """A .csv.bz2 writer.

    Pass a generator as `rows`, to benchmark objects creation and transformations.

    Args:
        fpath: destination path of the bz2-compressed CSV file.
        header: iterable of column names, written as the first row.
        rows: iterable of rows; it is consumed inside the timed section.

    Prints the elapsed wall-clock time (a `timedelta`) when done.
    """
    t0 = datetime.now()
    # newline='' lets the csv module control line endings itself.
    with bz2.open(fpath, 'wt', newline='') as fo:
        cw = csv.writer(fo)
        cw.writerow(header)
        cw.writerows(rows)
    print(datetime.now() - t0)
## ---------------------------------------------------------------------
## -- Simple non-obj-oriented solution, for performance comparison
print('Processing access-logs fast')

# Reuse the compiled pattern of the dataclass so both solutions match the same lines.
p = NgxAccessLine.access_line_pattern
def parse_access_logline(s, do_cast=DO_CAST):
    """Parse one access-log line into a plain list of captured groups.

    Args:
        s: a single log line without the trailing newline.
        do_cast: when true, element 2 (the timestamp) is parsed into a
            `datetime` and elements 4 and 5 (status, bytes sent) into `int`.

    Returns:
        A list of the regex groups of the module-level pattern `p`.

    Raises:
        ValueError: if `s` does not match the pattern.
    """
    m = p.match(s)
    if m is None:
        # Fail with the offending line instead of an opaque
        # "'NoneType' object has no attribute 'groups'".
        raise ValueError(f'unparsable access log line: {s!r}')
    g = list(m.groups())
    if do_cast:
        g[2] = datetime.strptime(g[2], '%d/%b/%Y:%H:%M:%S %z')
        g[4] = int(g[4])
        g[5] = int(g[5])
    return g
# Fast path: plain lists produced by parse_access_logline.
# NOTE(review): Path.with_suffix replaces only the last suffix, so the output
# presumably ends up as `az.csv.tmp.csv.bz2` — confirm this name is intended.
dump_csv_bz2(
    OUT_FILE_A.with_suffix('.tmp.csv.bz2'),
    NgxAccessLine.fields(),
    map(parse_access_logline, ngx_access_linebyline(IN_LOGDIR))
)

## ---------------------------------------------------------------------
# Propagate the casting option to both dataclasses before parsing.
NgxAccessLine.DO_CAST = DO_CAST
NgxErrLine.DO_CAST = DO_CAST

# NOTE: from_string() returns None for a line that does not match, in which
# case `.as_tuple()` below raises AttributeError.
print('Processing access-logs with our module utilizing dataclasses')
dump_csv_bz2(
    OUT_FILE_A,
    NgxAccessLine.fields(),
    (NgxAccessLine.from_string(s).as_tuple() for s in ngx_access_linebyline(IN_LOGDIR))
)

print('Processing error-logs with our module utilizing dataclasses')
dump_csv_bz2(
    OUT_FILE_E,
    NgxErrLine.fields(),
    (NgxErrLine.from_string(s).as_tuple() for s in ngx_err_linebyline(IN_LOGDIR))
)
An additional `NamedTuple`-based solution
import re
from datetime import datetime
from typing import ClassVar, NamedTuple, Optional
class NgxAccessLine(NamedTuple):
    """One line of an nginx access log in the default `combined` format.

    Build instances with :meth:`from_string`, which also converts the
    timestamp to a timezone-aware `datetime` and the status / byte count
    to `int`.
    """

    remote_addr: str
    remote_user: str
    time_local: datetime
    request: str
    status: int
    body_bytes_sent: int
    http_referer: str
    http_user_agent: str

    # Default `combined` nginx access log format
    #   '$remote_addr - $remote_user [$time_local] '
    #   '"$request" $status $body_bytes_sent '
    #   '"$http_referer" "$http_user_agent"'
    # Deliberately NOT annotated: typing.NamedTuple treats every annotated
    # name as a tuple field and does not support ClassVar.
    access_line_pattern = re.compile(
        r'^'
        r'([0-9.]{7,15}) - (.*) \[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\] '
        r'"(.*)" (\d+) (\d+) '
        r'"(.*)" "(.*)"'
        r'$'
    )

    @classmethod
    def from_string(cls, s):
        """Parse one access-log line.

        Args:
            s: a single log line without the trailing newline.

        Returns:
            A new instance, or ``None`` when `s` does not match
            `access_line_pattern`.
        """
        m = cls.access_line_pattern.match(s)
        if m is None:
            return None
        (
            remote_addr, remote_user, time_local,
            request, status, body_bytes_sent,
            http_referer, http_user_agent
        ) = m.groups()
        # Arguments follow the field declaration order above.
        return cls(
            remote_addr, remote_user,
            datetime.strptime(time_local, '%d/%b/%Y:%H:%M:%S %z'),
            request, int(status), int(body_bytes_sent),
            http_referer, http_user_agent,
        )
class NgxErrLine(NamedTuple):
    """One line of an nginx error log.

    Build instances with :meth:`from_string`, which converts the timestamp
    to a naive `datetime` and the numeric parts to `int`.
    """

    ts: datetime
    level: str
    pid: int
    tid: int
    conn_number: int  # Optional uint, we use -1 if it is absent
    msg: str
    # Additional err-related attributes revealed from `msg`
    rel_ip: Optional[str] = None
    # rel_wtf: Optional[str] = None

    # Maintained nginx error log format
    #   YYYY/MM/DD HH:MM:SS [{level}] {pid}#{tid}: *{conn_number} {msg}
    # ref: nginx/src/core/ngx_log.c, nginx/src/core/ngx_connection.h
    # sample: 2021/04/19 17:48:31 [crit] 5797#5797: *1902 SSL_do_handshake() failed (SSL: error:14201044:SSL routines:tls_choose_sigalg:internal error) while SSL handshaking, client: 0.0.0.0, server: 0.0.0.0:443
    # Deliberately NOT annotated: typing.NamedTuple treats every annotated
    # name as a tuple field and does not support ClassVar.
    error_line_pattern = re.compile(
        r'^'
        r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (\d+)#(\d+):(?: \*(\d+))? (.*)'
        r'$'
    )
    # Compiled once here instead of being looked up on every call.
    msg_client_pattern = re.compile(r'\bclient: (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b')

    @classmethod
    def from_string(cls, s):
        """Parse one error-log line.

        Args:
            s: a single log line without the trailing newline.

        Returns:
            A new instance, or ``None`` when `s` does not match
            `error_line_pattern`.  `rel_ip` is filled only when `msg`
            contains exactly one `client: a.b.c.d` fragment.
        """
        m = cls.error_line_pattern.match(s)
        if m is None:
            return None
        ts, level, pid, tid, conn_number, msg = m.groups()
        kwargs = {}
        clients = cls.msg_client_pattern.findall(msg)
        if len(clients) == 1:
            kwargs['rel_ip'] = clients[0]
        return cls(
            datetime.strptime(ts, '%Y/%m/%d %H:%M:%S'), level, int(pid), int(tid),
            int(conn_number) if conn_number else -1, msg, **kwargs
        )
Performance measurement
Not complete
As mentioned above, this section is a plan for someday and is not finished yet.
Define functions that read logs and return them line by line:
def genaccesslines(logdir=None, pattern='access.log-*.gz'):
    """Yield the lines of all gzipped access logs, one by one.

    Args:
        logdir: directory to scan; defaults to `tmp/nginx-logz-0718` under
            the current working directory.
        pattern: glob pattern selecting the log files.

    Yields:
        Each line with trailing whitespace (including the newline) stripped.
        Files are visited in sorted path order.
    """
    if logdir is None:
        logdir = Path(Path.cwd(), 'tmp/nginx-logz-0718')
    # glob() order is unspecified; sort for a reproducible line order.
    for fn in sorted(Path(logdir).glob(pattern)):
        with gzip.open(fn, 'rt') as fo:
            # Stream the file instead of materialising it with readlines().
            yield from map(str.rstrip, fo)
def generrorlines(logdir=None, pattern='error.log-*.gz'):
    """Yield the lines of all gzipped error logs, one by one.

    Args:
        logdir: directory to scan; defaults to `tmp/nginx-logz-0718` under
            the current working directory.
        pattern: glob pattern selecting the log files.

    Yields:
        Each line with trailing whitespace (including the newline) stripped.
        Files are visited in sorted path order.
    """
    if logdir is None:
        logdir = Path(Path.cwd(), 'tmp/nginx-logz-0718')
    # glob() order is unspecified; sort for a reproducible line order.
    for fn in sorted(Path(logdir).glob(pattern)):
        with gzip.open(fn, 'rt') as fo:
            # Stream the file instead of materialising it with readlines().
            yield from map(str.rstrip, fo)
Then, depending on the decoding method, run one of the following:
# Variant for the dataclasses-based classes.
t0 = datetime.now()  # [TIMING]

# NOTE: from_string() yields None for a non-matching line, which
# dataclasses.astuple() rejects with a TypeError.
with open('tmp/az.csv', 'w', newline='') as fo:
    w = csv.writer(fo)
    w.writerow(_.name for _ in dataclasses.fields(NgxAccessLine))
    w.writerows(map(dataclasses.astuple, map(NgxAccessLine.from_string, genaccesslines())))

with open('tmp/ez.csv', 'w', newline='') as fo:
    w = csv.writer(fo)
    w.writerow(_.name for _ in dataclasses.fields(NgxErrLine))
    w.writerows(map(dataclasses.astuple, map(NgxErrLine.from_string, generrorlines())))

print(datetime.now() - t0)  # [TIMING]
# Variant for the NamedTuple-based classes: instances are tuples already,
# so they are handed to the csv writer directly.
t0 = datetime.now()  # [TIMING]

with open('tmp/az.csv', 'w', newline='') as fo:
    w = csv.writer(fo)
    w.writerow(NgxAccessLine._fields)
    w.writerows(map(NgxAccessLine.from_string, genaccesslines()))

with open('tmp/ez.csv', 'w', newline='') as fo:
    w = csv.writer(fo)
    w.writerow(NgxErrLine._fields)
    w.writerows(map(NgxErrLine.from_string, generrorlines()))

print(datetime.now() - t0)  # [TIMING]