#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author:  Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
#          https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.

# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.md

"""See the check's README for more details."""

import argparse
import datetime
import hashlib
import json
import re
import sys

import lib.args
import lib.base
import lib.db_sqlite
import lib.disk
import lib.human
import lib.icinga
import lib.lftest
import lib.shell
import lib.time
import lib.txt
from lib.globals import STATE_OK, STATE_UNKNOWN

__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
__version__ = '2026042201'

DESCRIPTION = """Queries the systemd journal using journalctl and alerts when matching entries are found.
Supports all journalctl filtering options such as --unit, --priority, --facility,
--identifier, and --grep. Useful for monitoring specific log patterns in real time.
Optionally integrates with Icinga: when the service is acknowledged, the matching
events are suppressed on following runs so they don't re-alert.
Requires root or sudo."""

# Acknowledged-event records older than this many days are pruned from the
# state DB on every run that uses `--icinga-callback`.
ACK_RETENTION_DAYS = 30

# Defaults for the command line options defined in parse_args().
DEFAULT_FACILITY = None
DEFAULT_ICINGA_CALLBACK = False
DEFAULT_IDENTIFIER = None
# `--insecure` is a `store_true` switch, so its default has to be False.
# With a default of True the switch is a no-op and certificate verification
# towards the Icinga API could never be turned on.
DEFAULT_INSECURE = False
DEFAULT_NO_PROXY = False
DEFAULT_PRIORITY = 'emerg..err'
# (sic) the misspelled name is kept because parse_args() refers to it.
DEFAULT_SERVERITY = 'warn'
DEFAULT_SINCE = '-8h'
DEFAULT_TIMEOUT = 5
DEFAULT_UNIT = None
DEFAULT_USER_UNIT = None

# don't sort JOURNALD_PRIOS alphabetically, we need the indexes (0 = emerg etc.)
# main() maps the numeric journald PRIORITY field to a name via this list.
JOURNALD_PRIOS = [
    'emerg',
    'alert',
    'crit',
    'err',
    'warning',
    'notice',
    'info',
    'debug',
]


def parse_args():
    """Build the argparse parser for this check and return the parsed options.

    Options are registered in alphabetical order. Arguments the parser does
    not know are tolerated (`parse_known_args()`) and dropped.
    """
    arg_parser = argparse.ArgumentParser(description=DESCRIPTION)
    add_arg = arg_parser.add_argument

    add_arg(
        '-V', '--version',
        version=f'%(prog)s: v{__version__} by {__author__}',
        action='version',
    )

    add_arg(
        '--always-ok',
        dest='ALWAYS_OK',
        default=False,
        action='store_true',
        help=lib.args.help('--always-ok'),
    )

    add_arg(
        '--facility',
        dest='FACILITY',
        default=DEFAULT_FACILITY,
        help=(
            'Filter output by syslog facility (passed to journalctl). '
            'Takes a comma-separated list of numbers or facility names. '
            'Default: %(default)s'
        ),
    )

    # --- Icinga integration -------------------------------------------------
    add_arg(
        '--icinga-callback',
        dest='ICINGA_CALLBACK',
        default=DEFAULT_ICINGA_CALLBACK,
        action='store_true',
        help=(
            'Get the service acknowledgement from Icinga. When the service is '
            'acknowledged, the currently reported journald events are persisted as '
            '"already handled" so they no longer trigger alerts on following runs. '
            'Default: %(default)s'
        ),
    )

    add_arg(
        '--icinga-password',
        dest='ICINGA_PASSWORD',
        help='Icinga API password.',
    )

    add_arg(
        '--icinga-service-name',
        dest='ICINGA_SERVICE_NAME',
        help=(
            'Unique name of the service using this check within Icinga. '
            'Take it from the `__name` service attribute. '
            'Example: `icinga-server!my-service-name`.'
        ),
    )

    add_arg(
        '--icinga-url',
        dest='ICINGA_URL',
        help='Icinga API URL. Example: `https://icinga-server:5665`.',
    )

    add_arg(
        '--icinga-username',
        dest='ICINGA_USERNAME',
        help='Icinga API username.',
    )

    # --- journal filters ----------------------------------------------------
    add_arg(
        '--identifier',
        dest='IDENTIFIER',
        default=DEFAULT_IDENTIFIER,
        help=(
            'Show messages for the specified syslog identifier (passed to journalctl). '
            'Default: %(default)s'
        ),
    )

    # Both ignore options are repeatable; they stay None when never given.
    add_arg(
        '--ignore-pattern',
        dest='IGNORE_PATTERN',
        default=None,
        action='append',
        help=(
            'Any line containing this case-sensitive string in the MESSAGE field will be ignored. '
            'Can be specified multiple times. '
            'Unlike journalctl, this allows easy string-based filtering.'
        ),
    )

    add_arg(
        '--ignore-regex',
        dest='IGNORE_REGEX',
        default=None,
        action='append',
        help=(
            'Any line matching this Python regex on the MESSAGE field will be ignored. '
            'Can be specified multiple times. '
            "Example: `--ignore-regex='(?i)linuxfabrik'`."
        ),
    )

    add_arg(
        '--insecure',
        dest='INSECURE',
        default=DEFAULT_INSECURE,
        action='store_true',
        help=lib.args.help('--insecure'),
    )

    add_arg(
        '--no-proxy',
        dest='NO_PROXY',
        default=DEFAULT_NO_PROXY,
        action='store_true',
        help=lib.args.help('--no-proxy'),
    )

    add_arg(
        '--priority',
        dest='PRIORITY',
        default=DEFAULT_PRIORITY,
        help=(
            'Filter output by message priorities or priority ranges (passed to journalctl). '
            'Default: %(default)s'
        ),
    )

    add_arg(
        '--severity',
        dest='SEVERITY',
        choices=['warn', 'crit'],
        default=DEFAULT_SERVERITY,
        help=(
            'Severity for alerts when journalctl returns results. '
            'Default: %(default)s'
        ),
    )

    add_arg(
        '--since',
        dest='SINCE',
        default=DEFAULT_SINCE,
        help=(
            'Show entries on or newer than the specified date (passed to journalctl). '
            'Default: %(default)s'
        ),
    )

    add_arg(
        '--test',
        dest='TEST',
        type=lib.args.csv,
        help=lib.args.help('--test'),
    )

    add_arg(
        '--timeout',
        dest='TIMEOUT',
        type=int,
        default=DEFAULT_TIMEOUT,
        help=lib.args.help('--timeout') + ' Default: %(default)s (seconds)',
    )

    # Repeatable unit filters; None (the default) means "not given at all".
    add_arg(
        '--unit',
        dest='UNIT',
        action='append',
        default=DEFAULT_UNIT,
        help=(
            'Show messages for the specified systemd unit UNIT|PATTERN (passed to journalctl). '
            'Can be specified multiple times. '
            'Default: %(default)s'
        ),
    )

    add_arg(
        '--user-unit',
        dest='USER_UNIT',
        action='append',
        default=DEFAULT_USER_UNIT,
        help=(
            'Show messages for the specified user session unit (passed to journalctl). '
            'Can be specified multiple times. '
            'Default: %(default)s'
        ),
    )

    known_args, _unknown_args = arg_parser.parse_known_args()
    return known_args


def _join_and(items):
    """Join strings as an English enumeration: `a`, `a and b`, `a, b and c`."""
    if len(items) <= 1:
        return ''.join(items)
    return f'{", ".join(items[:-1])} and {items[-1]}'


def _message_text(raw):
    """Return a journald MESSAGE value as plain text.

    `journalctl --output=json` normally emits MESSAGE as a string, but it may
    also be an array: raw byte values for non-UTF-8 content, or several values
    when the field occurs more than once in the entry.
    """
    if isinstance(raw, str):
        return raw
    if isinstance(raw, list):
        if all(isinstance(part, int) for part in raw):
            try:
                return bytes(raw).decode('utf-8', errors='replace')
            except ValueError:
                # values outside 0..255 are not bytes; show them verbatim
                return str(raw)
        return '\n'.join(str(part) for part in raw)
    return str(raw)


def _event_unit(event):
    """Return the unit name to display for an event, without `.service`.

    The first of UNIT, _SYSTEMD_UNIT and _SYSTEMD_SLICE that holds a string
    wins; `unknown` is returned when the event carries none of them.
    """
    for key in ('UNIT', '_SYSTEMD_UNIT', '_SYSTEMD_SLICE'):
        value = event.get(key)
        if isinstance(value, str):
            return value.replace('.service', '')
    return 'unknown'


def main():
    """Run the check: query the journal, filter the events, report to Icinga.

    Steps:

    1. Parse the command line and validate the Icinga callback options.
    2. With `--icinga-callback`, open the per-filter-set state DB, prune old
       records and load the fingerprints of already acknowledged events.
    3. Run `journalctl` (or load test data with `--test`).
    4. Drop events that match an ignore pattern/regex or were acknowledged.
    5. With `--icinga-callback` and remaining events, ask Icinga whether the
       service is acknowledged; if so, persist ALL remaining events and
       return OK.
    6. Build the message and perfdata and hand them to `lib.base.oao()`.
    """
    # Local import: only needed here, to quote values for the command line.
    import shlex

    # parse the command line
    try:
        args = parse_args()
    except SystemExit:
        sys.exit(STATE_UNKNOWN)

    # set default values for append parameters that were not specified
    if args.IGNORE_PATTERN is None:
        args.IGNORE_PATTERN = []
    if args.IGNORE_REGEX is None:
        args.IGNORE_REGEX = []

    if args.ICINGA_CALLBACK and not all(
        (
            args.ICINGA_URL,
            args.ICINGA_PASSWORD,
            args.ICINGA_USERNAME,
            args.ICINGA_SERVICE_NAME,
        )
    ):
        lib.base.cu(
            '`--icinga-callback` requires `--icinga-url`, `--icinga-password`, `--icinga-username` and `--icinga-service-name`.'
        )

    # Persisted ack state is only needed when the Icinga callback is in use.
    # When it is, each unique combination of filter arguments gets its own
    # state DB so two Icinga services watching the journal with different
    # filters do not share ack state.
    acked_fingerprints = set()
    ack_conn = None
    if args.ICINGA_CALLBACK:
        instance_payload = json.dumps(
            {
                'facility': args.FACILITY,
                'identifier': args.IDENTIFIER,
                'ignore_pattern': sorted(args.IGNORE_PATTERN),
                'ignore_regex': sorted(args.IGNORE_REGEX),
                'priority': args.PRIORITY,
                'since': args.SINCE,
                'unit': sorted(args.UNIT) if args.UNIT else [],
                'user_unit': sorted(args.USER_UNIT) if args.USER_UNIT else [],
            },
            sort_keys=True,
        ).encode('utf-8')
        instance_hash = hashlib.sha256(instance_payload).hexdigest()[:10]
        ack_db_filename = (
            f'linuxfabrik-monitoring-plugins-journald-query-{instance_hash}.db'
        )
        ack_conn = lib.base.coe(lib.db_sqlite.connect(filename=ack_db_filename))
        definition = """
            event_hash TEXT NOT NULL PRIMARY KEY,
            event_timestamp INTEGER NOT NULL,
            acknowledged_at TIMESTAMP NOT NULL
        """
        lib.base.coe(
            lib.db_sqlite.create_table(
                ack_conn, definition, table='acknowledged_events'
            )
        )
        # Prune ack records that are older than ACK_RETENTION_DAYS to keep
        # the DB bounded. By that age the event has rotated out of the
        # journal and can no longer re-appear anyway.
        retention_cutoff = lib.time.now(as_type='datetime') - datetime.timedelta(
            days=ACK_RETENTION_DAYS
        )
        lib.base.coe(
            lib.db_sqlite.delete(
                ack_conn,
                """
                DELETE FROM acknowledged_events
                WHERE acknowledged_at <= :cutoff
                """,
                {'cutoff': retention_cutoff},
            )
        )
        rows = lib.base.coe(
            lib.db_sqlite.select(
                ack_conn,
                'SELECT event_hash FROM acknowledged_events',
                fetchone=False,
            )
        )
        acked_fingerprints = {row['event_hash'] for row in rows}

    # fetch data
    if args.TEST is None:
        # User-supplied values are shell-quoted so that values containing
        # whitespace (e.g. `--since='2 hours ago'`) stay a single argument.
        # shlex.quote() leaves ordinary values such as `-8h` or `emerg..err`
        # untouched. NOTE(review): assumes lib.shell.shell_exec() tokenizes
        # the command shell-style - the double-quoted unit filters below
        # already rely on that.
        cmd = 'journalctl '
        # cmd += '--boot '  # logs for the current boot will be shown
        cmd += '--reverse '
        cmd += '--quiet '
        cmd += '--output=json '
        cmd += f'--priority={shlex.quote(args.PRIORITY)} '
        cmd += f'--since={shlex.quote(args.SINCE)} '
        if args.FACILITY:
            cmd += f'--facility={shlex.quote(args.FACILITY)} '
        if args.IDENTIFIER:
            cmd += f'--identifier={shlex.quote(args.IDENTIFIER)} '
        # unfortunately only for newer journalctl commands:
        # cmd += '--output-fields=UNIT,_SYSTEMD_UNIT,_SYSTEMD_SLICE,PRIORITY,MESSAGE '
        if args.UNIT is None and args.USER_UNIT is None:
            # Pre-define a standard set on basic system services we want to warn about,
            # found on fresh rhel 7+, ubuntu 16+ and debian 9+ systems altogether, if no unit
            # is provided. And yes, if called without any --unit parameter(s), we therefore ignore
            # errors on any specific application services like httpd etc. To check for application
            # services, call this check separately using --unit=httpd, for example.
            # Attention: '*' is the only wildcard that works.
            units = [
                '--unit="accounts-daemon.service"',
                '--unit="acpid.service"',
                '--unit="apparmor.service"',
                '--unit="apport.service"',
                '--unit="auditd.service"',
                '--unit="cron.service"',
                '--unit="crond.service"',
                '--unit="dbus.service"',
                '--unit="dracut-*.service"',
                '--unit="haveged.service"',
                '--unit="ifplugd.service"',
                '--unit="ifup@*.service"',
                '--unit="init.scope"',
                '--unit="irqbalance.service"',
                '--unit="iscsid.service"',
                '--unit="lvm2-*.service"',
                '--unit="lxcfs.service"',
                '--unit="mdadm.service"',
                '--unit="network.service"',
                '--unit="NetworkManager*.service"',
                '--unit="open-iscsi.service"',
                '--unit="polkit.service"',
                '--unit="polkitd.service"',
                '--unit="qemu-guest-agent.service"',
                '--unit="rsyslog.service"',
                '--unit="session-*.scope"',
                '--unit="snapd*.service"',
                '--unit="ssh.service"',
                '--unit="sshd*.service"',
                '--unit="sssd.service"',
                '--unit="sysstat.service"',
                '--unit="systemd-*.service"',
                '--unit="user@*.service"',
            ]
            cmd += ' '.join(units)
        if args.UNIT is not None:
            for unit in args.UNIT:
                cmd += f'--unit="{unit}" '
        if args.USER_UNIT is not None:
            for unit in args.USER_UNIT:
                cmd += f'--user-unit="{unit}" '
        cmd = cmd.strip()
        stdout, stderr, _retc = lib.base.coe(lib.shell.shell_exec(cmd))
        if stderr:
            lib.base.cu(stderr)
    else:
        # do not call the command, put in test data
        cmd = 'no-real-command-used'
        stdout, stderr, _retc = lib.lftest.test(args.TEST)

    # init some vars
    cnt = 0
    shortened = False
    state = STATE_OK
    events = []  # every event that survived the ignore and ack filters
    table_data = []  # the (possibly truncated) subset shown in the output

    # analyze data
    if stdout:
        compiled_ignore_regex = [re.compile(item) for item in args.IGNORE_REGEX]
        for line in stdout.splitlines():
            if not line.strip():
                continue
            try:
                event = json.loads(line)
            except ValueError:
                # NOTE(review): lib.base.cu() is expected to terminate the
                # plugin here - confirm in lib.base.
                lib.base.cu(f'Unable to interpret journald event: {line}')

            # A missing or null MESSAGE cannot be reported, skip the entry.
            raw_message = event.get('MESSAGE')
            if raw_message is None:
                continue
            message = _message_text(raw_message)
            if any(
                ignore_pattern in message for ignore_pattern in args.IGNORE_PATTERN
            ) or any(regex.search(message) for regex in compiled_ignore_regex):
                continue
            # Stable per-event fingerprint for ack persistence. Kept
            # deliberately simple (timestamp + message) so the same hash is
            # derived the next time journalctl returns the same entry. It is
            # built from the raw MESSAGE value, so hashes already stored in
            # existing state DBs keep matching.
            event_fingerprint = hashlib.sha256(
                (
                    f'{event.get("__REALTIME_TIMESTAMP", "")}|'
                    f'{raw_message}'
                ).encode()
            ).hexdigest()
            if event_fingerprint in acked_fingerprints:
                continue
            event['_fingerprint'] = event_fingerprint
            # shorten message if necessary
            if len(message) > 80:
                message = message[0:77] + '...'
            event['MESSAGE'] = message

            event['unit'] = _event_unit(event)
            event['priority'] = JOURNALD_PRIOS[int(event['PRIORITY'])]
            event['timestamp'] = lib.time.epoch2iso(
                int(event['__REALTIME_TIMESTAMP']) / 1000000
            )

            events.append(event)

        cnt = len(events)
        if cnt:
            # found something, so nothing good
            state = lib.base.str2state(args.SEVERITY)
        if cnt > 10:
            # only the display is shortened; `events` keeps everything so
            # that an acknowledgement covers all reported events
            table_data = events[0:5] + events[-5:]
            shortened = True
        else:
            table_data = events

    # Ask Icinga about the service acknowledgement. If acknowledged, persist
    # the fingerprints of the events that are currently being reported so
    # they do not re-alert on following runs, and return OK to Icinga. See
    # issue #649.
    msg_addendum = ''
    if args.ICINGA_CALLBACK and state != STATE_OK:
        success, icinga = lib.icinga.get_service(
            args.ICINGA_URL,
            args.ICINGA_USERNAME,
            args.ICINGA_PASSWORD,
            servicename=args.ICINGA_SERVICE_NAME,
            attrs='state,acknowledgement',
            insecure=args.INSECURE,
            no_proxy=args.NO_PROXY,
            timeout=args.TIMEOUT,
        )
        if not success:
            msg_addendum += (
                f'Note: Could not determine the acknowledgement from the Icinga API:\n{icinga}.'
            )
        else:
            # Only the lookup is guarded; an empty result list, a missing
            # key or an unexpected response type all mean "unknown".
            try:
                acknowledged = bool(
                    icinga['results'][0]['attrs']['acknowledgement']
                )
            except (IndexError, KeyError, TypeError):
                acknowledged = None

            if acknowledged is None:
                msg_addendum += (
                    'Note: Could not determine the acknowledgement from the '
                    'Icinga API, this could be due to an incorrect service name.'
                )
            elif acknowledged:
                now_dt = lib.time.now(as_type='datetime')
                # Persist every reported event, not just the rows shown in
                # the truncated table, otherwise the hidden ones would
                # re-alert on the next run.
                for event in events:
                    lib.base.coe(
                        lib.db_sqlite.replace(
                            ack_conn,
                            {
                                'event_hash': event['_fingerprint'],
                                'event_timestamp': int(
                                    event.get('__REALTIME_TIMESTAMP', 0)
                                ),
                                'acknowledged_at': now_dt,
                            },
                            table='acknowledged_events',
                        )
                    )
                state = STATE_OK
            else:
                msg_addendum += (
                    'Note: Acknowledge this service to reset the state to OK.'
                )

    if ack_conn is not None:
        lib.base.coe(lib.db_sqlite.commit(ack_conn))
        lib.db_sqlite.close(ack_conn)

    # build the message
    if table_data:
        sev_str = lib.base.state2str(
            lib.base.str2state(args.SEVERITY),
            prefix=' ',
        )
        msg = (
            f'{cnt}'
            f' {lib.txt.pluralize("event", cnt)}.'
            f' Latest event at {table_data[0]["timestamp"]}'
            f' from {table_data[0]["unit"]},'
            f' level {table_data[0]["priority"]}:'
            f' `{table_data[0]["MESSAGE"]}`{sev_str}'
        )
        if shortened:
            msg += (
                '\nAttention: Table below is truncated, showing the 5 newest and '
                'the 5 oldest messages.'
            )
        msg += '\n\n' + lib.base.get_table(
            table_data,
            [
                'timestamp',
                'unit',
                'priority',
                'MESSAGE',
            ],
            header=[
                'Timestamp',
                'Unit',
                'Prio',
                'Message',
            ],
        )
        # The hint only applies when the built-in default unit filter set
        # was used, i.e. when neither --unit nor --user-unit was given.
        if args.UNIT is None and args.USER_UNIT is None:
            msg += (
                f'\nUse `journalctl --reverse'
                f' --priority={shlex.quote(args.PRIORITY)}'
                f' --since={shlex.quote(args.SINCE)}`'
                f' as a starting point for debugging.'
                f' Be aware of the fact that you may'
                f' see even more messages then, as we'
                f' use a lot of unit filters to get'
                f' only messages from basic system'
                f' services.'
            )
    else:
        # Status line analog to the logfile check: name what was queried,
        # the hit count, the filters the query ran with and any ignores.
        filter_parts = [
            f"priority='{args.PRIORITY}'",
            f"since='{args.SINCE}'",
        ]
        if args.UNIT:
            filter_parts.append(
                f'units {_join_and([f"{u!r}" for u in sorted(args.UNIT)])}'
            )
        if args.USER_UNIT:
            filter_parts.append(
                f'user-units {_join_and([f"{u!r}" for u in sorted(args.USER_UNIT)])}'
            )
        if args.FACILITY:
            filter_parts.append(f"facility='{args.FACILITY}'")
        if args.IDENTIFIER:
            filter_parts.append(f"identifier='{args.IDENTIFIER}'")

        ignore_parts = [
            f"'{item}'" for item in sorted(args.IGNORE_PATTERN + args.IGNORE_REGEX)
        ]

        msg = (
            f'Queried the systemd journal (0 events) using '
            f'{_join_and(filter_parts)}'
        )
        if ignore_parts:
            msg += f', ignoring {_join_and(ignore_parts)}'
        msg += '.'
        state = STATE_OK

    # Show the command without the output-formatting switches, so it can be
    # pasted into a terminal for human-readable output.
    full_cmd = (
        cmd.replace(
            ' --quiet',
            '',
        )
        .replace(
            ' --output=json',
            '',
        )
        .replace('\\', '')
    )
    msg += f'\nThe full command used was:\n`{full_cmd}`'
    perfdata = lib.base.get_perfdata(
        'journald-query',
        cnt,
        _min=0,
    )

    if msg_addendum:
        msg += '\n\n' + msg_addendum

    # over and out
    lib.base.oao(msg, state, perfdata, always_ok=args.ALWAYS_OK)


# Script entry point: run the check only when the file is executed directly,
# not when it is imported.
if __name__ == '__main__':
    try:
        main()
    except Exception:
        # Any exception main() did not handle itself ends up here and is
        # handed to lib.base.cu(), called without a message.
        # NOTE(review): presumably cu() reports the active traceback and
        # exits with UNKNOWN - confirm in lib.base.
        lib.base.cu()
