#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author:  Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
#          https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.

# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.md

"""See the check's README for more details."""

import argparse
import sys

import lib.args
import lib.base
import lib.lftest
import lib.shell
import lib.txt
from lib.globals import STATE_CRIT, STATE_OK, STATE_UNKNOWN

# Plugin metadata. Both values are interpolated into the `--version` output
# that parse_args() registers with argparse.
__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
# NOTE(review): looks like a YYYYMMDDNN date stamp - confirm the scheme in CONTRIBUTING.md.
__version__ = '2026050703'

# Passed to argparse as the parser description in parse_args(). This is
# user-facing runtime text, so wording changes alter what the plugin prints.
DESCRIPTION = """Checks the kernel ring buffer (dmesg) for messages at severity levels emerg, alert,
crit, and err. Known false positives and hardware-specific noise are filtered out by default; the
filtered count is reported as the `errors` perfdata so trends can be graphed. To clear reported
messages after resolving the underlying issue, run "dmesg --clear". Note: the kernel ring buffer is
a fixed-size circular buffer, so older messages are overwritten over time, and timestamps may drift
across SUSPEND/RESUME because the time source is not updated on resume.
Requires root or sudo."""

# Kernel messages that are suppressed unless the admin passes `--ignore`: false positives,
# hardware-specific noise, and bugs without operational impact. Every entry is a Python
# regular expression that main() searches for in each dmesg line. Keep the entries in
# alphabetical order and note the rationale next to each one, so that it stays possible
# to re-evaluate later whether an entry still applies.
DEFAULT_IGNORE = [
    # SCSI sd: the cache mode page is absent (Virtio/USB/SD-card disks), so the kernel
    # falls back to write-through
    ' Asking for cache data failed',
    ' Assuming drive cache: write through',
    # Raspberry Pi 3B+ (BCM4345/6): Broadcom WLAN firmware-load info; informational,
    # not an error
    ' brcmfmac: brcmf_c_preinit_dcmds: Firmware: BCM4345/6',
    # (one single pattern, wrapped only to keep the line short)
    (
        ' brcmfmac: brcmf_fw_alloc_request: using brcm/brcmfmac43455-sdio'
        ' for chip BCM4345/6'
    ),
    # RHEL 8 / Linux 5.4 and older: CIFS reconnect noise; upstream demoted it to
    # KERN_DEBUG in 2020
    ' CIFS VFS: Free previous auth_key.response = ',
    # Older kernels and virt. guests: legacy cpufreq init fails to read the CPU frequency
    r' cpufreq: __cpufreq_add_dev: ->get\(\) failed',
    # Shim/MOK config table is not exposed as EFI runtime memory; cosmetic, no impact on
    # Secure Boot. Documented for Rocky Linux 8.5,
    # https://rockylinux.org/news/rocky-linux-8-5-ga-release/
    ' EFI MOKvar config table is not in EFI runtime memory',
    # Firmware does not provide the ACPI Error Record Serialization Table; common on
    # most boards/VMs
    r' ERST: Failed to get Error Log Address Range\.',
    # i915 / virt. GPUs: DRM vsync flip timeout, https://access.redhat.com/solutions/4490391
    ' flip_done timed out',
    # Systems without legacy PS/2 ports: no PS/2 keyboard controller, which is normal
    ' i8042: No controller found',
    # ACPI power_meter: software cap lies above the firmware-declared safe range; the
    # kernel honors it but warns
    ' Ignoring unsafe software power cap!',
    # IMA/EVM is unable to load the kernel-shipped X.509 cert (-126 ENOKEY) because the
    # MOK keyring is not yet populated, https://access.redhat.com/solutions/7049158
    r' integrity: Problem loading X\.509 certificate -126',
    # CIFS DFS referral lookup fails (-5 EIO); cosmetic on shares without DFS,
    # https://access.redhat.com/solutions/3496971
    ' ioctl error in smb2_get_dfs_refer rc=-5',
    # KVM/oVirt, typically Windows guests: the guest writes MSR_IA32_DEBUGCTLMSR, which
    # the host emulates as a no-op
    ' kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR ',
    # SCSI sd: same probe path as "Asking for cache data failed"; write-through fallback
    ' No Caching mode page found',
    # OpenStack/KVM/VMware guests: the SHPC PCI hot-plug slot is already owned by
    # acpiphp/pciehp on virt. PCI bridges (-16 EBUSY); hot-plug keeps working via the
    # other driver
    ' pci_hp_register failed with error -16',
    ' Slot initialization failed',
    # i2c-piix4 / i2c-i801: SMBus controller is absent or disabled in the BIOS; no impact
    # on monitoring, https://access.redhat.com/solutions/2115401
    ' SMBus base address uninitialized - upgrade BIOS or use ',
    ' SMBus Host Controller not enabled!',
    # Fast TSC calibration is unavailable; the kernel falls back to PIT/HPET-based
    # calibration
    ' tsc: Fast TSC calibration failed',
    # KVM: guest reads an unhandled MSR, https://access.redhat.com/solutions/59299
    ' unhandled rdmsr: ',
    # KVM: guest writes an unhandled MSR, https://bugzilla.redhat.com/show_bug.cgi?id=874627
    ' unhandled wrmsr: ',
    # KVM: guest perfctr writes are blocked, https://access.redhat.com/solutions/2188061
    ' vcpu0 disabled perfctr wrmsr',
    # RHEL marks the driver as deprecated/unmaintained for the next major release;
    # informational only
    ' Warning: Deprecated Driver is detected',
    ' Warning: Unmaintained driver is detected',
]


def parse_args():
    """Set up the argparse parser for this check and return the parsed namespace.

    Arguments the parser does not know are tolerated and silently dropped, because
    `parse_known_args()` is used instead of `parse_args()`.
    """
    cli = argparse.ArgumentParser(description=DESCRIPTION)

    cli.add_argument(
        '-V',
        '--version',
        version=f'%(prog)s: v{__version__} by {__author__}',
        action='version',
    )

    cli.add_argument(
        '--always-ok',
        action='store_true',
        default=False,
        dest='ALWAYS_OK',
        help=lib.args.help('--always-ok'),
    )

    # `--ignore` is an append parameter and therefore defaults to `None`. main() falls
    # back to `DEFAULT_IGNORE` only if the option was never given, which means that a
    # single `--ignore` replaces the bundled list instead of extending it. This follows
    # the convention documented in CONTRIBUTING.md and allows admins to maintain their
    # very own ignore list without inheriting the defaults.
    ignore_help = (
        'Ignore a kernel message matching this Python regular expression. '
        'Can be specified multiple times. '
        'Specifying this parameter replaces the bundled default ignore list. '
        'Example: `--ignore="^.* unhandled (rd|wr)msr: "`.'
    )
    cli.add_argument(
        '--ignore',
        action='append',
        default=None,
        dest='IGNORE',
        help=ignore_help,
    )

    # `--severity` is accepted but hidden from the help output (SUPPRESS), so that
    # existing service templates which still pass it keep working. Its value is not
    # evaluated anymore: err-level kernel ring buffer messages are no meaningful
    # "warning" in a server-hosting context, the plugin always alerts as CRIT.
    cli.add_argument(
        '--severity',
        dest='SEVERITY',
        help=argparse.SUPPRESS,
    )

    cli.add_argument(
        '--test',
        dest='TEST',
        help=lib.args.help('--test'),
        type=lib.args.csv,
    )

    known_args, _unknown_args = cli.parse_known_args()
    return known_args


def main():
    """Run the check: read dmesg, drop the ignored lines and report what is left.

    If at least one kernel message remains after filtering, STATE_CRIT is handed to
    `lib.base.oao()`, otherwise STATE_OK. The number of remaining messages is emitted
    as the `errors` perfdata.
    """

    # parse the command line; whenever argparse bails out, leave with UNKNOWN
    try:
        args = parse_args()
    except SystemExit:
        sys.exit(STATE_UNKNOWN)

    # without any --ignore on the command line the bundled default list is used
    ignore_list = DEFAULT_IGNORE if args.IGNORE is None else args.IGNORE

    # compile the ignore patterns (coe is applied per item, so that a broken regex is
    # reported for exactly that pattern)
    regexes = [
        lib.base.coe(lib.txt.compile_regex(pattern, key='--ignore'))
        for pattern in ignore_list
    ]

    def is_ignored(line):
        """Return True if at least one ignore pattern is found in `line`."""
        return any(regex.search(line) for regex in regexes)

    # fetch data, either from the test fixture or from the live kernel ring buffer
    if args.TEST is not None:
        stdout, _, _ = lib.lftest.test(args.TEST)
    else:
        stdout, stderr, retc = lib.base.coe(
            lib.shell.shell_exec('dmesg --level=emerg,alert,crit,err --ctime'),
        )
        if retc != 0 or stderr:
            lib.base.cu(stderr)

    # analyze data: skip empty lines, then drop everything that matches an ignore pattern
    errors = [
        line for line in stdout.strip().split('\n') if line and not is_ignored(line)
    ]
    cnt = len(errors)

    # build the message
    if errors:
        # large outputs are cut down to the first 5 and the last 5 lines
        if cnt > 10:
            shown = errors[:5] + ['...'] + errors[-5:]
        else:
            shown = errors
        headline = f'{cnt} {lib.txt.pluralize("error", cnt)} in Kernel Ring Buffer.\n\n'
        msg = headline + '\n'.join(shown)
        state = STATE_CRIT
    else:
        msg = 'Everything is ok.'
        state = STATE_OK

    # build perfdata
    perfdata = ''
    perfdata += lib.base.get_perfdata('errors', cnt, _min=0)

    # over and out
    lib.base.oao(msg, state, perfdata, always_ok=args.ALWAYS_OK)


# Script entry point: the check only runs when the file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    try:
        main()
    except Exception:
        # Last-resort handler for anything main() did not deal with itself.
        # SystemExit does not derive from Exception, so the regular exits
        # triggered inside main() pass through untouched.
        # NOTE(review): lib.base.cu() presumably reports the active exception
        # and exits with UNKNOWN - confirm in lib.base.
        lib.base.cu()
