#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author:  Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
#          https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.

# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.md

"""See the check's README for more details."""

import argparse
import sys
from types import SimpleNamespace

import lib.args
import lib.base
import lib.db_sqlite
import lib.human
import lib.time
import lib.version
from lib.globals import STATE_CRIT, STATE_OK, STATE_UNKNOWN, STATE_WARN

try:
    import psutil
except ImportError:
    print('Python module "psutil" is not installed.')
    sys.exit(STATE_UNKNOWN)


__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
__version__ = '2026050801'

DESCRIPTION = """Reports CPU utilization percentages for all available time categories
(user, system, idle, nice, iowait, irq, softirq, steal, guest, guest_nice) plus the overall
cpu-usage (100 - idle - nice).

Thresholds (WARN/CRIT) are checked against user, system, iowait, and cpu-usage. An alert is raised
only if the threshold is exceeded for COUNT consecutive runs, suppressing short spikes and focusing
on sustained load.

Perfdata is emitted for every field to enable full graphing. Extended stats (context switches,
interrupts, etc.) are included if supported on this platform.

This check is cross-platform and works on Linux, Windows, and all psutil-supported systems.
The check stores its short trend state locally in an SQLite DB to evaluate sustained load across
runs."""


# Number of consecutive over-threshold samples needed before alerting.
DEFAULT_COUNT = (
    5  # measurements; if check runs once per minute, this is a 5 minute interval
)
# Thresholds apply to user, system, iowait and cpu-usage (see DESCRIPTION).
DEFAULT_CRIT = 90  # %
DEFAULT_WARN = 80  # %


def parse_args():
    """Build the argparse parser and return the parsed command line options."""
    argp = argparse.ArgumentParser(description=DESCRIPTION)

    argp.add_argument(
        '-V', '--version',
        action='version',
        version=f'%(prog)s: v{__version__} by {__author__}',
    )

    argp.add_argument(
        '--always-ok',
        action='store_true',
        default=False,
        dest='ALWAYS_OK',
        help=lib.args.help('--always-ok'),
    )

    argp.add_argument(
        '--count',
        default=DEFAULT_COUNT,
        dest='COUNT',
        help=lib.args.help('--count') + ' Default: %(default)s',
        type=int,
    )

    argp.add_argument(
        '-c', '--critical',
        default=DEFAULT_CRIT,
        dest='CRIT',
        help=lib.args.help('--critical') + ' Default: >= %(default)s',
        type=int,
    )

    argp.add_argument(
        '-w', '--warning',
        default=DEFAULT_WARN,
        dest='WARN',
        help=lib.args.help('--warning') + ' Default: >= %(default)s',
        type=int,
    )

    # parse_known_args() tolerates unknown options instead of aborting
    options, _ = argp.parse_known_args()
    return options


def _cpu_times_to_dict(ct):
    """Return a dict of cpu_times fields present on this platform."""
    # psutil returns a namedtuple; only keep fields that exist on this OS
    fields = (
        'user',
        'nice',
        'system',
        'idle',
        'iowait',
        'irq',
        'softirq',
        'steal',
        'guest',
        'guest_nice',
    )
    return {f: getattr(ct, f, 0.0) for f in fields}


# * Store a single last raw snapshot of psutil.cpu_times() (cumulative jiffy/seconds counters
#   since boot) + a timestamp in the existing SQLite file.
# * On the next run, read the last snapshot, compute deltas for each field, and turn those
#   into percentages.
# * If there's no prior snapshot (first run, or DB got cleaned), fall back to a short
#   blocking sample (e.g., interval=0.25) so output stays sane.
# * Continue to store the trend rows like in older versions of this plugin.
# This is how psutil computes internally for interval>0, just that we do it across runs instead
# of sleeping within a run.


def cpu_times_percent_nonblocking(conn):
    """Non-blocking replacement for psutil.cpu_times_percent(percpu=False).

    Percentages are derived from the delta between the raw cumulative CPU
    counters stored by the previous run and the current ones. Only the very
    first run (or a run after the DB got cleaned) falls back to a short
    0.25s blocking sample.
    """
    # one-row snapshot table for the raw cumulative counters; create if missing
    definition = """
        ts REAL NOT NULL,
        user REAL DEFAULT 0,
        nice REAL DEFAULT 0,
        system REAL DEFAULT 0,
        idle REAL DEFAULT 0,
        iowait REAL DEFAULT 0,
        irq REAL DEFAULT 0,
        softirq REAL DEFAULT 0,
        steal REAL DEFAULT 0,
        guest REAL DEFAULT 0,
        guest_nice REAL DEFAULT 0
    """
    lib.base.coe(lib.db_sqlite.create_table(conn, definition, table='raw_last'))

    # fetch the snapshot left behind by the previous run, if any
    # (assumes the select wrapper yields a dict-like row with .get())
    previous = lib.base.coe(
        lib.db_sqlite.select(
            conn,
            'SELECT * FROM raw_last LIMIT 1',
            fetchone=True,
        )
    )

    current = _cpu_times_to_dict(psutil.cpu_times())
    timestamp = lib.time.now()

    if not previous:
        # first run: remember the raw counters, then take a short blocking
        # sample so that this run still produces sane output without the
        # 1.25s stall of earlier plugin versions
        lib.base.coe(lib.db_sqlite.insert(conn, {'ts': timestamp, **current}, table='raw_last'))
        lib.base.coe(lib.db_sqlite.commit(conn))
        return psutil.cpu_times_percent(interval=0.25, percpu=False)

    # counters may stall or jump backwards on some platforms; clip to zero
    diff = {
        key: max(0.0, value - float(previous.get(key, 0.0)))
        for key, value in current.items()
    }
    elapsed = sum(diff.values())

    # replace the snapshot so the next run diffs against this one
    lib.base.coe(lib.db_sqlite.delete(conn, 'DELETE FROM raw_last WHERE 1=1'))
    lib.base.coe(lib.db_sqlite.insert(conn, {'ts': timestamp, **current}, table='raw_last'))
    lib.base.coe(lib.db_sqlite.commit(conn))

    # practically no CPU time elapsed: fall back to a tiny blocking sample
    if elapsed <= 0.0:
        return psutil.cpu_times_percent(interval=0.25, percpu=False)

    # expose the percentages through a namedtuple-like object
    result = SimpleNamespace()
    for key, value in diff.items():
        setattr(result, key, round((value / elapsed) * 100.0, 1))
    return result


def get_from_db(conn, threshold):
    """Count perfdata rows in which any watched CPU field exceeds `threshold`.

    Args:
        conn: SQLite connection object.
        threshold (int|float): Threshold value to compare against.

    Returns:
        int: Number of stored rows where user, system, iowait or cpu_usage
        lies above the threshold.
    """
    sql = """
        SELECT count(*) as cnt
        FROM perfdata
        WHERE user > :user
           or system > :system
           or iowait > :iowait
           or cpu_usage > :cpu_usage
        """
    # every named parameter gets the same threshold value
    data = {key: threshold for key in ('user', 'system', 'iowait', 'cpu_usage')}
    row = lib.base.coe(lib.db_sqlite.select(conn, sql, data, fetchone=True))
    return int(row['cnt'])


def main():
    """The main function. This is where the magic happens.

    Flow: parse the command line, open the local SQLite trend DB, take a
    (mostly) non-blocking CPU sample, persist it, evaluate the WARN/CRIT
    thresholds over the last COUNT samples, and emit message plus perfdata
    via lib.base.oao().
    """

    # parse the command line
    try:
        args = parse_args()
    except SystemExit:
        # argparse exits on --help/--version/errors; map that to UNKNOWN
        sys.exit(STATE_UNKNOWN)

    # init some vars
    msg = ''
    perfdata = ''
    state = STATE_OK
    stats = {}      # CPU time-category percentages of this sample
    extstats = {}   # extended counters (ctx switches, interrupts, ...)

    # create the db table (trend rows, one per check run)
    definition = """
        guest REAL DEFAULT NULL,
        guest_nice REAL DEFAULT NULL,
        idle REAL DEFAULT NULL,
        iowait REAL DEFAULT NULL,
        irq REAL DEFAULT NULL,
        nice REAL DEFAULT NULL,
        softirq REAL DEFAULT NULL,
        steal REAL DEFAULT NULL,
        system REAL DEFAULT NULL,
        user REAL DEFAULT NULL,
        cpu_usage REAL NOT NULL
    """
    conn = lib.base.coe(
        lib.db_sqlite.connect(
            filename='linuxfabrik-monitoring-plugins-cpu-usage.db',
        )
    )
    lib.base.coe(lib.db_sqlite.create_table(conn, definition))

    # Best-effort: reduce IO stalls and file locking on Windows without changing outputs
    # (Ignore errors if the underlying sqlite wrapper/driver doesn't expose execute)
    try:
        conn.execute('PRAGMA journal_mode=WAL')
        conn.execute('PRAGMA synchronous=NORMAL')
    except Exception:
        pass

    # Grab CPU stats using psutil's cpu_times_percent
    # https://github.com/Linuxfabrik/monitoring-plugins/issues/57: changed from 0.25 to 1.25
    try:
        # OLD (blocking; previous versions of this plugin):
        # cpu_times_percent = psutil.cpu_times_percent(interval=1.25, percpu=False)

        # NEW (non-blocking, with first-run 0.25s fallback):
        cpu_times_percent = cpu_times_percent_nonblocking(conn)
    except ValueError:
        lib.db_sqlite.close(conn)
        lib.base.cu('psutil raised an error')

    # copy every time category; fields missing on this platform default to 0
    stats['guest'] = getattr(cpu_times_percent, 'guest', 0)
    stats['guest_nice'] = getattr(cpu_times_percent, 'guest_nice', 0)
    stats['idle'] = getattr(cpu_times_percent, 'idle', 0)
    stats['iowait'] = getattr(cpu_times_percent, 'iowait', 0)
    stats['irq'] = getattr(cpu_times_percent, 'irq', 0)
    stats['nice'] = getattr(cpu_times_percent, 'nice', 0)
    stats['softirq'] = getattr(cpu_times_percent, 'softirq', 0)
    stats['steal'] = getattr(cpu_times_percent, 'steal', 0)
    stats['system'] = getattr(cpu_times_percent, 'system', 0)
    stats['user'] = getattr(cpu_times_percent, 'user', 0)

    # Guard against bogus all-zero samples (#626).
    #
    # psutil.cpu_times_percent() can return 0% for ALL fields (including idle)
    # when the cumulative CPU time counters do not change between two samples.
    # This happens on some Windows systems with many cores (64+, multiple
    # processor groups), where the underlying GetSystemTimes() counters
    # occasionally stall or go backwards. psutil clips negative deltas to zero
    # (see psutil issues #392, #645, #1210), which can result in a total delta
    # of zero. In that case psutil returns 0% for every field.
    #
    # Without this guard, our formula "100 - idle(0) - nice(0)" would
    # incorrectly report 100% CPU usage. We detect this physically impossible
    # state (some CPU time MUST pass) and skip the sample entirely, so no
    # bogus data is stored or alerted on.
    if stats['idle'] == 0 and stats['user'] == 0 and stats['system'] == 0:
        lib.db_sqlite.close(conn)
        lib.base.oao(
            'Waiting for more data (got an all-zero CPU sample, skipping).',
            STATE_OK,
            always_ok=args.ALWAYS_OK,
        )

    # this is what we want to warn about: 100% - idle - nice
    stats['cpu_usage'] = round(100.0 - stats['idle'] - stats['nice'], 1)

    # save trend data to local sqlite database, limited to "count" rows max.
    lib.base.coe(lib.db_sqlite.insert(conn, stats))
    lib.base.coe(lib.db_sqlite.cut(conn, _max=args.COUNT))
    lib.base.coe(lib.db_sqlite.commit(conn))

    # Additional CPU stats (number of events not as %; psutil>=4.1.0)
    # ctx_switches: number of context switches (voluntary + involuntary) since boot
    # interrupts: number of interrupts since boot
    # soft_interrupts: number of software interrupts since boot. Always set to 0 on Windows and
    # SunOS.
    # syscalls: number of system calls since boot. Always set to 0 on Linux.
    if lib.version.version(psutil.__version__) >= lib.version.version('4.1.0'):
        cpu_stats = psutil.cpu_stats()
        extstats['ctx_switches'] = getattr(cpu_stats, 'ctx_switches', 0)
        extstats['interrupts'] = getattr(cpu_stats, 'interrupts', 0)
        extstats['soft_interrupts'] = getattr(cpu_stats, 'soft_interrupts', 0)

    # this is for msg and perfdata
    # NOTE(review): adding `nice` back onto cpu_usage (= 100 - idle - nice)
    # makes the displayed value effectively 100 - idle, while the stored and
    # threshold-checked cpu_usage excludes nice — confirm this is intended
    cpu_usage = stats['cpu_usage'] + getattr(cpu_times_percent, 'nice', 0)

    # for the msg, sort by highest value, but without the cpu_usage sum
    del stats['cpu_usage']
    stats = lib.base.sort(stats, reverse=True)

    # now, calculate the WARN or CRIT.
    # overall state is not ok, if ...
    # in a row in any column there is a value above the threshold
    # and this is true for every row
    if get_from_db(conn, args.CRIT) == args.COUNT:
        state = STATE_CRIT
    elif get_from_db(conn, args.WARN) == args.COUNT:
        state = STATE_WARN

    lib.db_sqlite.close(conn)

    # build the message
    perfdata += lib.base.get_perfdata(
        'cpu-usage',
        cpu_usage,
        uom='%',
        warn=args.WARN,
        crit=args.CRIT,
        _min=0,
        _max=100,
    )

    msg_header = []  # for values > 0%
    msg_body = []  # for values == 0%
    for key, val in stats:
        # idle is implied by the cpu-usage sum, so keep it out of the message
        if key == 'idle':
            continue
        part = f'{key}: {val:.1f}%'
        if val != 0:
            msg_header.append(part)
        else:
            msg_body.append(part)
        perfdata += lib.base.get_perfdata(
            key,
            val,
            uom='%',
            warn=None,
            crit=None,
            _min=0,
            _max=100,
        )
    msg = f'{cpu_usage:.1f}%'
    if msg_header:
        msg += ' - ' + ', '.join(msg_header)
    if msg_body:
        # zero-valued categories go to a second line to keep line one short
        msg += '\n' + ', '.join(msg_body)

    if extstats:
        ext_parts = []
        for key, val in extstats.items():
            ext_parts.append(f'{key}: {lib.human.number2human(val)}')
            # uom 'c' marks these as ever-increasing counters since boot
            perfdata += lib.base.get_perfdata(
                key,
                val,
                uom='c',
                _min=0,
            )
        msg += '\n' + ', '.join(ext_parts)

    # over and out
    lib.base.oao(msg, state, perfdata, always_ok=args.ALWAYS_OK)


if __name__ == '__main__':
    try:
        main()
    except Exception:
        # top-level catch-all; lib.base.cu() presumably reports the failure
        # and exits non-OK (project convention) — see lib.base for details
        lib.base.cu()