#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author:  Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
#          https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.

# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.md

"""See the check's README for more details."""

import argparse
import json
import sys

import lib.args
import lib.base
import lib.db_sqlite
import lib.lftest
import lib.shell
from lib.globals import STATE_CRIT, STATE_OK, STATE_UNKNOWN, STATE_WARN

__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
__version__ = '2026041002'

# Text handed to argparse as the program description.
DESCRIPTION = """Reports CPU and memory usage for all running Podman containers. CPU usage is
normalized by dividing by the number of available host CPU cores. CPU alerts only
trigger after the threshold has been exceeded for a configurable number of consecutive
check runs (default: 5), suppressing short spikes. Memory alerts trigger immediately.
Uses a local SQLite database for CPU trend tracking across runs. For Docker, use the
docker-stats check instead.
Requires root or sudo."""

# Number of measurements taken into account for the CPU alert. If the check
# runs once per minute, this corresponds to a 5 minute period.
DEFAULT_COUNT = 5

# Default thresholds, all expressed in percent.
DEFAULT_WARN_CPU = 80
DEFAULT_CRIT_CPU = 90
DEFAULT_WARN_MEM = 90
DEFAULT_CRIT_MEM = 95


def parse_args():
    """Parse command line arguments using argparse.

    The four threshold options (`--warning-cpu`, `--critical-cpu`,
    `--warning-mem`, `--critical-mem`) are converted with `type=float`, so a
    non-numeric value is rejected by argparse right away, in the same way
    `--count` is already validated with `type=int`. Their defaults are left
    untouched.

    Returns:
        The argparse namespace with the attributes ALWAYS_OK, COUNT, CRIT_CPU,
        CRIT_MEM, FULL_NAME, TEST, WARN_CPU and WARN_MEM. Command line
        arguments that are not known to the parser are discarded.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)

    parser.add_argument(
        '-V',
        '--version',
        action='version',
        version=f'%(prog)s: v{__version__} by {__author__}',
    )

    parser.add_argument(
        '--always-ok',
        help=lib.args.help('--always-ok'),
        dest='ALWAYS_OK',
        action='store_true',
        default=False,
    )

    parser.add_argument(
        '--count',
        help=lib.args.help('--count') + ' Default: %(default)s',
        dest='COUNT',
        type=int,
        default=DEFAULT_COUNT,
    )

    parser.add_argument(
        '--critical-cpu',
        help='CRIT threshold for CPU usage, in percent. Default: >= %(default)s',
        default=DEFAULT_CRIT_CPU,
        dest='CRIT_CPU',
        type=float,
    )

    parser.add_argument(
        '--critical-mem',
        help='CRIT threshold for memory usage, in percent. Default: >= %(default)s',
        default=DEFAULT_CRIT_MEM,
        dest='CRIT_MEM',
        type=float,
    )

    parser.add_argument(
        '--full-name',
        help='Use the full container name instead of shortening it after the replica number. '
        'Example: `traefik_traefik.2.1idw12p2yqp`',
        dest='FULL_NAME',
        action='store_true',
        default=False,
    )

    parser.add_argument(
        '--test',
        help=lib.args.help('--test'),
        dest='TEST',
        type=lib.args.csv,
    )

    parser.add_argument(
        '--warning-cpu',
        help='WARN threshold for CPU usage, in percent. Default: >= %(default)s',
        default=DEFAULT_WARN_CPU,
        dest='WARN_CPU',
        type=float,
    )

    parser.add_argument(
        '--warning-mem',
        help='WARN threshold for memory usage, in percent. Default: >= %(default)s',
        default=DEFAULT_WARN_MEM,
        dest='WARN_MEM',
        type=float,
    )

    # parse_known_args() tolerates unknown arguments; the leftovers are dropped
    args, _ = parser.parse_known_args()
    return args


def get_cpu_from_db(conn, container, threshold):
    """Count the stored CPU samples of a container that reach a threshold.

    Runs a `count(*)` over the `cpu` table, restricted to rows whose
    `container` column equals `container` and whose `cpu_usage` is greater
    than or equal to `threshold`. Both values are passed as named bind
    parameters. The select result goes through `lib.base.coe()` before the
    count is read from it.

    Returns:
        The number of matching rows as an int.
    """
    query = """
        SELECT count(*) as cnt
        FROM cpu
        WHERE container = :container and cpu_usage >= :threshold
        """
    bindings = {'container': container, 'threshold': threshold}
    row = lib.base.coe(
        lib.db_sqlite.select(conn, query, bindings, fetchone=True),
    )
    return int(row['cnt'])


def shorten(name):
    """Drop everything from the last dot onwards in a container name.

    A name that contains no dot is returned unchanged.

    >>> shorten('traefik_traefik.2.1idw12p2yqpxutlzkcwign4at')
    'traefik_traefik.2'
    >>> shorten('nginx')
    'nginx'
    """
    head, dot, _ = name.rpartition('.')
    return head if dot else name


def _parse_containers(stdout):
    """Turn newline-delimited JSON (one object per line) into a list of dicts.

    Blank lines, lines that are not valid JSON and JSON values that are not
    objects are skipped. The list is sorted by the `Name` key; a missing or
    null name sorts like an empty string.
    """
    containers = []
    for line in stdout.strip().splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            item = json.loads(line)
        except ValueError:  # json.JSONDecodeError is a subclass of ValueError
            continue
        # anything but a JSON object cannot be queried with .get() later on
        if isinstance(item, dict):
            containers.append(item)
    containers.sort(key=lambda c: c.get('Name') or '')
    return containers


def main():
    """Run the check: collect podman statistics, evaluate them and report.

    Steps:

    1. Parse the command line; a SystemExit raised by the parser is turned
       into an exit with STATE_UNKNOWN.
    2. Open the SQLite database and make sure the `cpu` table and its index
       on `container` exist.
    3. Unless `--test` is given, run `podman info` (host CPUs, image count,
       total memory) and `podman stats` (per-container statistics). A
       non-zero return code of either command ends the check with
       STATE_CRIT. With `--test`, the data comes from `lib.lftest.test()`
       and the host values are fixed to 1 CPU, 0 images and 0 bytes of RAM.
    4. For each container: divide the CPU usage by the number of host CPUs,
       store it in the database, and derive a CPU state from the stored rows
       and a memory state from the current memory percentage.
    5. Print the message, the table and the perfdata via `lib.base.oao()`.
    """

    # parse the command line
    try:
        args = parse_args()
    except SystemExit:
        sys.exit(STATE_UNKNOWN)

    # create the db tables
    definition = """
                container TEXT NOT NULL,
                cpu_usage REAL NOT NULL
        """
    conn = lib.base.coe(
        lib.db_sqlite.connect(
            filename='linuxfabrik-monitoring-plugins-podman-stats.db'
        ),
    )
    lib.base.coe(lib.db_sqlite.create_table(conn, definition, table='cpu'))
    lib.db_sqlite.create_index(conn, 'container', table='cpu')

    # fetch data
    if args.TEST is None:
        # get the number of host CPUs
        stdout, stderr, retc = lib.base.coe(
            lib.shell.shell_exec('podman info --format json'),
        )
        if retc != 0:
            lib.db_sqlite.close(conn)
            lib.base.oao(f'{stderr}\n{stdout}', STATE_CRIT)
        try:
            podman_info = json.loads(stdout)
            host_cpus = podman_info['host']['cpus']
            host_images = podman_info['store']['imageStore']['number']
            host_ram = podman_info['host']['memTotal']
        except Exception:
            lib.db_sqlite.close(conn)
            lib.base.cu('Unable to parse podman info output as JSON.')

        # get the container statistics for all running containers
        stdout, stderr, retc = lib.base.coe(
            lib.shell.shell_exec("podman stats --no-stream --format '{{json .}}'"),
        )
        if retc != 0:
            lib.db_sqlite.close(conn)
            lib.base.oao(stderr, STATE_CRIT)
    else:
        # do not call the command, put in test data
        host_cpus = 1
        host_images = 0
        host_ram = 0
        stdout, stderr, retc = lib.lftest.test(args.TEST)

    # init some vars
    alerts = []  # one entry per container/resource that is not OK
    state = STATE_OK
    perfdata = ''
    table_values = []
    total_block_input = 0
    total_block_output = 0
    total_net_rx = 0
    total_net_tx = 0

    # parse newline-delimited JSON output, sorted by container name
    containers = _parse_containers(stdout)

    # upper bound of rows kept in the trend table; does not change during the loop
    max_rows = args.COUNT * len(containers)

    # analyze data
    for container in containers:
        name = container.get('Name', '')
        if not name or name == '--':
            continue

        if not args.FULL_NAME:
            name = shorten(name)

        # divide by number of cores (got by podman info)
        cpu_usage = round(float(container.get('CPU', 0)) / host_cpus, 1)
        mem_usage = round(float(container.get('MemPerc', 0)), 1)

        # accumulate totals for aggregate perfdata
        total_block_input += int(container.get('BlockInput', 0))
        total_block_output += int(container.get('BlockOutput', 0))
        # "Network" may be present but null, so `.get(key, {})` is not enough
        network = container.get('Network') or {}
        for iface in network.values():
            total_net_rx += int(iface.get('RxBytes', 0))
            total_net_tx += int(iface.get('TxBytes', 0))

        # save trend data to local sqlite database, limited to "count" rows max.
        lib.base.coe(
            lib.db_sqlite.insert(
                conn, {'container': name, 'cpu_usage': cpu_usage}, table='cpu'
            ),
        )
        lib.base.coe(lib.db_sqlite.cut(conn, _max=max_rows, table='cpu'))

        # alert when container cpu_usage is exceeded
        # my container state is not ok, if in every of my historic rows the cpu value
        # is above the threshold
        # NOTE(review): the comparison is an exact match against COUNT. If a
        # container can ever own more than COUNT rows (the table is capped in
        # total, not per container), no alert is raised - confirm with
        # lib.db_sqlite.cut().
        if get_cpu_from_db(conn, name, args.CRIT_CPU) == args.COUNT:
            cpu_state = STATE_CRIT
        elif get_cpu_from_db(conn, name, args.WARN_CPU) == args.COUNT:
            cpu_state = STATE_WARN
        else:
            cpu_state = STATE_OK
        if cpu_state != STATE_OK:
            alerts.append(
                f'"{name}" cpu {cpu_usage}% {lib.base.state2str(cpu_state)}'
            )
        state = lib.base.get_worst(cpu_state, state)

        # alert when container mem_usage is exceeded
        mem_state = lib.base.get_state(mem_usage, args.WARN_MEM, args.CRIT_MEM)
        if mem_state != STATE_OK:
            alerts.append(
                f'"{name}" memory {mem_usage}% {lib.base.state2str(mem_state)}'
            )
        state = lib.base.get_worst(mem_state, state)

        table_values.append(
            {
                'name': name,
                'cpu_usage': f'{cpu_usage}{lib.base.state2str(cpu_state, prefix=" ")}',
                'mem_usage': f'{mem_usage}{lib.base.state2str(mem_state, prefix=" ")}',
            }
        )

    # we don't need the database any more: save data and close connection
    lib.db_sqlite.commit(conn)
    lib.db_sqlite.close(conn)

    # build perfdata
    perfdata += lib.base.get_perfdata('block_input', total_block_input, uom='B', _min=0)
    perfdata += lib.base.get_perfdata('block_output', total_block_output, uom='B', _min=0)
    perfdata += lib.base.get_perfdata('containers_running', len(table_values), _min=0)
    perfdata += lib.base.get_perfdata('cpu', host_cpus, _min=0)
    perfdata += lib.base.get_perfdata('images', host_images, _min=0)
    perfdata += lib.base.get_perfdata('net_rx', total_net_rx, uom='B', _min=0)
    perfdata += lib.base.get_perfdata('net_tx', total_net_tx, uom='B', _min=0)
    perfdata += lib.base.get_perfdata('ram', host_ram, uom='B', _min=0)

    # create output
    if state == STATE_OK:
        msg = f'Everything is ok, {len(table_values)} containers checked.\n\n'
    else:
        msg = ', '.join(alerts) + '\n\n'
    if table_values:
        msg += lib.base.get_table(
            table_values,
            ['name', 'cpu_usage', 'mem_usage'],
            header=['Container', 'CPU %', 'Mem % '],
        )

    # over and out
    lib.base.oao(msg, state, perfdata, always_ok=args.ALWAYS_OK)


if __name__ == '__main__':
    # Script entry point: run the check only when the file is executed
    # directly. Any exception escaping main() is handed to lib.base.cu()
    # (presumably it reports the error and exits with UNKNOWN - confirm in
    # lib.base).
    try:
        main()
    except Exception:
        lib.base.cu()
