#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author:  Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
#          https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.

# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.md

"""See the check's README for more details."""

import argparse
import sys

import lib.args
import lib.base
import lib.db_sqlite
import lib.lftest
import lib.shell
import lib.txt
from lib.globals import STATE_CRIT, STATE_OK, STATE_UNKNOWN, STATE_WARN

# Plugin metadata; both values are interpolated into the `--version` output
# built in parse_args().
__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
__version__ = '2026041002'

# Program description handed to argparse.ArgumentParser in parse_args().
DESCRIPTION = """Reports CPU and memory usage for all running Docker containers. CPU usage is
normalized by dividing by the number of available host CPU cores. CPU alerts only
trigger after the threshold has been exceeded for a configurable number of consecutive
check runs (default: 5), suppressing short spikes. Memory alerts trigger immediately.
Uses a local SQLite database for CPU trend tracking across runs. For Podman, use the
podman-stats check instead.
Requires root or sudo."""

# Number of measurements a CPU threshold has to be exceeded before alerting;
# if the check runs once per minute, this is a 5 minute period.
DEFAULT_COUNT = 5

# Default thresholds, all in percent.
DEFAULT_WARN_CPU = 80
DEFAULT_CRIT_CPU = 90
DEFAULT_WARN_MEM = 90
DEFAULT_CRIT_MEM = 95


def _positive_int(value):
    """argparse `type` callable that only accepts integers >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f'invalid int value: {value!r}') from None
    if number < 1:
        raise argparse.ArgumentTypeError(f'must be >= 1, got {number}')
    return number


def parse_args():
    """Parse command line arguments using argparse.

    Returns the parsed namespace. Unknown arguments are ignored
    (`parse_known_args`). All four thresholds are converted to float and
    `--count` must be a positive integer, so malformed values are rejected
    here by argparse instead of failing later while containers are evaluated.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)

    parser.add_argument(
        '-V',
        '--version',
        action='version',
        version=f'%(prog)s: v{__version__} by {__author__}',
    )

    parser.add_argument(
        '--always-ok',
        help=lib.args.help('--always-ok'),
        dest='ALWAYS_OK',
        action='store_true',
        default=False,
    )

    # A count below 1 would make the "all of the last COUNT rows exceed the
    # threshold" comparison in main() meaningless, so it is refused up front.
    parser.add_argument(
        '--count',
        help=lib.args.help('--count') + ' Default: %(default)s',
        dest='COUNT',
        type=_positive_int,
        default=DEFAULT_COUNT,
    )

    parser.add_argument(
        '--critical-cpu',
        help='CRIT threshold for CPU usage in percent. Default: >= %(default)s',
        default=DEFAULT_CRIT_CPU,
        dest='CRIT_CPU',
        type=float,
    )

    parser.add_argument(
        '--critical-mem',
        help='CRIT threshold for memory usage in percent. Default: %(default)s',
        default=DEFAULT_CRIT_MEM,
        dest='CRIT_MEM',
        type=float,
    )

    parser.add_argument(
        '--full-name',
        help='Use the full container name, for example `traefik_traefik.2.1idw12p2yqp`. '
        'Without this flag, the name is shortened after the replica number.',
        dest='FULL_NAME',
        action='store_true',
        default=False,
    )

    parser.add_argument(
        '--test',
        help=lib.args.help('--test'),
        dest='TEST',
        type=lib.args.csv,
    )

    parser.add_argument(
        '--warning-cpu',
        help='WARN threshold for CPU usage in percent. Default: >= %(default)s',
        default=DEFAULT_WARN_CPU,
        dest='WARN_CPU',
        type=float,
    )

    parser.add_argument(
        '--warning-mem',
        help='WARN threshold for memory usage in percent. Default: %(default)s',
        default=DEFAULT_WARN_MEM,
        dest='WARN_MEM',
        type=float,
    )

    args, _ = parser.parse_known_args()
    return args


def get_cpu_from_db(conn, container, threshold):
    """Count the stored CPU samples of one container that reach a threshold.

    Queries the `cpu` table through `conn` for rows belonging to `container`
    whose `cpu_usage` is greater than or equal to `threshold`, and returns
    that number as an int.
    """
    sql = """
        SELECT count(*) as cnt
        FROM cpu
        WHERE container = :container and cpu_usage >= :threshold
        """
    bindings = {'container': container, 'threshold': threshold}
    row = lib.base.coe(lib.db_sqlite.select(conn, sql, bindings, fetchone=True))
    return int(row['cnt'])


def shorten(name):
    """Strip the trailing task ID from a replicated container name.

    Names of the form `<service>.<replica>.<task-id>` are cut after the replica
    part. Names with fewer than two dots are returned unchanged, so an ordinary
    container called `app.web` is not truncated to `app` (which could make
    different containers share one name).

    >>> shorten('traefik_traefik.2.1idw12p2yqpxutlzkcwign4at')
    'traefik_traefik.2'
    >>> shorten('app.web')
    'app.web'
    >>> shorten('nginx')
    'nginx'
    """
    if name.count('.') < 2:
        return name
    return name.rsplit('.', 1)[0]


def _parse_host_cpus(docker_info):
    """Return the CPU count from a `CPUs: <n>` line of `docker info` output.

    Returns None if no such line exists or its value is not a positive integer.
    """
    for line in docker_info.splitlines():
        key, sep, value = line.partition(':')
        if not sep or key.strip() != 'CPUs':
            continue
        try:
            cpus = int(value.strip())
        except ValueError:
            return None
        # the count is used as a divisor, so zero or negative is unusable
        return cpus if cpus > 0 else None
    return None


def _parse_containers(rows, host_cpus, full_name):
    """Turn split `docker stats` rows into (name, cpu_usage, mem_usage) tuples.

    `rows` are lists of whitespace-separated columns; column 1 is the name,
    column 2 the CPU percentage and column 6 the memory percentage. The CPU
    value is divided by `host_cpus`. Rows that are too short, or whose
    percentages are not numeric (for example `--` placeholders), are skipped
    so that one odd row does not abort the whole check.
    """
    parsed = []
    for row in rows:
        try:
            if row[0] == '--':
                # container without ID and name == '--'
                continue
            name = row[1]
            # divide by number of cores (got by docker info)
            cpu_usage = round(float(row[2].replace('%', '').strip()) / host_cpus, 1)
            mem_usage = round(float(row[6].replace('%', '').strip()), 1)
        except (IndexError, ValueError):
            continue
        # https://github.com/Linuxfabrik/monitoring-plugins/issues/586
        if not full_name:
            name = shorten(name)
        parsed.append((name, cpu_usage, mem_usage))
    return parsed


def main():
    """The main function. This is where the magic happens.

    Collects `docker info` / `docker stats --no-stream` output (or test data
    when `--test` is given), stores each container's CPU usage in the local
    SQLite table `cpu`, derives a CPU state from the stored history and a
    memory state from the current value, and hands message, state and
    perfdata to `lib.base.oao`.
    """

    # parse the command line
    try:
        args = parse_args()
    except SystemExit:
        sys.exit(STATE_UNKNOWN)

    # create the db tables
    definition = """
                container TEXT NOT NULL,
                cpu_usage REAL NOT NULL
        """
    conn = lib.base.coe(
        lib.db_sqlite.connect(
            filename='linuxfabrik-monitoring-plugins-docker-stats.db'
        ),
    )
    lib.base.coe(lib.db_sqlite.create_table(conn, definition, table='cpu'))
    lib.db_sqlite.create_index(conn, 'container', table='cpu')

    # fetch data
    if args.TEST is None:
        # get the number of host CPUs
        stdout, stderr, retc = lib.base.coe(
            lib.shell.shell_exec('docker info'),
        )
        if retc != 0:
            lib.db_sqlite.close(conn)
            lib.base.oao(f'{stderr}\n{stdout}', STATE_CRIT)
        if 'server version:' not in stdout.lower():
            lib.db_sqlite.close(conn)
            lib.base.cu(
                'Unable to parse docker info output.'
                ' If you are using Podman, use the podman-stats check instead.'
            )
        host_cpus = _parse_host_cpus(stdout)
        if host_cpus is None:
            # fail with a clear message instead of parsing an arbitrary slice
            lib.db_sqlite.close(conn)
            lib.base.cu(
                'Unable to determine the number of host CPUs from docker info output.'
            )

        # get the container statistics for all running containers
        stdout, stderr, retc = lib.base.coe(
            lib.shell.shell_exec('docker stats --no-stream'),
        )
        if retc != 0:
            lib.db_sqlite.close(conn)
            lib.base.oao(stderr, STATE_CRIT)
    else:
        # do not call the command, put in test data
        host_cpus = 1
        stdout, stderr, retc = lib.lftest.test(args.TEST)

    # init some vars
    msg = ''
    state = STATE_OK
    table_values = []

    # analyze data
    containers = _parse_containers(
        lib.txt.mltext2array(stdout, skip_header=True, sort_key=1),
        host_cpus,
        args.FULL_NAME,
    )

    # Cap the table at "count" rows per container that is actually stored.
    # Skipped rows must not inflate the cap: otherwise a container keeps more
    # than "count" rows and the equality test below cannot match on sustained
    # load.
    max_rows = args.COUNT * len(containers)

    for name, cpu_usage, mem_usage in containers:
        # save trend data to local sqlite database, limited to "count" rows max.
        lib.base.coe(
            lib.db_sqlite.insert(
                conn, {'container': name, 'cpu_usage': cpu_usage}, table='cpu'
            ),
        )
        lib.base.coe(lib.db_sqlite.cut(conn, _max=max_rows, table='cpu'))

        # alert when container cpu_usage is exceeded
        # my container state is not ok, if in every of my historic rows the cpu value
        # is above the threshold
        if get_cpu_from_db(conn, name, args.CRIT_CPU) == args.COUNT:
            cpu_state = STATE_CRIT
        elif get_cpu_from_db(conn, name, args.WARN_CPU) == args.COUNT:
            cpu_state = STATE_WARN
        else:
            cpu_state = STATE_OK
        if cpu_state != STATE_OK:
            # build the message
            msg += f'"{name}" cpu {cpu_usage}% {lib.base.state2str(cpu_state)}, '
        state = lib.base.get_worst(cpu_state, state)

        # alert when container mem_usage is exceeded
        mem_state = lib.base.get_state(mem_usage, args.WARN_MEM, args.CRIT_MEM)
        if mem_state != STATE_OK:
            msg += f'"{name}" memory {mem_usage}% {lib.base.state2str(mem_state)}, '
        state = lib.base.get_worst(mem_state, state)

        table_values.append(
            {
                'name': name,
                'cpu_usage': f'{cpu_usage}{lib.base.state2str(cpu_state, prefix=" ")}',
                'mem_usage': f'{mem_usage}{lib.base.state2str(mem_state, prefix=" ")}',
            }
        )

    # we don't need the database any more: save data and close connection
    lib.db_sqlite.commit(conn)
    lib.db_sqlite.close(conn)

    # build perfdata
    perfdata = lib.base.get_perfdata(
        'containers_running',
        len(table_values),
        _min=0,
    )
    perfdata += lib.base.get_perfdata(
        'cpu',
        host_cpus,
        _min=0,
    )

    # create output
    if state == STATE_OK:
        msg = f'Everything is ok, {len(table_values)} containers checked.\n\n'
    else:
        # strip the trailing ", " left by the last appended finding
        msg = msg[:-2] + '\n\n'
    if table_values:
        msg += lib.base.get_table(
            table_values,
            ['name', 'cpu_usage', 'mem_usage'],
            header=['Container', 'CPU %', 'Mem % '],
        )

    # over and out
    lib.base.oao(msg, state, perfdata, always_ok=args.ALWAYS_OK)


# Script entry point: run the check only when executed directly, not on import.
if __name__ == '__main__':
    try:
        main()
    except Exception:
        # Last-resort handler for anything main() did not handle itself.
        # NOTE(review): lib.base.cu() presumably reports the error as UNKNOWN
        # and exits - confirm against lib.base.
        lib.base.cu()
