#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author:  Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
#          https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.

# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.md

"""See the check's README for more details.
"""

import argparse  # pylint: disable=C0413
import sys  # pylint: disable=C0413

import lib.args  # pylint: disable=C0413
import lib.base  # pylint: disable=C0413
import lib.db_sqlite  # pylint: disable=C0413
import lib.shell  # pylint: disable=C0413
import lib.lftest  # pylint: disable=C0413
import lib.txt  # pylint: disable=C0413
from lib.globals import (STATE_CRIT, STATE_OK,  # pylint: disable=C0413
                          STATE_UNKNOWN, STATE_WARN)

__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
__version__ = '2025022501'

# Text handed to argparse as the description of this check.
DESCRIPTION = """This check prints various statistics for all running Docker containers, in much the
                same way as the Unix application top, using the "docker stats" command."""

# Default values of the command line parameters.
# The count is a number of measurements: if the check runs once per minute,
# five measurements span a five minute period.
DEFAULT_COUNT = 5
# The thresholds below are percentages.
DEFAULT_WARN_CPU = 80
DEFAULT_CRIT_CPU = 90
DEFAULT_WARN_MEM = 90
DEFAULT_CRIT_MEM = 95


def parse_args():
    """Declare the command line options of this check and parse the command line.

    The options are kept in one table and registered in table order, which is
    also the order in which argparse lists them in `--help`.

    Note that the four threshold options carry no `type=`: a value given on
    the command line arrives as a string, while an untouched default stays the
    integer constant.

    Returns the namespace produced by `argparse.ArgumentParser.parse_args()`.
    """
    cli = argparse.ArgumentParser(description=DESCRIPTION)

    version_text = '%(prog)s: v{} by {}'.format(__version__, __author__)

    # (option strings, keyword arguments for add_argument)
    option_table = (
        (('-V', '--version'), {
            'action': 'version',
            'version': version_text,
        }),
        (('--always-ok',), {
            'dest': 'ALWAYS_OK',
            'action': 'store_true',
            'default': False,
            'help': 'Always returns OK.',
        }),
        (('--count',), {
            'dest': 'COUNT',
            'type': int,
            'default': DEFAULT_COUNT,
            'help': 'Number of times the value must exceed specified thresholds before alerting. '
                    'Default: %(default)s',
        }),
        (('--critical-cpu',), {
            'dest': 'CRIT_CPU',
            'default': DEFAULT_CRIT_CPU,
            'help': 'Set the critical threshold CPU Usage Percentage. Default: %(default)s',
        }),
        (('--critical-mem',), {
            'dest': 'CRIT_MEM',
            'default': DEFAULT_CRIT_MEM,
            'help': 'Set the critical threshold Memory Usage Percentage. Default: %(default)s',
        }),
        (('--full-name',), {
            'dest': 'FULL_NAME',
            'action': 'store_true',
            'default': False,
            'help': 'Use full container name, for example `traefik_traefik.2.1idw12p2yqp`. If ommitted, '
                    'the name will be shortened after the replica number (default behaviour).',
        }),
        (('--test',), {
            'dest': 'TEST',
            'type': lib.args.csv,
            'help': 'For unit tests. Needs "path-to-stdout-file,path-to-stderr-file,expected-retc".',
        }),
        (('--warning-cpu',), {
            'dest': 'WARN_CPU',
            'default': DEFAULT_WARN_CPU,
            'help': 'Set the warning threshold CPU Usage Percentage. Default: %(default)s',
        }),
        (('--warning-mem',), {
            'dest': 'WARN_MEM',
            'default': DEFAULT_WARN_MEM,
            'help': 'Set the warning threshold Memory Usage Percentage. Default: %(default)s',
        }),
    )
    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)

    return cli.parse_args()


def get_cpu_from_db(conn, container, threshold):
    """Count the stored CPU samples of one container that reach a threshold.

    Queries the `cpu` table for rows whose `container` column equals
    `container` and whose `cpu_usage` is greater than or equal to `threshold`.

    Parameters:
        conn: database connection as passed on to `lib.db_sqlite.select()`.
        container: container name to filter on.
        threshold: lower bound (inclusive) for `cpu_usage`.

    Returns the number of matching rows as an int.
    """
    sql = '''
        SELECT count(*) as cnt
        FROM cpu
        WHERE container = :container and cpu_usage >= :threshold
        '''
    bindings = {'container': container, 'threshold': threshold}
    # NOTE(review): lib.base.coe() presumably unwraps the select result or
    # terminates the check on failure - confirm in lib.base.
    row = lib.base.coe(lib.db_sqlite.select(conn, sql, bindings, fetchone=True))
    return int(row['cnt'])


def shorten(name):
    """Return `name` without its last dot-separated segment.

    Everything from the last `.` onwards is dropped; a name that contains no
    dot at all is returned unchanged.

    >>> shorten('traefik_traefik.2.1idw12p2yqpxutlzkcwign4at')
    'traefik_traefik.2'
    >>> shorten('graylog')
    'graylog'
    """
    head, dot, _tail = name.rpartition('.')
    if not dot:
        # no separator found, nothing to cut off
        return name
    return head


def main():
    """The main function. This is where the action is.

    Steps, in order:

    1. Parse the command line; a failing parse exits with STATE_UNKNOWN.
    2. Open the local SQLite database and make sure the `cpu` table (columns
       `container`, `cpu_usage`) and its index exist.
    3. Fetch the data: either run `docker info` (to read the number of CPUs)
       and `docker stats --no-stream`, or load canned output when `--test`
       is given (the CPU count is then fixed to 1).
    4. For every container row: divide the CPU percentage by the number of
       CPUs, store it in the database, derive the CPU state from the stored
       history and the memory state from the current memory percentage, and
       collect perfdata and table rows.
    5. Commit and close the database, build the message and hand message,
       state and perfdata to `lib.base.oao()`.
    """

    # parse the command line, exit with UNKNOWN if it fails
    try:
        args = parse_args()
    except SystemExit:
        sys.exit(STATE_UNKNOWN)

    # create the db tables
    definition = '''
                container TEXT NOT NULL,
                cpu_usage REAL NOT NULL
        '''
    conn = lib.base.coe(
        lib.db_sqlite.connect(filename='linuxfabrik-monitoring-plugins-docker-stats.db'),
    )
    lib.base.coe(lib.db_sqlite.create_table(conn, definition, table='cpu'))
    lib.db_sqlite.create_index(conn, 'container', table='cpu')

    # fetch data
    if args.TEST is None:
        # we need cpu and memory from docker perspective
        cmd = 'docker info'
        stdout, stderr, retc = lib.base.coe(lib.shell.shell_exec(cmd, shell=True))
        if retc != 0:
            lib.base.oao('{}\n{}'.format(stderr, stdout), STATE_CRIT)
        try:
            # take the text between "CPUs: " and the end of that line
            strpos1 = stdout.find('CPUs: ') + 6
            strpos2 = stdout.find('\n', strpos1)
            host_cpus = int(stdout[strpos1:strpos2])
        except (TypeError, ValueError):
            # only conversion problems are expected here; a bare except would
            # also swallow SystemExit and KeyboardInterrupt
            lib.base.cu('Unable to compute docker info')

        # get the container statistics for all running containers
        cmd = 'docker stats --no-stream'
        # CONTAINER ID   NAME            CPU %     MEM USAGE / LIMIT     MEM %     NET I/O       BLOCK I/O        PIDS  # pylint: disable=C0301
        # 0d42d796123b   graylog         204.20%   215.6MiB / 3.702GiB   5.69%     578B / 0B     0B / 0B          21    # pylint: disable=C0301
        # 40cd9c3978b3   elasticsearch   188.77%   634.1MiB / 3.702GiB   16.73%    508B / 0B     1.44MB / 0B      26    # pylint: disable=C0301
        # b48543f756e6   mongo           0.27%     73.84MiB / 3.702GiB   1.95%     760B / 116B   6.59MB / 480kB   33    # pylint: disable=C0301
        stdout, stderr, retc = lib.base.coe(lib.shell.shell_exec(cmd, shell=True))
        if (stderr or retc != 0):
            lib.base.cu(stderr)
    else:
        # do not call the command, put in test data
        host_cpus = 1
        stdout, stderr, retc = lib.lftest.test(args.TEST)

    # init some vars
    alerts = []
    state = STATE_OK
    perfdata = lib.base.get_perfdata('cpu', host_cpus, None, None, None, 0, None)
    table_values = []

    # analyze data
    containers = lib.txt.mltext2array(stdout, skip_header=True, sort_key=1)
    # upper bound for the number of rows kept in the cpu table
    max_rows = args.COUNT * len(containers)
    for container in containers:
        try:
            container_id = container[0]
            name = container[1]
            cpu_percent = container[2]
            # index 3 to 5 are "MEM USAGE / LIMIT", index 6 is "MEM %"
            mem_percent = container[6]
        except IndexError:
            # row with too few columns, skip it
            continue
        if container_id == '--':
            # container without ID and name == '--'
            continue
        # https://github.com/Linuxfabrik/monitoring-plugins/issues/586
        if not args.FULL_NAME:
            name = shorten(name)

        # divide by number of cores (got by docker info)
        cpu_usage = round(float(cpu_percent.replace('%', '').strip()) / host_cpus, 1)
        perfdata += lib.base.get_perfdata(
            '{}_cpu_usage'.format(name),
            cpu_usage,
            '%',
            args.WARN_CPU,
            args.CRIT_CPU,
            0,
            100,
        )

        # save trend data to local sqlite database, limited to "count" rows max.
        # The cut has to stay right after the insert: the history comparison
        # below relies on the table being trimmed already.
        lib.base.coe(
            lib.db_sqlite.insert(conn, {'container': name, 'cpu_usage': cpu_usage}, table='cpu'),
        )
        lib.base.coe(lib.db_sqlite.cut(conn, _max=max_rows, table='cpu'))

        # alert when container cpu_usage is exceeded
        # my container state is not ok, if in every of my historic rows the cpu value
        # is above the threshold
        # NOTE(review): the comparison is `== COUNT`; cut() is called with a
        # limit for the whole table, not per container, so a container that
        # holds more than COUNT rows would never match - confirm whether `>=`
        # is intended.
        if get_cpu_from_db(conn, name, args.CRIT_CPU) == args.COUNT:
            cpu_state = STATE_CRIT
        elif get_cpu_from_db(conn, name, args.WARN_CPU) == args.COUNT:
            cpu_state = STATE_WARN
        else:
            cpu_state = STATE_OK
        if cpu_state != STATE_OK:
            alerts.append(
                '"{}" cpu {}% {}'.format(name, cpu_usage, lib.base.state2str(cpu_state))
            )
        state = lib.base.get_worst(cpu_state, state)

        # alert when container mem_usage is exceeded
        mem_usage = float(mem_percent.replace('%', '').strip())
        # the memory perfdata has to carry the memory thresholds, not the cpu ones
        perfdata += lib.base.get_perfdata(
            '{}_mem_usage'.format(name),
            mem_usage,
            '%',
            args.WARN_MEM,
            args.CRIT_MEM,
            0,
            100,
        )
        mem_state = lib.base.get_state(mem_usage, args.WARN_MEM, args.CRIT_MEM)
        if mem_state != STATE_OK:
            alerts.append(
                '"{}" memory {}% {}'.format(name, mem_usage, lib.base.state2str(mem_state))
            )
        state = lib.base.get_worst(mem_state, state)

        table_values.append({
            'name': name,
            'cpu_usage': '{}{}'.format(cpu_usage, lib.base.state2str(cpu_state, prefix=' ')),
            'mem_usage': '{}{}'.format(mem_usage, lib.base.state2str(mem_state, prefix=' ')),
        })

    # we don't need the database any more: save data and close connection
    lib.db_sqlite.commit(conn)
    lib.db_sqlite.close(conn)

    # create output
    if state == STATE_OK:
        msg = 'Everything is ok.\n\n'
    else:
        msg = ', '.join(alerts) + '\n\n'
    if table_values:
        msg += lib.base.get_table(
            table_values,
            ['name', 'cpu_usage', 'mem_usage'],
            header=['Container', 'CPU %', 'Mem % '],
        )

    # over and out
    lib.base.oao(msg, state, perfdata, always_ok=args.ALWAYS_OK)


if __name__ == '__main__':
    # Run the check only when this file is executed as a script. Any exception
    # escaping main() is handed to lib.base.cu().
    # NOTE(review): cu() presumably reports the error and exits with UNKNOWN -
    # confirm in lib.base.
    try:
        main()
    except Exception:   # pylint: disable=W0703
        lib.base.cu()
