#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author:  Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
#          https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.

# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.md

"""See the check's README for more details."""

import argparse
import base64
import configparser
import datetime
import sys

import lib.args
import lib.base
import lib.db_sqlite
import lib.disk
import lib.human
import lib.time
import lib.txt
import lib.url
from lib.globals import STATE_CRIT, STATE_OK, STATE_UNKNOWN, STATE_WARN

__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
# Plugin version; printed together with `__author__` by `-V` / `--version`.
__version__ = '2025100601'

# Passed to argparse as the parser description.
DESCRIPTION = """Detects fast-flapping Icinga services by counting state changes per service within a
configurable lookback interval. Queries the IcingaDB event history and alerts when
any service exceeds the configured number of state changes."""

# Defaults for the command line parameters defined in parse_args().
DEFAULT_CRIT = 19  # `--critical`: state changes per service within the lookback period
DEFAULT_INSECURE = False  # `--insecure`
DEFAULT_LOOKBACK = 4 * 3600  # `--lookback`: time window in seconds (4 hours)
DEFAULT_NO_PROXY = False  # `--no-proxy`
DEFAULT_PWFILE = '/var/spool/icinga2/.icingaweb'  # `--pwfile`: INI file read via configparser
DEFAULT_TIMEOUT = 8  # `--timeout`: seconds
DEFAULT_WARN = 7  # `--warning`: state changes per service within the lookback period


def parse_args():
    """Parse the command line arguments using argparse.

    Returns:
        argparse.Namespace: The parsed parameters, stored under their
        upper-case `dest` names (`ALWAYS_OK`, `CRIT`, `INSECURE`, `LOOKBACK`,
        `NO_PROXY`, `PASSWORD`, `PWFILE`, `TIMEOUT`, `URL`, `USERNAME`,
        `WARN`). Unknown arguments are silently ignored because
        `parse_known_args()` is used.

    Raises:
        SystemExit: Raised by argparse itself for `--help`, `--version` and
        invalid argument values.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)

    parser.add_argument(
        '-V',
        '--version',
        action='version',
        version=f'%(prog)s: v{__version__} by {__author__}',
    )

    parser.add_argument(
        '--always-ok',
        help=lib.args.help('--always-ok'),
        dest='ALWAYS_OK',
        action='store_true',
        default=False,
    )

    # No `type=int` here: the threshold is documented as a Nagios range and is
    # evaluated as a range later on, so specs like `5:10` or `@5:10` have to
    # survive argument parsing as plain strings. The integer default is kept.
    parser.add_argument(
        '-c',
        '--critical',
        help='CRIT threshold for the number of state changes per service within the lookback period. '
        'Supports Nagios ranges. '
        'Default: %(default)s',
        dest='CRIT',
        default=DEFAULT_CRIT,
    )

    parser.add_argument(
        '--insecure',
        help=lib.args.help('--insecure'),
        dest='INSECURE',
        action='store_true',
        default=DEFAULT_INSECURE,
    )

    parser.add_argument(
        '--lookback',
        help='Time window in seconds to consider for state change counting. '
        'Default: %(default)s',
        dest='LOOKBACK',
        type=int,
        default=DEFAULT_LOOKBACK,
    )

    parser.add_argument(
        '--no-proxy',
        help=lib.args.help('--no-proxy'),
        dest='NO_PROXY',
        action='store_true',
        default=DEFAULT_NO_PROXY,
    )

    parser.add_argument(
        '--password',
        help='IcingaWeb password. Takes precedence over the value in `--pwfile`.',
        dest='PASSWORD',
    )

    parser.add_argument(
        '--pwfile',
        help='Path to a password file containing "url", "user" and "password" for IcingaWeb. '
        'Example: `--pwfile /var/spool/icinga2/.icingaweb`. '
        'Default: %(default)s',
        dest='PWFILE',
        default=DEFAULT_PWFILE,
    )

    parser.add_argument(
        '--timeout',
        help=lib.args.help('--timeout') + ' Default: %(default)s (seconds)',
        dest='TIMEOUT',
        type=int,
        default=DEFAULT_TIMEOUT,
    )

    parser.add_argument(
        '--url',
        help='IcingaDB event history URL including filter parameters. '
        'Takes precedence over the value in `--pwfile`. '
        'Example: `--url https://icinga/icingaweb2/icingadb/history?limit=250`.',
        dest='URL',
    )

    parser.add_argument(
        '--username',
        help='IcingaWeb username. Takes precedence over the value in `--pwfile`.',
        dest='USERNAME',
    )

    # Same reasoning as for `--critical`: keep the raw range string.
    parser.add_argument(
        '-w',
        '--warning',
        help='WARN threshold for the number of state changes per service within the lookback period. '
        'Supports Nagios ranges. '
        'Default: %(default)s',
        dest='WARN',
        default=DEFAULT_WARN,
    )

    args, _ = parser.parse_known_args()
    return args


def get_data(args):
    """Request the configured URL with HTTP Basic authentication and JSON accepted.

    The `Authorization` header is built from `args.USERNAME` and
    `args.PASSWORD`; the request itself is delegated to
    `lib.url.fetch_json()`, whose result is returned unchanged.
    """
    credentials = lib.txt.to_bytes(f'{args.USERNAME}:{args.PASSWORD}')
    token = lib.txt.to_text(base64.b64encode(credentials))
    request_header = {
        'Authorization': f'Basic {token}',
        'Accept': 'application/json',
    }
    return lib.url.fetch_json(
        args.URL,
        header=request_header,
        insecure=args.INSECURE,
        no_proxy=args.NO_PROXY,
        timeout=args.TIMEOUT,
    )


def main():
    """Run the check: collect service events and report services that flap.

    Steps:

    1. Parse the command line and fill in a missing username, password or URL
       from the INI-style password file (the command line takes precedence).
    2. Fetch the event list via `get_data()` and store every relevant service
       event in a freshly created SQLite table.
    3. Count the stored events per host/service and evaluate each count
       against the WARN and CRIT thresholds.
    4. Print the summary (plus a table, if there is data) and exit through
       `lib.base.oao()`.
    """

    # parse the command line
    try:
        args = parse_args()
    except SystemExit:
        # argparse exits on its own (`--help`, `--version`, invalid values);
        # always leave with UNKNOWN in that case
        sys.exit(STATE_UNKNOWN)

    have_creds = args.USERNAME and args.PASSWORD and args.URL
    have_pwfile = args.PWFILE
    if not (have_creds or have_pwfile):
        lib.base.cu(
            'The three parameters `--url`, `--username` and `--password` are required. '
            'Alternatively, you can provide a password file for IcingaWeb to read '
            'the "url", "user" or "password" from, using `--pwfile`.'
        )

    # if given, read password INI-file, but cmdline takes precedence
    config = configparser.ConfigParser()
    try:
        if config.read(args.PWFILE):
            # there is a password file; every section is searched, and only
            # values not already given on the command line are taken over
            for section in config.sections():
                for key, value in config.items(section):
                    if args.USERNAME is None and key.lower().startswith('user'):
                        args.USERNAME = value
                    if args.PASSWORD is None and key.lower() == 'password':
                        args.PASSWORD = value
                    if args.URL is None and key.lower() == 'url':
                        args.URL = value
    except configparser.Error as e:
        print(e)
        sys.exit(STATE_UNKNOWN)

    have_creds = args.USERNAME and args.PASSWORD and args.URL
    if not have_creds:
        # Build the suffix on its own: a conditional expression binds weaker
        # than `+`, so it must not be appended inline to the concatenation.
        location = f' in `{args.PWFILE}`.' if args.PWFILE else '.'
        lib.base.cu(f'Either the username, password or URL is not specified{location}')

    # init some vars
    state = STATE_OK

    # create the db table (dropped and rebuilt on every run)
    definition = """
        host_id                 TEXT NOT NULL,
        host_display_name       TEXT NOT NULL,
        service_id              TEXT NOT NULL,
        service_display_name    TEXT NOT NULL,
        state_event_time        TEXT NOT NULL
    """
    conn = lib.base.coe(
        lib.db_sqlite.connect(
            filename='linuxfabrik-monitoring-plugins-icinga-topflap-services.db'
        ),
    )
    lib.base.coe(lib.db_sqlite.create_table(conn, definition, drop_table_first=True))
    lib.base.coe(lib.db_sqlite.create_index(conn, 'host_id, service_id'))

    now = lib.time.now()

    # fetch data, filter events and enrich them
    for event in lib.base.coe(get_data(args)):
        if event.get('object_type') != 'service':
            continue

        # `or {}` keeps the lookups below from failing on a missing or null
        # nested object
        service = event.get('service') or {}
        if (service.get('state') or {}).get('in_downtime'):
            continue

        event_time = (event.get('state') or {}).get('event_time')
        if not event_time:
            continue
        event_epoch = lib.time.timestr2epoch(
            event_time[:19],  # 2025-02-26T13:45:15.207+00:00
            pattern='%Y-%m-%dT%H:%M:%S',  # 2025-02-26T13:45:15
            tzinfo=datetime.timezone.utc,  # icinga works with UTC timezone format
        )
        # ignore if event is outside lookback interval; due to abs() this also
        # drops events dated more than LOOKBACK seconds into the future
        if abs(now - event_epoch) > args.LOOKBACK:
            continue

        service_display_name = service.get('display_name')
        # ignore events with "Waiting for Icinga DB to synchronize the config."
        if service_display_name is None:
            continue

        # ignore myself (any service whose name contains both "top" and "flap")
        name_lower = service_display_name.lower()
        if 'top' in name_lower and 'flap' in name_lower:
            continue

        host = event.get('host') or {}
        lib.base.coe(
            lib.db_sqlite.insert(
                conn,
                {
                    'state_event_time': event_epoch,
                    'service_display_name': service_display_name,
                    'host_id': host.get('id'),
                    'host_display_name': host.get('display_name'),
                    'service_id': service.get('id'),
                },
            )
        )
    lib.base.coe(lib.db_sqlite.commit(conn))

    # analyze data
    # from here on just working on the database
    # NOTE(review): `perfdata` is presumably the default table name used by
    # the lib.db_sqlite helpers above - confirm in lib.db_sqlite.
    rows = lib.base.coe(
        lib.db_sqlite.select(
            conn,
            """
        select host_display_name, service_display_name, count(*) as cnt
        from perfdata
        where 1
        group by host_id, service_id
        order by cnt desc
        """,
        )
    )
    lib.db_sqlite.close(conn)

    # rate every service and keep track of the worst state seen
    for row in rows:
        row['state'] = lib.base.get_state(
            row['cnt'],
            args.WARN,
            args.CRIT,
            _operator='range',
        )
        row['state_hr'] = lib.base.state2str(row['state'], empty_ok=False)
        state = lib.base.get_worst(state, row['state'])

    # build the message
    msg = {
        STATE_CRIT: 'There are critical errors.',
        STATE_WARN: 'There are warnings.',
    }.get(state, 'Everything is ok.')
    msg += f' (lookback={lib.human.seconds2human(args.LOOKBACK)} warn={args.WARN} crit={args.CRIT})'
    if rows:
        msg += '\n\n' + lib.base.get_table(
            rows,
            [
                'host_display_name',
                'service_display_name',
                'cnt',
                'state_hr',
            ],
            header=[
                'Host',
                'Service',
                'Cnt',
                'State',
            ],
        )

    # over and out
    lib.base.oao(msg, state, always_ok=args.ALWAYS_OK)


if __name__ == '__main__':
    # Script entry point: run the check only when executed directly, not on import.
    try:
        main()
    except Exception:
        # Last-resort handler for anything main() did not handle itself.
        # `sys.exit()` raises SystemExit, which is not an `Exception` subclass
        # and therefore passes through untouched.
        # NOTE(review): lib.base.cu() presumably reports the error and exits
        # with UNKNOWN - confirm in lib.base.
        lib.base.cu()
