#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author:  Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
#          https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.

# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.md

"""See the check's README for more details."""

import argparse
import re
import sys

import lib.args
import lib.base
import lib.db_mysql
import lib.human
from lib.globals import STATE_CRIT, STATE_OK, STATE_UNKNOWN, STATE_WARN

__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
__version__ = '2026052901'

DESCRIPTION = """Checks index sizes, fragmentation, and consistent engine and collation usage
across all schemas in MySQL/MariaDB, and lists the largest tables by combined data and index
size so storage growth can be traced before raising memory settings such as the InnoDB buffer
pool. Alerts on mixed storage engines or collations within a single schema, and on table sizes
that cross the optional --warning / --critical thresholds. Supports extended reporting via
--lengthy."""

DEFAULT_CRIT = ''
DEFAULT_DEFAULTS_FILE = '/var/spool/icinga2/.my.cnf'
DEFAULT_DEFAULTS_GROUP = 'client'
DEFAULT_IGNORE_SCHEMAS = ''
DEFAULT_IGNORE_TABLES = ''
DEFAULT_LENGTHY = False
DEFAULT_TIMEOUT = 3
DEFAULT_TOP = 10
DEFAULT_WARN = ''

# System schemas: skipped during all aggregates and per-schema checks. mysqltuner
# excludes the same set; `percona` is added because the Percona Server Toolkit installs a
# `percona` schema for its own bookkeeping and it is not user data.
SYSTEM_SCHEMAS = ('information_schema', 'mysql', 'percona', 'performance_schema', 'sys')


def parse_args():
    """Parse command line arguments using argparse."""
    parser = argparse.ArgumentParser(description=DESCRIPTION)

    parser.add_argument(
        '-V',
        '--version',
        action='version',
        version=f'%(prog)s: v{__version__} by {__author__}',
    )

    parser.add_argument(
        '--always-ok',
        help=lib.args.help('--always-ok'),
        dest='ALWAYS_OK',
        action='store_true',
        default=False,
    )

    parser.add_argument(
        '-c',
        '--critical',
        help='CRIT threshold for the size of a single table (data + index). '
        'Supports Nagios ranges with size qualifiers, '
        'for example `10G`, `5G:`, `@1G:10G`. '
        'Default: report only (no alerting).',
        dest='CRITICAL',
        default=DEFAULT_CRIT,
    )

    parser.add_argument(
        '--defaults-file',
        help='MySQL/MariaDB cnf file to read user, host and password '
        'from (instead of specifying them on the command line). '
        'Example: `/var/spool/icinga2/.my.cnf`. Default: %(default)s',
        dest='DEFAULTS_FILE',
        default=DEFAULT_DEFAULTS_FILE,
    )

    parser.add_argument(
        '--defaults-group',
        help=lib.args.help('--defaults-group') + ' Default: %(default)s',
        dest='DEFAULTS_GROUP',
        default=DEFAULT_DEFAULTS_GROUP,
    )

    parser.add_argument(
        '--ignore-schemas',
        help='Regular expression matched against `SCHEMA_NAME` (case-sensitive). Schemas '
        'whose name matches are skipped entirely (no aggregate contribution, no checks). '
        'Useful for known-mixed schemas that the admin cannot or does not want to fix '
        '(common with Icinga Director / Icinga Web 2 / Icinga DB schemas, which mix '
        'utf8 / utf8mb4 collations by design). System schemas are skipped unconditionally. '
        'Default: %(default)s. '
        'Example: `--ignore-schemas="^(icinga_director|icingaweb2|icingadb)$"`',
        dest='IGNORE_SCHEMAS',
        default=DEFAULT_IGNORE_SCHEMAS,
    )

    parser.add_argument(
        '--ignore-tables',
        help='Regular expression matched against `TABLE_NAME` (case-sensitive). Tables '
        'whose name matches are excluded from every aggregate and every per-schema check. '
        'Useful for muting noisy temporary or backup tables that legitimately differ from '
        'the schema-wide engine/collation. '
        'Default: %(default)s. '
        'Example: `--ignore-tables="^(tmp_|backup_)"`',
        dest='IGNORE_TABLES',
        default=DEFAULT_IGNORE_TABLES,
    )

    parser.add_argument(
        '--lengthy',
        help=lib.args.help('--lengthy') + ' Default: %(default)s',
        dest='LENGTHY',
        action='store_true',
        default=DEFAULT_LENGTHY,
    )

    parser.add_argument(
        '--timeout',
        help=lib.args.help('--timeout') + ' Default: %(default)s (seconds)',
        dest='TIMEOUT',
        type=int,
        default=DEFAULT_TIMEOUT,
    )

    parser.add_argument(
        '--top',
        help='Number of largest tables (by data + index size) to list. '
        'Default: %(default)s',
        dest='TOP',
        type=int,
        default=DEFAULT_TOP,
    )

    parser.add_argument(
        '-w',
        '--warning',
        help='WARN threshold for the size of a single table (data + index). '
        'Supports Nagios ranges with size qualifiers, '
        'for example `10G`, `5G:`, `@1G:10G`. '
        'Default: report only (no alerting).',
        dest='WARNING',
        default=DEFAULT_WARN,
    )

    args, _ = parser.parse_known_args()
    return args


def main():
    """The main function. This is where the magic happens."""

    # logic taken from mysqltuner.pl:mysql_databases(), verified in sync with
    # MySQLTuner (the per-database engine/collation/charset consistency
    # checks and the index-vs-data-size check are unchanged upstream since the
    # original port).
    #
    # Intentional deviation: the index-vs-data-size check additionally requires
    # one of the two sizes to exceed 10 MB. Tiny schemas (under a few MB)
    # routinely have proportionally larger indices than data, which is not
    # actionable for an admin and would generate constant noise.

    # parse the command line
    try:
        args = parse_args()
    except SystemExit:
        sys.exit(STATE_UNKNOWN)

    # fetch data
    mysql_connection = {
        'defaults_file': args.DEFAULTS_FILE,
        'defaults_group': args.DEFAULTS_GROUP,
        'timeout': args.TIMEOUT,
    }
    conn = lib.base.coe(lib.db_mysql.connect(mysql_connection))
    lib.base.coe(lib.db_mysql.check_privileges(conn, 'SELECT'))

    # The --ignore-tables and --ignore-schemas regexes are interpolated into the SQL
    # string. The values come from the CLI (admin-controlled), and any embedded single
    # quotes are escaped to keep the surrounding SQL literal valid. MySQL itself rejects
    # malformed regex patterns at query time and `lib.base.coe()` surfaces the error.
    ignore_tables_clause = ''
    if args.IGNORE_TABLES:
        safe_regex = args.IGNORE_TABLES.replace("'", "''")
        ignore_tables_clause = f" and TABLE_NAME NOT REGEXP '{safe_regex}'"
    ignore_schemas_clause = ''
    ignore_schemas_tbl_clause = ''
    if args.IGNORE_SCHEMAS:
        safe_regex = args.IGNORE_SCHEMAS.replace("'", "''")
        ignore_schemas_clause = f" and SCHEMA_NAME NOT REGEXP '{safe_regex}'"
        ignore_schemas_tbl_clause = f" and TABLE_SCHEMA NOT REGEXP '{safe_regex}'"

    # SCHEMA_NAME and others must be uppercase due to MySQL 8+
    excluded = ', '.join(f'"{s}"' for s in SYSTEM_SCHEMAS)
    sql = f"""
        select SCHEMA_NAME
        from information_schema.schemata
        where SCHEMA_NAME not in ({excluded})
            {ignore_schemas_clause};
    """  # nosec B608
    dblist = lib.base.coe(lib.db_mysql.select(conn, sql))

    # init some vars
    state = STATE_OK
    perfdata = ''
    results = {
        'index': '',
        'engine': '',
        'collation': '',
        'colcharset': '',
        'colcollation': '',
    }
    empty_schemas = []
    per_schema_rows = []
    total_data_size = 0
    total_index_size = 0
    total_rows = 0
    total_tables = 0
    # Table-size thresholds are Nagios ranges with size qualifiers (e.g. `10G`),
    # converted to byte ranges so lib.base.get_state() can evaluate them. None
    # means "report only" (no alerting).
    warn_bytes = lib.human.humanrange2bytes(args.WARNING) if args.WARNING else None
    crit_bytes = lib.human.humanrange2bytes(args.CRITICAL) if args.CRITICAL else None

    # analyze data
    # schema names come from MySQL itself (information_schema.schemata), not from user input
    for schema in dblist:
        schema_name = schema['SCHEMA_NAME']
        sql = f'''
            select TABLE_SCHEMA,
                sum(TABLE_ROWS) as sum_rows,
                sum(DATA_LENGTH) as sum_data,
                sum(INDEX_LENGTH) as sum_index,
                sum(DATA_LENGTH+INDEX_LENGTH) as sum_data_index,
                count(distinct ENGINE) as cnt_engines,
                count(TABLE_NAME) as cnt_tables,
                count(distinct(TABLE_COLLATION)) as cnt_collations
            from information_schema.tables
            where TABLE_SCHEMA="{schema_name}"
                {ignore_tables_clause}
            group by TABLE_SCHEMA
            order by TABLE_SCHEMA;
        '''  # nosec B608
        dbinfo = lib.base.coe(lib.db_mysql.select(conn, sql, fetchone=True))
        if dbinfo is None:
            empty_schemas.append(schema_name)
            continue

        # MySQL returns SQL NULL as Python None (not the string "NULL"); use is/is not
        # None for the guards.
        sum_data = dbinfo['sum_data']
        sum_index = dbinfo['sum_index']
        sum_rows = dbinfo['sum_rows']
        sum_total = dbinfo['sum_data_index']
        cnt_tables = dbinfo['cnt_tables']

        total_data_size += int(sum_data or 0)
        total_index_size += int(sum_index or 0)
        total_rows += int(sum_rows or 0)
        total_tables += int(cnt_tables or 0)

        if (
            sum_data is not None
            and sum_index is not None
            and (sum_data > 10 * 1024 * 1024 or sum_index > 10 * 1024 * 1024)
            and sum_data < sum_index
        ):
            idx_h = lib.human.bytes2human(sum_index)
            data_h = lib.human.bytes2human(sum_data)
            results['index'] += f'{schema_name} ({idx_h} / {data_h}), '

        if dbinfo['cnt_engines'] > 1:
            results['engine'] += f'{schema_name} ({dbinfo["cnt_engines"]}x), '

        if dbinfo['cnt_collations'] > 1:
            results['collation'] += f'{schema_name} ({dbinfo["cnt_collations"]}x), '

        sql = f'''
            select distinct(CHARACTER_SET_NAME)
            from information_schema.COLUMNS
            where CHARACTER_SET_NAME is not null
                and TABLE_SCHEMA = "{schema_name}"
                {ignore_tables_clause};
        '''  # nosec B608
        distinct_column_charset = lib.base.coe(lib.db_mysql.select(conn, sql))
        if len(distinct_column_charset) > 1:
            results['colcharset'] += (
                f'{schema_name} ({len(distinct_column_charset)}x), '
            )

        sql = f'''
            select distinct(COLLATION_NAME)
            from information_schema.COLUMNS
            where COLLATION_NAME is not null
                and TABLE_SCHEMA = "{schema_name}"
                {ignore_tables_clause};
        '''  # nosec B608
        distinct_column_collation = lib.base.coe(lib.db_mysql.select(conn, sql))
        if len(distinct_column_collation) > 1:
            results['colcollation'] += (
                f'{schema_name} ({len(distinct_column_collation)}x), '
            )

        # Collect per-schema details for the optional --lengthy table.
        sql = f'''
            select distinct ENGINE
            from information_schema.tables
            where TABLE_SCHEMA="{schema_name}"
                and ENGINE is not null
                {ignore_tables_clause};
        '''  # nosec B608
        engine_rows = lib.base.coe(lib.db_mysql.select(conn, sql))
        engines_list = sorted(r['ENGINE'] for r in engine_rows)

        sql = f'''
            select distinct TABLE_COLLATION
            from information_schema.tables
            where TABLE_SCHEMA="{schema_name}"
                and TABLE_COLLATION is not null
                {ignore_tables_clause};
        '''  # nosec B608
        tcoll_rows = lib.base.coe(lib.db_mysql.select(conn, sql))
        tcoll_list = sorted(r['TABLE_COLLATION'] for r in tcoll_rows)

        ccharset_list = sorted(r['CHARACTER_SET_NAME'] for r in distinct_column_charset)
        ccoll_list = sorted(r['COLLATION_NAME'] for r in distinct_column_collation)

        # Compact issue summary for the default (non-lengthy) table.
        issues = []
        if dbinfo['cnt_engines'] > 1:
            issues.append(f'{dbinfo["cnt_engines"]} engines')
        if len(tcoll_list) > 1:
            issues.append(f'{len(tcoll_list)} table collations')
        if len(ccharset_list) > 1:
            issues.append(f'{len(ccharset_list)} column charsets')
        if len(ccoll_list) > 1:
            issues.append(f'{len(ccoll_list)} column collations')

        per_schema_rows.append(
            {
                'schema': schema_name,
                'tables': cnt_tables or 0,
                'rows': sum_rows or 0,
                'data': lib.human.bytes2human(sum_data or 0),
                'index': lib.human.bytes2human(sum_index or 0),
                'total': lib.human.bytes2human(sum_total or 0),
                'engines': ', '.join(engines_list) or '-',
                'table_coll': ', '.join(tcoll_list) or '-',
                'col_charsets': ', '.join(ccharset_list) or '-',
                'col_coll': ', '.join(ccoll_list) or '-',
                'issues': ', '.join(issues) if issues else 'OK',
            }
        )

    # Top N tables by combined data + index size, across all scanned schemas.
    # This surfaces cleanup candidates before an admin blindly raises memory
    # settings such as innodb_buffer_pool_size. The list is reported by default;
    # it only drives the state when --warning / --critical size thresholds are
    # given.
    sql = f"""
        select TABLE_SCHEMA,
            TABLE_NAME,
            coalesce(DATA_LENGTH, 0) as data_size,
            coalesce(INDEX_LENGTH, 0) as index_size,
            coalesce(DATA_LENGTH, 0) + coalesce(INDEX_LENGTH, 0) as total_size
        from information_schema.tables
        where TABLE_TYPE = 'BASE TABLE'
            and TABLE_SCHEMA not in ({excluded})
            {ignore_schemas_tbl_clause}
            {ignore_tables_clause}
        order by total_size desc
        limit {int(args.TOP)};
    """  # nosec B608
    top_tables = lib.base.coe(lib.db_mysql.select(conn, sql))

    lib.db_mysql.close(conn)

    top_rows = []
    for tbl in top_tables:
        data_size = int(tbl['data_size'] or 0)
        index_size = int(tbl['index_size'] or 0)
        total_size = int(tbl['total_size'] or 0)
        # Thresholds apply to the combined footprint (the table is ranked by it),
        # so the per-row state and its marker live on the Total column.
        tbl_state = lib.base.get_state(
            total_size,
            warn_bytes,
            crit_bytes,
            _operator='range',
        )
        state = lib.base.get_worst(state, tbl_state)
        top_rows.append(
            {
                'schema': tbl['TABLE_SCHEMA'],
                'table': tbl['TABLE_NAME'],
                'data': lib.human.bytes2human(data_size),
                'index': lib.human.bytes2human(index_size),
                'total': f'{lib.human.bytes2human(total_size)}'
                f'{lib.base.state2str(tbl_state, prefix=" ")}',
            }
        )
        # Per-table perfdata so Grafana can trend each table's footprint. Sanitize
        # the `<schema>_<table>` label so exotic names stay perfdata-safe.
        label = (
            re.sub(r'\W+', '_', f'{tbl["TABLE_SCHEMA"]}_{tbl["TABLE_NAME"]}') + '_size'
        )
        perfdata += lib.base.get_perfdata(
            label,
            total_size,
            uom='B',
            warn=warn_bytes,
            crit=crit_bytes,
            _min=0,
        )

    # build the message
    # Independent sections joined with a single blank line between each. This
    # guarantees no `\n\n\n` (double blank lines) appear in the final output
    # regardless of which optional sections fire.

    # Section 1: header + bullet findings, or the OK summary
    bullets = []
    if results['index']:
        bullets.append(
            f'* Index size is larger than data size: {results["index"][:-2]}'
        )
    if results['engine']:
        bullets.append(
            f'* Mixed storage engines (use one engine for all tables in a schema):'
            f' {results["engine"][:-2]}'
        )
    if results['collation']:
        bullets.append(
            f'* Mixed table collations (use one collation for all tables in a schema):'
            f' {results["collation"][:-2]}'
        )
    if results['colcharset']:
        bullets.append(
            f'* Mixed column charsets (use one charset for all text-like columns'
            f' if possible): {results["colcharset"][:-2]}'
        )
    if results['colcollation']:
        bullets.append(
            f'* Mixed column collations (use one collation for all text-like columns'
            f' if possible): {results["colcollation"][:-2]}'
        )

    if bullets:
        state = lib.base.get_worst(state, STATE_WARN)

    # "Everything is ok." leads so admins see the verdict first; the
    # scanned-scope details follow. Format mirrors mysqltuner's per-database
    # stats but condensed to a single line. The state may already be WARN/CRIT
    # from the table-size thresholds above, so the header reflects the worst
    # state, not just the consistency findings.
    summary = (
        f'{len(dblist)} user schema(s) scanned,'
        f' {total_tables} table(s),'
        f' {lib.human.number2human(total_rows)} rows,'
        f' {lib.human.bytes2human(total_data_size)} data,'
        f' {lib.human.bytes2human(total_index_size)} indices.'
    )

    sections = []
    if state == STATE_OK:
        sections.append(f'Everything is ok. {summary}')
    else:
        header = (
            'There are critical errors.'
            if state == STATE_CRIT
            else 'There are warnings.'
        )
        if bullets:
            sections.append(f'{header}\n\n' + '\n'.join(bullets))
        else:
            sections.append(header)

    # Section 2: top tables by size. Always reported (the headline feature),
    # independent of --lengthy. The per-row STATE marker is the last thing on
    # the line so IcingaWeb's icon substitution does not break the table.
    if top_rows:
        thresholds = ''
        if args.WARNING or args.CRITICAL:
            thresholds = f' (warn={args.WARNING or "-"} crit={args.CRITICAL or "-"})'
        top_table = lib.base.get_table(
            top_rows,
            ['schema', 'table', 'data', 'index', 'total'],
            header=['Schema', 'Table', 'Data', 'Index', 'Total'],
        )
        sections.append(
            f'Top {len(top_rows)} tables by size{thresholds}:\n\n{top_table}'
        )

    # Section 3: empty-schemas info (common: lazy-init apps, fresh installs,
    # migration leftovers); does not change state, mirroring our "alert on the
    # actionable" stance.
    if empty_schemas:
        sections.append(
            f'Note: {len(empty_schemas)} empty schema(s) (no tables):'
            f' {", ".join(empty_schemas)}.'
        )

    # Section 4: per-schema breakdown table. Without --lengthy: compact summary
    # (Schema | Tables | Total Size | Issues). With --lengthy: full breakdown,
    # mirroring the per-database information mysqltuner emits when invoked with
    # `--dbstat`.
    if per_schema_rows:
        if args.LENGTHY:
            keys = [
                'schema',
                'tables',
                'rows',
                'data',
                'index',
                'total',
                'engines',
                'table_coll',
                'col_charsets',
                'col_coll',
            ]
            headers = [
                'Schema',
                'Tables',
                'Rows',
                'Data',
                'Index',
                'Total',
                'Engines',
                'Table Collations',
                'Column Charsets',
                'Column Collations',
            ]
        else:
            keys = ['schema', 'tables', 'total', 'issues']
            headers = ['Schema', 'Tables', 'Size', 'Issues']
        sections.append(lib.base.get_table(per_schema_rows, keys, header=headers))

    # rstrip each section: lib.base.get_table() appends a trailing newline, which
    # would turn the blank line between two adjacent tables into two blank lines.
    msg = '\n\n'.join(section.rstrip() for section in sections)

    perfdata += lib.base.get_perfdata('mysql_database_count', len(dblist), _min=0)
    perfdata += lib.base.get_perfdata(
        'mysql_total_data_size',
        total_data_size,
        uom='B',
        _min=0,
    )
    perfdata += lib.base.get_perfdata(
        'mysql_total_index_size',
        total_index_size,
        uom='B',
        _min=0,
    )
    perfdata += lib.base.get_perfdata('mysql_total_rows', total_rows, _min=0)
    perfdata += lib.base.get_perfdata('mysql_total_tables', total_tables, _min=0)

    # over and out
    lib.base.oao(msg, state, perfdata, always_ok=args.ALWAYS_OK)


if __name__ == '__main__':
    try:
        main()
    except Exception:
        lib.base.cu()
