#!/usr/bin/env python3
# -*- coding: utf-8; py-indent-offset: 4 -*-
#
# Author:  Linuxfabrik GmbH, Zurich, Switzerland
# Contact: info (at) linuxfabrik (dot) ch
#          https://www.linuxfabrik.ch/
# License: The Unlicense, see LICENSE file.

# https://github.com/Linuxfabrik/monitoring-plugins/blob/main/CONTRIBUTING.md

"""See the check's README for more details."""

import argparse
import json
import os
import sys

import lib.args
import lib.base
import lib.lftest
import lib.redfish
import lib.txt
import lib.url
from lib.globals import STATE_CRIT, STATE_OK, STATE_UNKNOWN, STATE_WARN

# Both values are printed by the `-V` / `--version` option.
__author__ = 'Linuxfabrik GmbH, Zurich/Switzerland'
__version__ = '2026060707'

# Passed to argparse as the parser description.
DESCRIPTION = """Checks hardware sensor readings (temperature, voltage, fan speed, power) from the
Redfish Chassis collection via the Redfish API. Reads the modern Sensors collection where
available and falls back to the legacy Thermal and Power endpoints otherwise. Alerts when any
sensor reports a non-ok state."""

# Path that is appended to --url to reach the Chassis collection (`{URL}{API_BASE}/Chassis`).
API_BASE = '/redfish/v1'
DEFAULT_CACHE_EXPIRE = 15  # minutes; keep below the controller's session timeout
# NOTE(review): --insecure is declared with action='store_true', so with a default of True
# the option is always True and cannot be switched off on the command line - confirm that
# this is intended.
DEFAULT_INSECURE = True
DEFAULT_NO_PROXY = False
DEFAULT_RETRIES = 3  # extra attempts on a failed Redfish request
DEFAULT_TIMEOUT = 8  # seconds, as stated in the --timeout help text
DEFAULT_URL = 'https://localhost:5000'


def parse_args():
    """Declare the command line interface of this check and parse `sys.argv`.

    The options are described in a table (flags plus the keyword arguments for
    `add_argument()`) and registered in that order, so the order of the table
    is also the order in the `--help` output.

    Returns the argparse namespace. Arguments the parser does not know are
    accepted and dropped, because `parse_known_args()` is used.
    """
    # shared tail of the --ignore and --match help texts
    regex_hint = (
        'Case-sensitive by default; use `(?i)` for case-insensitive matching. '
        'Can be specified multiple times.'
    )

    option_table = [
        (
            ('-V', '--version'),
            {
                'action': 'version',
                'version': f'%(prog)s: v{__version__} by {__author__}',
            },
        ),
        (
            ('--always-ok',),
            {
                'help': lib.args.help('--always-ok'),
                'dest': 'ALWAYS_OK',
                'action': 'store_true',
                'default': False,
            },
        ),
        (
            ('--brief',),
            {
                'help': 'Hide items that are OK and show only those in WARN/CRIT state. '
                'Alerting is unaffected: all items still drive the overall check state. '
                'Default: %(default)s',
                'dest': 'BRIEF',
                'action': 'store_true',
                'default': False,
            },
        ),
        (
            ('--cache-expire',),
            {
                'help': lib.args.help('--cache-expire') + ' Default: %(default)s',
                'dest': 'CACHE_EXPIRE',
                'type': int,
                'default': DEFAULT_CACHE_EXPIRE,
            },
        ),
        (
            ('--ignore',),
            {
                'help': 'Ignore items whose name matches this Python regular expression. '
                + regex_hint,
                'dest': 'IGNORE',
                'action': 'append',
                'default': None,
            },
        ),
        (
            ('--insecure',),
            {
                'help': lib.args.help('--insecure'),
                'dest': 'INSECURE',
                'action': 'store_true',
                'default': DEFAULT_INSECURE,
            },
        ),
        (
            ('--inventory',),
            {
                'help': 'Output the parsed components as JSON on stdout and exit OK, instead of '
                'running a health check. Use this to collect a hardware inventory: the JSON is a '
                'single object keyed by component type, so the output of several Redfish checks can '
                'be merged into one inventory document with `jq --slurp`. Ignores --brief, --match '
                'and --ignore. Default: %(default)s',
                'dest': 'INVENTORY',
                'action': 'store_true',
                'default': False,
            },
        ),
        (
            ('--match',),
            {
                'help': 'Only check items whose name matches this Python regular expression. '
                + regex_hint,
                'dest': 'MATCH',
                'action': 'append',
                'default': None,
            },
        ),
        (
            ('--no-proxy',),
            {
                'help': lib.args.help('--no-proxy'),
                'dest': 'NO_PROXY',
                'action': 'store_true',
                'default': DEFAULT_NO_PROXY,
            },
        ),
        (
            ('--password',),
            {
                'help': 'Redfish API password.',
                'dest': 'PASSWORD',
            },
        ),
        (
            ('--retries',),
            {
                'help': 'Number of extra attempts if a request to the Redfish API fails, before the '
                'check gives up. Helps against an occasionally slow or flaky management controller. '
                'Default: %(default)s',
                'dest': 'RETRIES',
                'type': int,
                'default': DEFAULT_RETRIES,
            },
        ),
        (
            ('--test',),
            {
                'help': lib.args.help('--test'),
                'dest': 'TEST',
                'type': lib.args.csv,
            },
        ),
        (
            ('--timeout',),
            {
                'help': lib.args.help('--timeout') + ' Default: %(default)s (seconds)',
                'dest': 'TIMEOUT',
                'type': int,
                'default': DEFAULT_TIMEOUT,
            },
        ),
        (
            ('--url',),
            {
                'help': 'Redfish API URL. Default: %(default)s',
                'dest': 'URL',
                'default': DEFAULT_URL,
            },
        ),
        (
            ('--username',),
            {
                'help': 'Redfish API username.',
                'dest': 'USERNAME',
            },
        ),
    ]

    cli = argparse.ArgumentParser(description=DESCRIPTION)
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    known_args, _unknown = cli.parse_known_args()
    return known_args


def load_test_fixture(test_args, path):
    """Read the test fixture `path` through `lib.lftest.test()` and return it as parsed JSON.

    `test_args` is the list from `--test`. Its first element is overwritten
    in place with `path`, so each call of the Redfish walk can point at its
    own fixture file while the remaining elements are passed on unchanged.

    A missing file or malformed JSON ends the check through `lib.base.cu()`
    with an explanatory message instead of a traceback.
    """
    fixture_exists = os.path.isfile(path)
    if not fixture_exists:
        lib.base.cu(f'Test fixture not found: "{path}".')

    test_args[0] = path
    fixture_text, _, _ = lib.lftest.test(test_args)

    try:
        parsed = json.loads(fixture_text)
    except ValueError as err:
        # json.JSONDecodeError is a subclass of ValueError, so this covers both
        lib.base.cu(f'Test fixture "{path}" does not contain valid JSON: {err}')
    else:
        return parsed


def _load_json(args, header, test_base, path, fixture_suffix, required=True):
    """Return the JSON document for the Redfish `path`.

    In `--test` mode nothing is fetched; the fixture file
    `<test_base>-<fixture_suffix>` is loaded instead (a missing fixture ends
    the check in `load_test_fixture()`).

    Otherwise `args.URL + path` is requested. With `required=True` a failed
    request ends the check via `lib.base.coe()`. With `required=False` a
    failed request or a non-dict answer yields an empty dict, so an endpoint
    that a chassis does not offer cannot abort the check.
    """
    if args.TEST is not None:
        return load_test_fixture(args.TEST, f'{test_base}-{fixture_suffix}')
    response = lib.url.fetch_json(
        f'{args.URL}{path}',
        header=header,
        insecure=args.INSECURE,
        no_proxy=args.NO_PROXY,
        timeout=args.TIMEOUT,
        retries=args.RETRIES,
    )
    if required:
        return lib.base.coe(response)
    success, result = response
    if not success or not isinstance(result, dict):
        return {}
    return result


def _is_filtered_out(name, ignore_patterns, match_patterns):
    """Return True if `name` is excluded by --ignore or not selected by --match."""
    # a missing name must not make `pattern.search()` raise
    name = '' if name is None else str(name)
    if any(p.search(name) for p in ignore_patterns):
        return True
    return bool(match_patterns) and not any(p.search(name) for p in match_patterns)


def _add_to_inventory(inventory, item, chassis):
    """Append a copy of `item`, tagged with its chassis id, when --inventory is active."""
    if inventory is None:
        return
    # NOTE(review): the key is spelled `chassi_ids` (probably meant to be
    # `chassis_id`). It is kept unchanged because consumers of the inventory
    # JSON may rely on it - confirm before renaming.
    inventory.append(dict(item, chassi_ids=chassis['Id']))


def _rate_item(item, state, check_thresholds):
    """Rate one table row and return `(perfdata, state)`.

    Sets the display columns `Value` and `State` on `item` and folds the
    item's states into the running overall `state`. With
    `check_thresholds=True` the `Value` column reflects the reading compared
    against its thresholds; otherwise the item is rated on health only and
    both columns show the health state.
    """
    perfdata = lib.redfish.get_perfdata(item, 'Reading')
    if check_thresholds:
        value_state = lib.redfish.get_sensor_state(item, 'Reading')
        state = lib.base.get_worst(state, value_state)
        item['Value'] = lib.base.state2str(value_state, empty_ok=False)
        health_state = lib.redfish.get_state(item)
    else:
        health_state = lib.redfish.get_state(item)
        item['Value'] = lib.base.state2str(health_state, empty_ok=False)
    state = lib.base.get_worst(state, health_state)
    item['State'] = lib.base.state2str(health_state, empty_ok=False)
    return perfdata, state


def main():
    """Walk the Redfish Chassis collection, rate all sensors and report the result.

    For every enabled chassis the modern Sensors collection is read. If it
    yields no enabled sensor, the legacy Thermal and Power endpoints are used
    instead. Fan redundancy is always taken from Thermal. The worst state
    found becomes the state of the check.

    With `--inventory` the collected items are printed as JSON and the
    process exits with STATE_OK. Otherwise message, state and perfdata are
    handed to `lib.base.oao()`. A command line that argparse rejects (as well
    as `--help`/`--version`) exits with STATE_UNKNOWN.
    """

    # parse the command line
    try:
        args = parse_args()
    except SystemExit:
        sys.exit(STATE_UNKNOWN)

    # set default values for append parameters that were not specified
    if args.IGNORE is None:
        args.IGNORE = []
    if args.MATCH is None:
        args.MATCH = []

    # compile the item filter regexes once
    ignore_patterns = [
        lib.base.coe(p) for p in lib.txt.compile_regex(args.IGNORE, '--ignore')
    ]
    match_patterns = [
        lib.base.coe(p) for p in lib.txt.compile_regex(args.MATCH, '--match')
    ]

    # prepare the data source
    header = None
    test_base = None
    if args.TEST is None:
        if not args.URL.startswith(('http://', 'https://')):
            lib.base.cu('--url parameter has to start with "http://" or "https://".')
        header = {'Accept': 'application/json'}
        # reuse a cached Redfish session token across requests and runs, so we
        # do not create (and have the controller log) a new session each time
        header.update(lib.redfish.get_auth_header(args))
    else:
        # do not call the API, put in test data. Each API call in the
        # Redfish walk has an explicit fixture suffix, so the fixture
        # file names describe what they contain (chassises, chassis,
        # sensors, sensor-N, thermal, power). Remember the base name now,
        # because load_test_fixture() overwrites args.TEST[0] on every call.
        test_base = args.TEST[0]

    # Entry point: the Chassis collection
    # "Members": [
    #     {
    #         "@odata.id": "/redfish/v1/Chassis/1U"
    #     }
    # ],
    result = _load_json(args, header, test_base, f'{API_BASE}/Chassis', 'chassises')
    members = result.get('Members') or []
    if not members:
        lib.base.cu('Nothing to check, no Redfish members found.')

    # init some vars
    active_states = ('Enabled', 'Quiesced')
    # (modern sensor schema key, legacy Thermal/Power key)
    legacy_thresholds = (
        ('Thresholds_UpperCritical', 'UpperThresholdCritical'),
        ('Thresholds_UpperCaution', 'UpperThresholdNonCritical'),
        ('Thresholds_LowerCritical', 'LowerThresholdCritical'),
        ('Thresholds_LowerCaution', 'LowerThresholdNonCritical'),
    )
    # (label in the message, key in the parsed chassis)
    chassis_facts = (
        ('Power', 'PowerState'),
        ('LED', 'IndicatorLED'),
        ('SKU', 'SKU'),
        ('SerNo', 'SerialNumber'),
        ('PartNumber', 'PartNumber'),
    )
    msg_parts = []
    perfdata_parts = []
    state = STATE_OK
    member_count = 0
    # only allocated and populated in --inventory mode, so a normal health
    # check holds nothing extra in memory
    inventory = [] if args.INVENTORY else None

    # analyze data: follow each "Member" link, aggregate chassis,
    # sensor and thermal health into `state`.
    for member in members:
        # "/redfish/v1/Chassis/1U"
        chassis = lib.redfish.get_chassis(
            _load_json(args, header, test_base, member['@odata.id'], 'chassis')
        )
        if chassis['Status_State'] not in active_states:
            continue
        member_count += 1
        chassis_state = lib.redfish.get_state(chassis)
        state = lib.base.get_worst(state, chassis_state)

        # build the message: "Member: <vendor> <model>, <fact>, <fact> [STATE]"
        headline = 'Member:'
        if chassis['Manufacturer']:
            headline += f' {chassis["Manufacturer"]}'
        if chassis['Model']:
            headline += f' {chassis["Model"]}'
        facts = [headline]
        facts += [
            f'{label}: {chassis[key]}' for label, key in chassis_facts if chassis[key]
        ]
        msg_parts.append(
            ', '.join(facts) + lib.base.state2str(chassis_state, prefix=' ')
        )

        # get the modern Sensors collection for the member, if advertised
        table_data = []
        # True as soon as the collection delivers an enabled sensor, even if
        # --match/--ignore hides it afterwards
        modern_sensors_found = False
        if chassis['Sensors_@odata.id']:
            sensors = _load_json(
                args, header, test_base, chassis['Sensors_@odata.id'], 'sensors'
            )
            for sensor_idx, sensor in enumerate(sensors.get('Members') or []):
                sensor_data = lib.redfish.get_chassis_sensors(
                    _load_json(
                        args,
                        header,
                        test_base,
                        sensor['@odata.id'],
                        f'sensor-{sensor_idx}',
                    )
                )
                if sensor_data['Status_State'] not in active_states:
                    continue
                modern_sensors_found = True
                # collect for --inventory before any display filter
                _add_to_inventory(inventory, sensor_data, chassis)
                # --match/--ignore: filter by item name
                if _is_filtered_out(
                    sensor_data['Name'] or sensor_data.get('Id', ''),
                    ignore_patterns,
                    match_patterns,
                ):
                    continue
                # is the reading within its thresholds, and is the sensor
                # healthy at all?
                item_perfdata, state = _rate_item(
                    sensor_data, state, check_thresholds=True
                )
                perfdata_parts.append(item_perfdata)
                table_data.append(sensor_data)

        # get thermal values for the member (legacy fallback source and fan
        # redundancy). Fetched defensively: a chassis without a Thermal
        # endpoint must not abort the check.
        thermal = _load_json(
            args,
            header,
            test_base,
            f'{member["@odata.id"]}/Thermal',
            'thermal',
            required=False,
        )

        # fall back to the legacy Thermal and Power endpoints when the modern
        # Sensors collection is absent or empty. Older BMCs (e.g. iLO4) only
        # expose Thermal (Temperatures, Fans) and Power (Voltages, power
        # supplies), so without this fallback they would report no readings.
        # The decision deliberately ignores --match/--ignore: a filter that
        # hides every modern sensor must not pull in the legacy endpoints
        # (extra requests, and duplicate entries in --inventory mode).
        if not modern_sensors_found:
            power = _load_json(
                args,
                header,
                test_base,
                f'{member["@odata.id"]}/Power',
                'power',
                required=False,
            )
            # (parsed item, key holding the reading, display unit)
            legacy = [
                (
                    lib.redfish.get_chassis_thermal_temperatures(t),
                    'ReadingCelsius',
                    'Cel',
                )
                for t in thermal.get('Temperatures') or []
            ]
            legacy += [
                (lib.redfish.get_chassis_thermal_fans(f), 'Reading', '')
                for f in thermal.get('Fans') or []
            ]
            legacy += [
                (lib.redfish.get_chassis_power_voltages(v), 'ReadingVolts', 'V')
                for v in power.get('Voltages') or []
            ]
            for item, reading_key, units in legacy:
                if item['Status_State'] not in active_states:
                    continue
                # collect for --inventory before any display filter
                _add_to_inventory(inventory, item, chassis)
                # --match/--ignore: filter by item name
                if _is_filtered_out(
                    item['Name'] or item.get('Id', ''),
                    ignore_patterns,
                    match_patterns,
                ):
                    continue
                # adapt the legacy Upper/LowerThreshold* field names to the
                # modern sensor schema so get_sensor_state()/get_perfdata()
                # can consume them unchanged
                item['Reading'] = item.get(reading_key, '')
                if units:
                    item['ReadingUnits'] = units
                for modern_key, legacy_key in legacy_thresholds:
                    item[modern_key] = item.get(legacy_key, '')
                item_perfdata, state = _rate_item(item, state, check_thresholds=True)
                perfdata_parts.append(item_perfdata)
                table_data.append(item)

            # power control reports the chassis-wide power consumption. Cisco
            # returns a single object where the schema expects a list, so
            # normalize it before iterating.
            power_control = power.get('PowerControl') or []
            if isinstance(power_control, dict):
                power_control = [power_control]
            # power supplies and power control are evaluated on health only
            # (no reading thresholds):
            # (raw items, parser, key providing the name, fallback name,
            #  value for the Location column, key holding the reading)
            health_only = (
                (
                    power.get('PowerSupplies') or [],
                    lib.redfish.get_chassis_power_powersupplies,
                    'Model',
                    'Power Supply',
                    'PowerSupply',
                    'LastPowerOutputWatts',
                ),
                (
                    power_control,
                    lib.redfish.get_chassis_power_powercontrol,
                    'Name',
                    'Power Control',
                    'PowerControl',
                    'PowerConsumedWatts',
                ),
            )
            for raw_items, parse, name_key, fallback_name, context, reading_key in health_only:
                for raw_item in raw_items:
                    item = parse(raw_item)
                    if item['Status_State'] not in active_states:
                        continue
                    item['Name'] = item.get(name_key) or fallback_name
                    # collect for --inventory before any display filter
                    _add_to_inventory(inventory, item, chassis)
                    # --match/--ignore: filter by item name (never empty here)
                    if _is_filtered_out(item['Name'], ignore_patterns, match_patterns):
                        continue
                    item['PhysicalContext'] = context
                    item['Reading'] = item.get(reading_key, '')
                    item['ReadingUnits'] = 'W'
                    item_perfdata, state = _rate_item(
                        item, state, check_thresholds=False
                    )
                    perfdata_parts.append(item_perfdata)
                    table_data.append(item)

        # --brief only shortens the output; the state was already computed
        if args.BRIEF:
            table_data = [
                r
                for r in table_data
                if not (r.get('State') == '[OK]' and r.get('Value') == '[OK]')
            ]
        if table_data:
            keys = [
                'Name',
                'PhysicalContext',
                'Reading',
                'ReadingUnits',
                'Value',
                'State',
            ]
            headers = ['Sensor', 'Location', 'Reading', 'Unit', 'Value', 'State']
            msg_parts.append(
                '\n\n' + lib.base.get_table(table_data, keys, header=headers)
            )

        # redundancy
        table_data = []
        for redundancy in thermal.get('Redundancy') or []:
            redundancy = lib.redfish.get_chassis_thermal_redundancy(redundancy)
            if redundancy['Status_State'] not in active_states:
                continue
            perfdata_parts.append(lib.redfish.get_perfdata(redundancy))
            # is the redundancy state healthy at all?
            redundancy_state = lib.redfish.get_state(redundancy)
            state = lib.base.get_worst(state, redundancy_state)
            redundancy['State'] = lib.base.state2str(redundancy_state, empty_ok=False)
            table_data.append(redundancy)
        if args.BRIEF:
            table_data = [r for r in table_data if r.get('State') != '[OK]']
        if table_data:
            keys = ['Name', 'Mode', 'State']
            headers = ['Redundancy', 'Mode', 'State']
            msg_parts.append(
                '\n\n' + lib.base.get_table(table_data, keys, header=headers)
            )

        msg_parts.append('\n\n')

    # --inventory: emit the collected components as JSON and exit before
    # building the human-readable message and perfdata
    if args.INVENTORY:
        print(
            json.dumps(
                {'sensor': inventory}, ensure_ascii=False, indent=4, sort_keys=True
            )
        )
        sys.exit(STATE_OK)

    # build the message
    members_label = lib.txt.pluralize('member', member_count)
    if state == STATE_CRIT:
        summary = (
            f'Checked sensors on {member_count} {members_label}.'
            f' There are critical errors.\n\n'
        )
    elif state == STATE_WARN:
        summary = (
            f'Checked sensors on {member_count} {members_label}. There are warnings.\n\n'
        )
    else:
        summary = (
            f'Everything is ok, checked sensors on {member_count} {members_label}.\n\n'
        )

    # over and out
    lib.base.oao(
        summary + ''.join(msg_parts),
        state,
        ''.join(perfdata_parts),
        always_ok=args.ALWAYS_OK,
    )


# Script entry point: run the check only when executed directly, not on import.
if __name__ == '__main__':
    try:
        main()
    except Exception:
        # Last line of defence: hand any unexpected error to lib.base.cu()
        # instead of letting the interpreter print a raw traceback.
        # SystemExit is not an `Exception` subclass, so the regular
        # `sys.exit()` calls inside main() pass through untouched.
        # NOTE(review): cu() is presumably the "exit UNKNOWN" helper that
        # reports the active exception when called without a message -
        # confirm against lib.base.
        lib.base.cu()
