From 258441019881c451686dbe537069228cc8e49612 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Isma=C3=ABl=20Bouya?=
Date: Sat, 11 Jan 2020 15:35:24 +0100
Subject: [PATCH] Add some monitoring services (eriomem)

---
 modules/private/environment.nix               |   1 +
 modules/private/monitoring/default.nix        |   6 +-
 modules/private/monitoring/objects_common.nix |   1 +
 .../private/monitoring/objects_immae-eu.nix   |  23 ----
 .../monitoring/objects_monitoring-1.nix       |  13 ++
 .../private/monitoring/plugins/check_eriomem  | 116 ++++++++++++++++++
 6 files changed, 136 insertions(+), 24 deletions(-)
 create mode 100755 modules/private/monitoring/plugins/check_eriomem

diff --git a/modules/private/environment.nix b/modules/private/environment.nix
index 50f153a..81b5df5 100644
--- a/modules/private/environment.nix
+++ b/modules/private/environment.nix
@@ -481,6 +481,7 @@ in
           ssh_secret_key = mkOption { type = str; description = "SSH secret key"; };
           imap_login = mkOption { type = str; description = "IMAP login"; };
           imap_password = mkOption { type = str; description = "IMAP password"; };
+          eriomem_keys = mkOption { type = listOf (listOf str); description = "Eriomem keys"; default = []; };
           nrdp_tokens = mkOption { type = listOf str; description = "Tokens allowed to push status update"; };
           slack_url = mkOption { type = str; description = "Slack webhook url to push status update"; };
           slack_channel = mkOption { type = str; description = "Slack channel to push status update"; };
diff --git a/modules/private/monitoring/default.nix b/modules/private/monitoring/default.nix
index 2c2f693..e1357a7 100644
--- a/modules/private/monitoring/default.nix
+++ b/modules/private/monitoring/default.nix
@@ -38,6 +38,9 @@ let
       wrapProgram $out/check_imap_connection --prefix PATH : ${lib.makeBinPath [
         pkgs.openssl
       ]}
+      wrapProgram $out/check_eriomem --prefix PATH : ${lib.makeBinPath [
+        pkgs.s3cmd pkgs.python3
+      ]}
       wrapProgram $out/notify_maison_bbc_by_email --prefix PATH : ${lib.makeBinPath [
         pkgs.mailutils pkgs.gawk
       ]}
@@ -55,7 +58,7 @@ let
       loadWarn = "8.0"; loadAlert = "10.0";
     };
     backup-2 = {
-      processWarn = "50"; processAlert = "60";
+      processWarn = "60"; processAlert = "70";
       loadWarn = "1.0"; loadAlert = "2.0";
     };
     monitoring-1 = {
@@ -220,6 +223,7 @@ in
         $USER205$=${config.myEnv.monitoring.imap_password}
         $USER206$=${config.myEnv.monitoring.slack_channel}
         $USER207$=${config.myEnv.monitoring.slack_url}
+        $USER208$=${builtins.concatStringsSep "," (map (builtins.concatStringsSep ":") config.myEnv.monitoring.eriomem_keys)}
       '';
       objectDefs = toObjects commonObjects
         + toObjects hostObjects
diff --git a/modules/private/monitoring/objects_common.nix b/modules/private/monitoring/objects_common.nix
index 7467306..15eee97 100644
--- a/modules/private/monitoring/objects_common.nix
+++ b/modules/private/monitoring/objects_common.nix
@@ -91,6 +91,7 @@ in
   ];
   command = {
     check_dns = "$USER1$/check_dns -H $ARG1$ -s $HOSTADDRESS$ $ARG2$";
+    check_eriomem = "$USER2$/check_eriomem $USER208$";
     check_external_dns = "$USER1$/check_dns -H $ARG2$ -s $ARG1$ $ARG3$";
     check_ftp_database = "$USER2$/check_ftp_database";
     check_git = "$USER2$/check_git $USER203$";
diff --git a/modules/private/monitoring/objects_immae-eu.nix b/modules/private/monitoring/objects_immae-eu.nix
index a1d1adb..a6337e9 100644
--- a/modules/private/monitoring/objects_immae-eu.nix
+++ b/modules/private/monitoring/objects_immae-eu.nix
@@ -64,18 +64,6 @@ in
       notification_interval = "0";
     }
 
-    {
-      service_description = "rsync backup happened not too long ago";
-      servicegroups = "webstatus-backup";
-    }
-
-    {
-      service_description = "eriomem backup is up and not full";
-      freshness_threshold = "10800";
-      notification_interval = "120";
-      servicegroups = "webstatus-backup";
-    }
-
     {
       service_description = "postfix service is active";
     }
@@ -88,11 +76,6 @@ in
       service_description = "sshd service is active";
     }
 
-    {
-      service_description = "httpd service is active";
-      servicegroups = "webstatus-resources";
-    }
-
     {
       service_description = "postfix SSL is up to date";
     }
@@ -108,11 +91,5 @@ in
       servicegroups = "webstatus-email";
       freshness_threshold = "1350";
     }
-
-    #### Web scenarios
-    {
-      service_description = "Default website site is running on ns208507.ip-188-165-209.eu";
-      freshness_threshold = "1800";
-    }
   ];
 }
diff --git a/modules/private/monitoring/objects_monitoring-1.nix b/modules/private/monitoring/objects_monitoring-1.nix
index a46b684..f69d3ff 100644
--- a/modules/private/monitoring/objects_monitoring-1.nix
+++ b/modules/private/monitoring/objects_monitoring-1.nix
@@ -77,6 +77,19 @@
       _webstatus_url = "imap.immae.eu";
     }
 
+    # Backup services
+    {
+      service_description = "Eriomem backup is up and not full";
+      host_name = "eldiron.immae.eu";
+      use = "external-service";
+      check_command = "check_eriomem";
+
+      check_interval = "120";
+      notification_interval = "120";
+
+      servicegroups = "webstatus-backup";
+    }
+
     # DNS services
     {
       service_description = "eldiron dns is active and authoritative for aten.pro";
diff --git a/modules/private/monitoring/plugins/check_eriomem b/modules/private/monitoring/plugins/check_eriomem
new file mode 100755
index 0000000..65ca790
--- /dev/null
+++ b/modules/private/monitoring/plugins/check_eriomem
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+"""Nagios plugin reporting the combined usage of the eriomem S3 accounts.
+
+Usage: check_eriomem ACCESS_KEY:SECRET_KEY[,ACCESS_KEY:SECRET_KEY...]
+"""
+import os
+import sys
+import getopt
+import signal
+from subprocess import Popen, PIPE
+
+STATE_OK = 0
+STATE_WARNING = 1
+STATE_CRITICAL = 2
+STATE_UNKNOWN = 3
+
+max_size = 1024*1024*1024*1024
+warning_percent = 99.75
+critical_percent = 99.95
+
+class Timeout(Exception):
+    """Raised by the SIGALRM handler when the s3cmd calls take too long."""
+
+def to_args(k):
+    """Return the `s3cmd du` command line for one "access:secret" pair."""
+    access, secret = k.split(":", 1)
+    return [
+        "s3cmd",
+        '-c=/dev/null',
+        '--no-check-certificate',
+        '--access_key={}'.format(access),
+        '--secret_key={}'.format(secret),
+        '--host=e.eriomem.net',
+        '--host-bucket=%(bucket)s.e.eriomem.net',
+        'du'
+    ]
+
+def parse_usage(stdout):
+    """Return the first field of the last `s3cmd du` line, or None."""
+    try:
+        return int(stdout.decode().split("\n")[-2].split()[0])
+    except (IndexError, ValueError):
+        return None
+
+def output(code, msg):
+    """Print the status message and exit with the Nagios state `code`."""
+    print(msg)
+    sys.exit(code)
+
+def main():
+    """Run `s3cmd du` for every key and compare the total to the thresholds."""
+    try:
+        commands = [to_args(k) for k in sys.argv[1].split(",")]
+    except (IndexError, ValueError):
+        # An uncaught traceback would exit 1, which Nagios reads as WARNING.
+        output(STATE_UNKNOWN,
+                "eriomem UNKNOWN - Usage: check_eriomem ACCESS:SECRET[,...]")
+
+    def handler(signum, frame):
+        raise Timeout
+    signal.signal(signal.SIGALRM, handler)
+    signal.alarm(60)
+
+    ps = []
+    try:
+        for command in commands:
+            ps.append(Popen(command, stdout=PIPE, stderr=PIPE))
+        outs = [p.communicate() for p in ps]
+        rets = [p.wait() for p in ps]
+    except (Timeout, OSError) as e:
+        signal.alarm(0)
+        for p in ps:
+            # A child that was already reaped must not be signalled: its pid
+            # may have been reused.
+            if p.poll() is None:
+                os.kill(p.pid, signal.SIGTERM)
+        if isinstance(e, Timeout):
+            output(STATE_UNKNOWN,
+                    "eriomem UNKNOWN - Command timeout after 60 seconds!")
+        # Popen itself failed, e.g. s3cmd is missing from PATH.
+        output(STATE_UNKNOWN, "eriomem Unknown - Error in command\n%s" % e)
+
+    signal.alarm(0)
+
+    usages = [parse_usage(out[0]) for out in outs]
+    if any(ret != 0 for ret in rets) or None in usages:
+        # Covers s3cmd failures as well as unparsable output.
+        messages = "\n".join([out[0].decode() + out[1].decode() for out in outs])
+        output(STATE_UNKNOWN,
+                "eriomem Unknown - Error in command\n" + messages)
+
+    usage = sum(usages)
+    use_percent = 100 * usage / max_size
+    if use_percent > critical_percent:
+        output(STATE_CRITICAL,
+                "eriomem CRITICAL - bucket usage: %s (%s%%);| size=%s;;;;" %
+                (sizeof_fmt(usage), use_percent, usage))
+    elif use_percent > warning_percent:
+        output(STATE_WARNING,
+                "eriomem WARNING - bucket usage: %s (%s%%);| size=%s;;;;" %
+                (sizeof_fmt(usage), use_percent, usage))
+    else:
+        output(STATE_OK,
+                "eriomem OK - bucket usage: %s (%d%%);| size=%s;;;;" %
+                (sizeof_fmt(usage), use_percent, usage))
+
+def sizeof_fmt(num):
+    """Format a byte count using 1024-based units (ko, Mo, Go, ...)."""
+    for unit in ['','ko','Mo','Go','To','Po','Eo','Zo']:
+        if abs(num) < 1024.0:
+            return "%3.1f%s" % (num, unit)
+        num /= 1024.0
+    return "%.1f%s" % (num, 'Yo')
+
+if __name__ == '__main__':
+    main()
-- 
2.41.0