From: Ismaël Bouya Date: Sun, 1 Dec 2019 17:25:16 +0000 (+0100) Subject: Add monitoring for backup-2 X-Git-Url: https://git.immae.eu/?p=perso%2FImmae%2FConfig%2FNix.git;a=commitdiff_plain;h=9f2025235d888eb4a7822024a5fad2e288388814 Add monitoring for backup-2 --- diff --git a/modules/private/monitoring/conf/specific_backup-2.cfg b/modules/private/monitoring/conf/specific_backup-2.cfg new file mode 100644 index 0000000..ff91322 --- /dev/null +++ b/modules/private/monitoring/conf/specific_backup-2.cfg @@ -0,0 +1,36 @@ +# vim: filetype=nagios + +define service { + service_description Size on /backup2 partition + check_command check_local_disk!10%!5%!/backup2 + use local-service +} + +define command { + command_line /run/wrappers/bin/sudo -u "$ARG3$" $USER2$/check_last_file_date "$ARG1$" "$ARG2$" + command_name check_last_file_date +} + +define service { + service_description Last backup in /backup2/phare is not too old + check_command check_last_file_date!/backup2/phare!14!backup + use local-service +} + +define service { + service_description Last backup in /backup2/immae_eu is not too old + check_command check_last_file_date!/backup2/immae_eu!14!backup + use local-service +} + +define service { + service_description Last backup in /backup2/immae_fr is not too old + check_command check_last_file_date!/backup2/immae_fr!14!backup + use local-service +} + +define service { + service_description Last postgresql dump in /backup2/eldiron/postgresql_backup is not too old + check_command check_last_file_date!/backup2/eldiron/postgresql_backup!7!postgres + use local-service +} diff --git a/modules/private/monitoring/conf/specific_eldiron.cfg b/modules/private/monitoring/conf/specific_eldiron.cfg new file mode 100644 index 0000000..fd5a43d --- /dev/null +++ b/modules/private/monitoring/conf/specific_eldiron.cfg @@ -0,0 +1,29 @@ +# vim: filetype=nagios +# +define command { + command_line /run/wrappers/bin/sudo -u postgres $USER2$/check_postgres_replication "$ARG1$" "$ARG2$" 
"$ARG3$" + command_name check_postgresql_replication +} + +define service { + service_description Postgresql replication for backup-1 is up to date + check_command check_postgresql_replication!backup-1!/run/postgresql!5432 + use local-service +} + +define service { + service_description Postgresql replication for backup-2 is up to date + check_command check_postgresql_replication!backup-2!/run/postgresql!5432 + use local-service +} + +define service { + service_description mailq is empty + use local-service + check_command check_mailq +} + +define command { + command_name check_mailq + command_line $USER1$/check_mailq -s -w 1 -c 2 +} diff --git a/modules/private/monitoring/default.nix b/modules/private/monitoring/default.nix index c5acd40..6062aba 100644 --- a/modules/private/monitoring/default.nix +++ b/modules/private/monitoring/default.nix @@ -1,4 +1,4 @@ -{ config, myconfig, pkgs, lib, ... }: +{ config, myconfig, pkgs, lib, name, hostFQDN, ... }: let myplugins = pkgs.runCommand "buildplugins" { buildInputs = [ pkgs.makeWrapper pkgs.perl ]; @@ -13,16 +13,57 @@ let wrapProgram $out/check_mem.sh --prefix PATH : ${lib.makeBinPath [ pkgs.gnugrep pkgs.gawk pkgs.procps-ng ]} + wrapProgram $out/check_postgres_replication --prefix PATH : ${lib.makeBinPath [ + pkgs.postgresql + ]} ''; + defaultObjects = + let specific_file = ./conf + "/specific_" + name + ".cfg"; + in + builtins.readFile ./conf/local_services.cfg + + builtins.readFile ./conf/timeperiods.cfg + + builtins.readFile ./conf/services.cfg + + builtins.readFile ./conf/contacts.cfg + + builtins.readFile ./conf/hosts.cfg + + '' + define command { + command_line ${myplugins}/send_nrdp.sh -u "$USER200$" -t "$USER201$" -H "$HOSTADDRESS$" -s "$SERVICEDESC$" -S "$SERVICESTATEID$" -o "$SERVICEOUTPUT$" + command_name notify-master + } + define service { + service_description No mdadm array is degraded + use local-service + check_command check_command_output!${pkgs.mdadm}/bin/mdadm --monitor --scan -1!^$!-s 0 -r root + } 
+ + define service { + name local-service + use generic-service + host_name ${hostFQDN} + check_interval 5 + max_check_attempts 4 + register 0 + retry_interval 1 + } + define host { + host_name ${hostFQDN} + alias ${hostFQDN} + address ${hostFQDN} + use linux-server + } + '' + + lib.strings.optionalString (builtins.pathExists specific_file) (builtins.readFile specific_file); in { options = { - myServices.monitoring.enable = lib.mkOption { - type = lib.types.bool; - default = false; - description = '' - Whether to enable monitoring. - ''; + myServices.monitoring = { + enable = lib.mkOption { + type = lib.types.bool; + default = false; + description = '' + Whether to enable monitoring. + ''; + }; }; }; @@ -39,6 +80,21 @@ in users = [ "naemon" ]; runAs = "root"; } + { + commands = [ + { command = "${myplugins}/check_postgres_replication *"; options = [ "NOPASSWD" ]; } + { command = "${myplugins}/check_last_file_date /backup2/*"; options = [ "NOPASSWD" ]; } + ]; + users = [ "naemon" ]; + runAs = "postgres"; + } + { + commands = [ + { command = "${myplugins}/check_last_file_date /backup2/*"; options = [ "NOPASSWD" ]; } + ]; + users = [ "naemon" ]; + runAs = "backup"; + } ]; environment.etc."mdadm.conf" = { enable = true; @@ -66,49 +122,7 @@ in $USER200$=${myconfig.env.monitoring.status_url} $USER201$=${myconfig.env.monitoring.status_token} ''; - objectDefs = builtins.readFile ./conf/local_services.cfg - + builtins.readFile ./conf/timeperiods.cfg - + builtins.readFile ./conf/services.cfg - + builtins.readFile ./conf/contacts.cfg - + builtins.readFile ./conf/hosts.cfg - + '' - define command { - command_line ${myplugins}/send_nrdp.sh -u "$USER200$" -t "$USER201$" -H "$HOSTADDRESS$" -s "$SERVICEDESC$" -S "$SERVICESTATEID$" -o "$SERVICEOUTPUT$" - command_name notify-master - } - define service { - service_description No mdadm array is degraded - use local-service - check_command check_command_output!${pkgs.mdadm}/bin/mdadm --monitor --scan -1!^$!-s 0 -r root - } - - define 
service { - service_description mailq is empty - use local-service - check_command check_mailq - } - - define command { - command_name check_mailq - command_line $USER1$/check_mailq -s -w 1 -c 2 - } - - define service { - name local-service - use generic-service - host_name eldiron.immae.eu - check_interval 5 - max_check_attempts 4 - register 0 - retry_interval 1 - } - define host { - host_name eldiron.immae.eu - alias eldiron.immae.eu - address eldiron.immae.eu - use linux-server - } - ''; + objectDefs = defaultObjects; }; }; } diff --git a/modules/private/monitoring/plugins/check_last_file_date b/modules/private/monitoring/plugins/check_last_file_date new file mode 100755 index 0000000..df45bbc --- /dev/null +++ b/modules/private/monitoring/plugins/check_last_file_date @@ -0,0 +1,26 @@ +#!/bin/bash + +STATE_OK=0 +STATE_WARNING=1 +STATE_CRITICAL=2 +STATE_UNKNOWN=3 + +base_path=$1 +hours=$2 + +last_date=$(find $base_path -mindepth 1 -maxdepth 1 -printf "%T@\n" 2>/dev/null | sort | tail -n 1) + +if [ -z "$last_date" ]; then + echo "UNKNOWN: Could not read folder" + exit $STATE_UNKNOWN +else + LC_ALL=C last_date=$(printf "%.*f" 0 $last_date) + min_date=$(date -d "$hours hours ago" "+%s") + if [ "$min_date" -lt "$last_date" ]; then + echo "OK: Last file $(date -d @$last_date)" + exit $STATE_OK + else + echo "CRITICAL: Last file $(date -d @$last_date)" + exit $STATE_CRITICAL + fi +fi diff --git a/modules/private/monitoring/plugins/check_postgres_replication b/modules/private/monitoring/plugins/check_postgres_replication new file mode 100755 index 0000000..009b4d5 --- /dev/null +++ b/modules/private/monitoring/plugins/check_postgres_replication @@ -0,0 +1,35 @@ +#!/bin/bash + +STATE_OK=0 +STATE_WARNING=1 +STATE_CRITICAL=2 +STATE_UNKNOWN=3 + +user=$1 +host=$2 +port=$3 + +lag=$(psql -h $host -p $port -A -t -c "SELECT COALESCE(EXTRACT(EPOCH FROM replay_lag),0) FROM pg_stat_replication WHERE usename='$user'" 2>/dev/null) +exit_code=$? 
+ +if [[ $exit_code -ne 0 ]]; then + echo "UNKNOWN - Impossible to run psql command" + exit $STATE_UNKNOWN +elif [[ -z "$lag" ]]; then + echo "UNKNOWN - No replication found for $user" + exit $STATE_UNKNOWN +else + output="Replication lag for $user is ${lag}s" + LC_ALL=C lag=$(printf "%.*f" 0 $lag) + + if [[ $lag -lt 5 ]]; then + echo "OK - $output" + exit $STATE_OK + elif [[ $lag -lt 10 ]]; then + echo "WARNING - $output" + exit $STATE_WARNING + else + echo "CRITICAL - $output" + exit $STATE_CRITICAL + fi +fi diff --git a/modules/private/system/backup-2.nix b/modules/private/system/backup-2.nix index 1c5b7d8..3120a57 100644 --- a/modules/private/system/backup-2.nix +++ b/modules/private/system/backup-2.nix @@ -3,6 +3,7 @@ { boot.kernelPackages = pkgs.linuxPackages_latest; _module.args.privateFiles = privateFiles; + _module.args.hostFQDN = "backup-2.v.immae.eu"; imports = builtins.attrValues (import ../..); deployment = { @@ -48,6 +49,7 @@ ssh_key_private = myconfig.env.rsync_backup.ssh_key.private; }; + myServices.monitoring.enable = true; myServices.databasesReplication = { postgresql = { enable = true; diff --git a/modules/private/system/eldiron.nix b/modules/private/system/eldiron.nix index 5384bc2..d79cf16 100644 --- a/modules/private/system/eldiron.nix +++ b/modules/private/system/eldiron.nix @@ -3,6 +3,7 @@ { boot.kernelPackages = pkgs.linuxPackages_latest; _module.args.privateFiles = privateFiles; + _module.args.hostFQDN = "eldiron.immae.eu"; networking = { firewall.enable = true;