From a97118c489a59d723538292214efaa10dfcb96df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Bouya?=
Date: Tue, 16 Jun 2020 15:23:20 +0200
Subject: Add status engine website

---
Reviewer notes. Free-form text placed between the "---" marker above and the
first "diff --git" line below is commentary, not part of the commit message.

What the hunks below do:
  * default.nix: on master nodes (cfg.master) naemon loads the livestatus
    broker module and a Statusengine broker module; livestatus used to be
    loaded unconditionally.
  * objects_common.nix: notify-master now forwards
    "$SERVICEOUTPUT$ | $SERVICEPERFDATA$" instead of the output alone.
  * plugins/*: every check appends a performance-data section
    ("| label=value;warn;crit;min;max") to its status line.
  * status_engine.nix (new file): a gearmand unit, the worker configuration
    written as a secret owned by naemon, redis bound to 127.0.0.1, and a
    StatusengineWorker.php unit.

Changes made during review. Only the content of '+' lines was edited, so the
hunk line counts and the diffstat are still valid. The "index" blob ids are
those of the original commit and no longer describe the post-image.
  1. check_command: the "no sudo right" branch exits with $STATE_UNKNOWN, so
     its label now reads UNKNOWN instead of CRITICAL.
  2. check_eriomem: the perfdata field "size=" carries the raw numeric usage
     again (as it did before this patch) instead of sizeof_fmt(usage), whose
     result is a display string (presumably with a ko/Mo/Go... suffix that
     changes with magnitude - confirm against sizeof_fmt). The percentage is
     printed with %d in all three branches, matching the OK branch.
  3. status_engine.nix: the mysql port goes through toString so an integer
     port interpolates; the unused "rec" on services.redis is dropped; the
     worker is ordered after gearmand.service, which it connects to
     (use_gearman: 1, 127.0.0.1:4730).

Open questions for the author (context lines, left untouched):
  * check_openldap_replication rounds $lag but compares and reports $offset.
  * check_eriomem still builds "messages" in the error branch although the
    new UNKNOWN line no longer prints it.
  * The subject says "website"; the hunks in this view (limited to
    modules/private/monitoring) only add the broker module and the worker.

Layout caveat: the reviewed copy had all whitespace collapsed. Indentation and
the position of whitespace-only context lines were reconstructed so that every
hunk matches its "@@" line counts. Check them against the tree before applying.

 modules/private/monitoring/default.nix             |   4 +-
 modules/private/monitoring/objects_common.nix      |   2 +-
 modules/private/monitoring/plugins/check_command   |  14 +--
 modules/private/monitoring/plugins/check_emails    |   6 +-
 modules/private/monitoring/plugins/check_eriomem   |  16 +--
 .../private/monitoring/plugins/check_ftp_database  |   4 +-
 modules/private/monitoring/plugins/check_git       |  17 +++-
 .../monitoring/plugins/check_imap_connection       |   4 +-
 .../monitoring/plugins/check_last_file_date        |   6 +-
 .../monitoring/plugins/check_mysql_replication     |   6 +-
 .../monitoring/plugins/check_openldap_replication  |   6 +-
 modules/private/monitoring/plugins/check_ovh_sms   |   4 +-
 .../monitoring/plugins/check_postgres_replication  |   6 +-
 .../monitoring/plugins/check_redis_replication     |   6 +-
 modules/private/monitoring/status_engine.nix       | 109 +++++++++++++++++++++
 15 files changed, 169 insertions(+), 41 deletions(-)
 create mode 100644 modules/private/monitoring/status_engine.nix

(limited to 'modules/private/monitoring')

diff --git a/modules/private/monitoring/default.nix b/modules/private/monitoring/default.nix
index 349ba8a..8ae0b30 100644
--- a/modules/private/monitoring/default.nix
+++ b/modules/private/monitoring/default.nix
@@ -241,7 +241,6 @@ in
     services.naemon = {
       enable = true;
       extraConfig = ''
-        broker_module=${pkgs.naemon-livestatus}/lib/naemon-livestatus/livestatus.so ${config.services.naemon.runDir}/live
         use_syslog=1
         log_initial_states=1
         date_format=iso8601
@@ -249,6 +248,9 @@ in
       '' + lib.optionalString (!cfg.master) ''
         obsess_over_services=1
         ocsp_command=notify-master
+      '' + lib.optionalString (cfg.master) ''
+        broker_module=${pkgs.naemon-livestatus}/lib/naemon-livestatus/livestatus.so ${config.services.naemon.runDir}/live
+        broker_module=${pkgs.status_engine.module}/lib/status-engine/naemon/statusengine-${pkgs.naemon.status_engine_version}.o use_service_perfdata=1 use_process_data=0 use_system_command_data=0 use_external_command_data=0 use_flapping_data=0 use_program_status_data=0 use_notification_data=0 use_contact_status_data=0 use_contact_notification_data=0 use_event_handler_data=0 use_object_data=0
       '';
       extraResource = ''
         $USER2$=${myplugins}
diff --git a/modules/private/monitoring/objects_common.nix b/modules/private/monitoring/objects_common.nix
index e9b1b51..2585c38 100644
--- a/modules/private/monitoring/objects_common.nix
+++ b/modules/private/monitoring/objects_common.nix
@@ -133,7 +133,7 @@ in
 
     notify-service-by-email = "ADMINEMAIL=\"$ADMINEMAIL$\" SERVICENOTIFICATIONID=\"$SERVICENOTIFICATIONID$\" SERVICEDESC=\"$SERVICEDESC$\" SERVICESTATE=\"$SERVICESTATE$\" SERVICEOUTPUT=\"$SERVICEOUTPUT$\" $USER2$/notify_by_email service \"$NOTIFICATIONTYPE$\" \"$HOSTALIAS$\" \"$LONGDATETIME$\" \"$CONTACTEMAIL$\" $OVE";
     notify-by-slack = "HOST=\"$HOSTALIAS$\" SERVICESTATE=\"$SERVICESTATE$\" SERVICEDESC=\"$SERVICEDESC$\" SERVICEOUTPUT=\"$SERVICEOUTPUT$\" $USER2$/notify_by_slack \"$ARG1$\" \"$ARG2$\"";
-    notify-master = "$USER2$/send_nrdp.sh -u \"$USER200$\" -t \"$USER201$\" -H \"$HOSTADDRESS$\" -s \"$SERVICEDESC$\" -S \"$SERVICESTATEID$\" -o \"$SERVICEOUTPUT$\"";
+    notify-master = "$USER2$/send_nrdp.sh -u \"$USER200$\" -t \"$USER201$\" -H \"$HOSTADDRESS$\" -s \"$SERVICEDESC$\" -S \"$SERVICESTATEID$\" -o \"$SERVICEOUTPUT$ | $SERVICEPERFDATA$\"";
   };
   timeperiod = {
     "24x7" = {
diff --git a/modules/private/monitoring/plugins/check_command b/modules/private/monitoring/plugins/check_command
index 55779fd..2b546c1 100755
--- a/modules/private/monitoring/plugins/check_command
+++ b/modules/private/monitoring/plugins/check_command
@@ -57,30 +57,30 @@ if ($other_command ne '') {
 
 chomp($cmd_result);
 if ($cmd_result =~ /sudo/i) {
-  print "$command CRITICAL - No sudo right to run the command\n";
+  print "$command UNKNOWN - No sudo right to run the command | result=1;;;;\n";
   exit($STATE_UNKNOWN);
 } elsif ($expected_status ne '') {
   if ($? != $expected_status) {
-    print "$command CRITICAL - Response status $?\n";
+    print "$command CRITICAL - Response status $? | result=1;;;;\n";
     exit($STATE_CRITICAL);
   } else {
-    print "$command OK - Response status $?\n";
+    print "$command OK - Response status $? | result=0;;;;\n";
     exit($STATE_OK);
   }
 } elsif ($other_command ne '') {
   if ($cmd_result ne $other_cmd_result) {
-    print "$command CRITICAL - Expected output not matching other command output\n";
+    print "$command CRITICAL - Expected output not matching other command output | result=1;;;;\n";
     exit($STATE_CRITICAL);
   } else {
-    print "$command OK - Expected output matching other command output\n";
+    print "$command OK - Expected output matching other command output | result=0;;;;\n";
     exit($STATE_OK);
   }
 } else {
   if ($cmd_result !~ /$expected_output/) {
-    print "$command CRITICAL - Expected output not matching\n";
+    print "$command CRITICAL - Expected output not matching | result=1;;;;\n";
     exit($STATE_CRITICAL);
   } else {
-    print "$command OK - Expected output matching\n";
+    print "$command OK - Expected output matching | result=0;;;;\n";
     exit($STATE_OK);
   }
 }
diff --git a/modules/private/monitoring/plugins/check_emails b/modules/private/monitoring/plugins/check_emails
index 5a8453e..534e5a5 100755
--- a/modules/private/monitoring/plugins/check_emails
+++ b/modules/private/monitoring/plugins/check_emails
@@ -61,6 +61,7 @@ foreach my $line (@lines) {
 }
 
 my $output = "";
+my $old = 0;
 foreach my $email_from (@emails_to_expect) {
   my @email_split = split(/:/, $email_from);
   my $email = $email_split[0];
@@ -73,16 +74,17 @@ foreach my $email_from (@emails_to_expect) {
     if ($current_date - $email_date > 60*30) {
       $output = "$output$email ($found_emails{$email} from $from) ";
     }
+    $old = ($current_date - $email_date) > $old ? ($current_date - $email_date) : $old;
   } else {
     $output = "$output$email (missing) "
   }
 }
 
 if ($output ne '') {
-  print "Emails $host CRITICAL - expecting emails: $output\n";
+  print "Emails $host CRITICAL - expecting emails: $output | timestamp=${old}s;;;;\n";
   exit($STATE_CRITICAL);
 } else {
-  print "Emails $host OK\n";
+  print "Emails $host OK | timestamp=${old}s;;;;\n";
   exit($STATE_OK);
 }
 
diff --git a/modules/private/monitoring/plugins/check_eriomem b/modules/private/monitoring/plugins/check_eriomem
index f91d1e4..880b88a 100755
--- a/modules/private/monitoring/plugins/check_eriomem
+++ b/modules/private/monitoring/plugins/check_eriomem
@@ -47,7 +47,7 @@ def main():
         for p in ps:
             os.kill(p.pid, signal.SIGTERM)
         output(STATE_UNKNOWN,
-                "eriomem UNKNOWN - Command timeout after 60 seconds!")
+                "Eriomem UNKNOWN - Command timeout after 60 seconds!")
 
     signal.alarm(0)
 
@@ -57,20 +57,20 @@ def main():
         use_percent = 100 * usage / max_size
         if use_percent > critical_percent:
             output(STATE_CRITICAL,
-                    "eriomem CRITICAL - bucket usage: %s (%s%%);| size=%s;;;;" %
-                    (sizeof_fmt(usage), use_percent, usage))
+                    "Eriomem CRITICAL - bucket usage: %s (%d%%);| size=%d;;;;"
+                    % (sizeof_fmt(usage), use_percent, usage))
         elif use_percent > warning_percent:
             output(STATE_WARNING,
-                    "eriomem WARNING - bucket usage: %s (%s%%);| size=%s;;;;" %
-                    (sizeof_fmt(usage), use_percent, usage))
+                    "Eriomem WARNING - bucket usage: %s (%d%%);| size=%d;;;;"
+                    % (sizeof_fmt(usage), use_percent, usage))
         else:
             output(STATE_OK,
-                    "eriomem OK - bucket usage: %s (%d%%);| size=%s;;;;" %
-                    (sizeof_fmt(usage), use_percent, usage))
+                    "Eriomem OK - bucket usage: %s (%d%%);| size=%d;;;;"
+                    % (sizeof_fmt(usage), use_percent, usage))
     else:
         messages = "\n".join([out[0].decode() + out[1].decode() for out in outs])
         output(STATE_UNKNOWN,
-                "eriomem Unknown - Error in command\n" + messages)
+                "Eriomem UNKNOWN - Error in command")
 
 def sizeof_fmt(num):
     for unit in ['','ko','Mo','Go','To','Po','Eo','Zo']:
diff --git a/modules/private/monitoring/plugins/check_ftp_database b/modules/private/monitoring/plugins/check_ftp_database
index 9a41424..f9cf579 100755
--- a/modules/private/monitoring/plugins/check_ftp_database
+++ b/modules/private/monitoring/plugins/check_ftp_database
@@ -3,9 +3,9 @@
 OUT=$(echo "ls" | lftp -u test_ftp,test_ftp eldiron.immae.eu | grep it_works | wc -l)
 
 if [ "$OUT" -eq 1 ]; then
-  echo "ftp connection OK - access to ftp is working"
+  echo "ftp connection OK - access to ftp is working | ftp=1;;;;"
   exit 0
 else
-  echo "ftp connection CRITICAL - no access to ftp"
+  echo "ftp connection CRITICAL - no access to ftp | ftp=0;;;;"
   exit 2
 fi
diff --git a/modules/private/monitoring/plugins/check_git b/modules/private/monitoring/plugins/check_git
index 8c09925..e8fbb29 100755
--- a/modules/private/monitoring/plugins/check_git
+++ b/modules/private/monitoring/plugins/check_git
@@ -13,6 +13,7 @@ trap "rm -rf $TMPDIR" EXIT
 
 ERRORS=""
 OUTPUT=""
+PERFS=""
 
 cd "$TMPDIR"
 OUT=$(git clone -q git://git.immae.eu/perso/Immae/Projets/Ruby/Monitor.git 2>&1)
@@ -22,7 +23,10 @@ OUTPUT="$OUTPUT
 $OUT"
 fi
 if [ "$ERR" != 0 ]; then
+  PERFS="$PERFS git=0;;;;"
   ERRORS="$ERRORS git://"
+else
+  PERFS="$PERFS git=1;;;;"
 fi
 rm -rf Monitor
 
@@ -34,6 +38,9 @@ $OUT"
 fi
 if [ "$ERR" != 0 ]; then
   ERRORS="$ERRORS http://"
+  PERFS="$PERFS http=0;;;;"
+else
+  PERFS="$PERFS http=1;;;;"
 fi
 rm -rf Monitor
 
@@ -45,6 +52,9 @@ $OUT"
 fi
 if [ "$ERR" != 0 ]; then
   ERRORS="$ERRORS https://"
+  PERFS="$PERFS https=0;;;;"
+else
+  PERFS="$PERFS https=1;;;;"
 fi
 rm -rf Monitor
 
@@ -56,13 +66,16 @@ $OUT"
 fi
 if [ "$ERR" != 0 ]; then
   ERRORS="$ERRORS ssh"
+  PERFS="$PERFS ssh=0;;;;"
+else
+  PERFS="$PERFS ssh=1;;;;"
 fi
 rm -rf Monitor
 
 if [ -n "$ERRORS" ]; then
-  echo "gitolite CRITICAL - impossible to clone via$ERRORS|$OUTPUT"
+  echo "gitolite CRITICAL - impossible to clone via$ERRORS | $PERFS"
   exit 2
 else
-  echo "gitolite OK - ssh, git, http and https work|$OUTPUT"
+  echo "gitolite OK - ssh, git, http and https work | $PERFS"
   exit 0
 fi
diff --git a/modules/private/monitoring/plugins/check_imap_connection b/modules/private/monitoring/plugins/check_imap_connection
index 304eae6..c1ab0dd 100755
--- a/modules/private/monitoring/plugins/check_imap_connection
+++ b/modules/private/monitoring/plugins/check_imap_connection
@@ -26,10 +26,10 @@ my $expected_result = "a OK Logged in";
 
 chomp($cmd_result);
 if ($cmd_result !~ /$expected_result/) {
-  print "IMAP CRITICAL - Unable to connect via imaps\n";
+  print "IMAP CRITICAL - Unable to connect via imaps | imap=0;;;;\n";
   exit($STATE_CRITICAL);
 } else {
-  print "IMAP OK - imaps connected successfully\n";
+  print "IMAP OK - imaps connected successfully | imap=1;;;;\n";
   exit($STATE_OK);
 }
 
diff --git a/modules/private/monitoring/plugins/check_last_file_date b/modules/private/monitoring/plugins/check_last_file_date
index df45bbc..f51a258 100755
--- a/modules/private/monitoring/plugins/check_last_file_date
+++ b/modules/private/monitoring/plugins/check_last_file_date
@@ -15,12 +15,14 @@ if [ -z "$last_date" ]; then
   exit $STATE_UNKNOWN
 else
   LC_ALL=C last_date=$(printf "%.*f" 0 $last_date)
+  LC_ALL=C age=$(( $(date "+%s") - $last_date))
+  max_age=$(( $hours * 60 * 60 ))
   min_date=$(date -d "$hours hours ago" "+%s")
   if [ "$min_date" -lt "$last_date" ]; then
-    echo "OK: Last file $(date -d @$last_date)"
+    echo "OK: Last file $(date -d @$last_date) | age=${age}s;;$max_age;;"
     exit $STATE_OK
   else
-    echo "CRITICAL: Last file $(date -d @$last_date)"
+    echo "CRITICAL: Last file $(date -d @$last_date) | age=${age}s;;$max_age;;"
     exit $STATE_CRITICAL
   fi
 fi
diff --git a/modules/private/monitoring/plugins/check_mysql_replication b/modules/private/monitoring/plugins/check_mysql_replication
index 8923928..4027f63 100755
--- a/modules/private/monitoring/plugins/check_mysql_replication
+++ b/modules/private/monitoring/plugins/check_mysql_replication
@@ -23,13 +23,13 @@ else
   LC_ALL=C lag=$(printf "%.*f" 0 $lag)
 
   if [[ $lag -lt 5 ]]; then
-    echo "OK - $output"
+    echo "OK - $output | time=${lag}s;5;10;;"
     exit $STATE_OK
   elif [[ $lag -lt 10 ]]; then
-    echo "WARNING - $output"
+    echo "WARNING - $output | time=${lag}s;5;10;;"
     exit $STATE_WARNING
   else
-    echo "CRITICAL - $output"
+    echo "CRITICAL - $output | time=${lag}s;5;10;;"
     exit $STATE_CRITICAL
   fi
 fi
diff --git a/modules/private/monitoring/plugins/check_openldap_replication b/modules/private/monitoring/plugins/check_openldap_replication
index b511ff2..7136ad5 100755
--- a/modules/private/monitoring/plugins/check_openldap_replication
+++ b/modules/private/monitoring/plugins/check_openldap_replication
@@ -42,13 +42,13 @@ else
   LC_ALL=C lag=$(printf "%.*f" 0 $lag)
 
   if [[ $offset -lt 5 ]]; then
-    echo "OK - $output"
+    echo "OK - $output | time=${offset}s;5;10;;"
     exit $STATE_OK
   elif [[ $offset -lt 10 ]]; then
-    echo "WARNING - $output"
+    echo "WARNING - $output | time=${offset}s;5;10;;"
     exit $STATE_WARNING
   else
-    echo "CRITICAL - $output"
+    echo "CRITICAL - $output | time=${offset}s;5;10;;"
     exit $STATE_CRITICAL
   fi
 fi
diff --git a/modules/private/monitoring/plugins/check_ovh_sms b/modules/private/monitoring/plugins/check_ovh_sms
index 141f82d..caf279c 100755
--- a/modules/private/monitoring/plugins/check_ovh_sms
+++ b/modules/private/monitoring/plugins/check_ovh_sms
@@ -15,10 +15,10 @@ try:
     result = client.get('/sms/{}'.format(account))["creditsLeft"]
 
     if result < 20:
-        print("SMS OVH Critical - Not enough sms left ({})|SMS {}".format(result, result))
+        print("SMS OVH Critical - Not enough sms left ({})|SMS={};;;;".format(result, result))
         sys.exit(2)
     else:
-        print("SMS OVH Ok - Enough sms left ({})|SMS {}".format(result, result))
+        print("SMS OVH Ok - Enough sms left ({})|SMS={};;;;".format(result, result))
         sys.exit(0)
 except Exception:
     print("SMS OVH UNKNOWN - Error during script")
diff --git a/modules/private/monitoring/plugins/check_postgres_replication b/modules/private/monitoring/plugins/check_postgres_replication
index 009b4d5..ff257a3 100755
--- a/modules/private/monitoring/plugins/check_postgres_replication
+++ b/modules/private/monitoring/plugins/check_postgres_replication
@@ -23,13 +23,13 @@ else
   LC_ALL=C lag=$(printf "%.*f" 0 $lag)
 
   if [[ $lag -lt 5 ]]; then
-    echo "OK - $output"
+    echo "OK - $output | time=${lag}s;5;10;0;"
     exit $STATE_OK
   elif [[ $lag -lt 10 ]]; then
-    echo "WARNING - $output"
+    echo "WARNING - $output | time=${lag}s;5;10;0;"
     exit $STATE_WARNING
   else
-    echo "CRITICAL - $output"
+    echo "CRITICAL - $output | time=${lag}s;5;10;0;"
     exit $STATE_CRITICAL
   fi
 fi
diff --git a/modules/private/monitoring/plugins/check_redis_replication b/modules/private/monitoring/plugins/check_redis_replication
index 7a884e1..6dbe4c4 100755
--- a/modules/private/monitoring/plugins/check_redis_replication
+++ b/modules/private/monitoring/plugins/check_redis_replication
@@ -26,13 +26,13 @@ else
   LC_ALL=C lag=$(printf "%.*f" 0 $lag)
 
   if [[ $lag -lt 5 && $offset -lt 5 ]]; then
-    echo "OK - $output"
+    echo "OK - $output | time=${lag}s;5;10;0; offset=${offset};5;10;0;"
     exit $STATE_OK
   elif [[ $lag -lt 10 && $offset -lt 10 ]]; then
-    echo "WARNING - $output"
+    echo "WARNING - $output | time=${lag}s;5;10;0; offset=${offset};5;10;0;"
     exit $STATE_WARNING
   else
-    echo "CRITICAL - $output"
+    echo "CRITICAL - $output | time=${lag}s;5;10;0; offset=${offset};5;10;0;"
     exit $STATE_CRITICAL
   fi
 fi
diff --git a/modules/private/monitoring/status_engine.nix b/modules/private/monitoring/status_engine.nix
new file mode 100644
index 0000000..c74a44d
--- /dev/null
+++ b/modules/private/monitoring/status_engine.nix
@@ -0,0 +1,109 @@
+{ config, pkgs, lib, name, ... }:
+let
+  package = pkgs.status_engine.worker.override { config_file = config.secrets.fullPaths."status_engine"; };
+  env = config.myEnv.tools.status_engine;
+in
+{
+  config = lib.mkIf config.myServices.status.enable {
+    systemd.services.gearmand = {
+      description = "Gearman daemon";
+      after = [ "network.target" ];
+      wantedBy = [ "multi-user.target" ];
+      serviceConfig = {
+        DynamicUser = true;
+        User = "gearmand";
+        Type = "simple";
+        ExecStart = "${pkgs.gearmand}/bin/gearmand --syslog -L 127.0.0.1 -q libsqlite3 --libsqlite3-db /var/lib/gearmand/gearmand.db --store-queue-on-shutdown -l stderr -P /run/gearmand/gearmand.pid";
+        RuntimeDirectory = "gearmand";
+        StateDirectory = "gearmand";
+      };
+    };
+
+    secrets.keys = [{
+      dest = "status_engine";
+      permissions = "0400";
+      user = "naemon";
+      group = "naemon";
+      text = ''
+        node_name: ${name}
+        use_gearman: 1
+        gearman:
+          address: 127.0.0.1
+          port: 4730
+          timeout: 1000
+        use_rabbitmq: 0
+        use_redis: 1
+        redis:
+          address: 127.0.0.1
+          port: 6379
+          db: 0
+        store_live_data_in_archive_backend: 1
+        use_mysql: 1
+        mysql:
+          host: ${env.mysql.remoteHost}
+          port: ${toString env.mysql.port}
+          username: ${env.mysql.user}
+          password: ${env.mysql.password}
+          database: ${env.mysql.database}
+        use_crate: 0
+        number_of_bulk_records: 100
+        max_bulk_delay: 5
+        number_servicestatus_worker: 1
+        number_hoststatus_worker: 1
+        number_logentry_worker: 1
+        number_statechange_worker: 1
+        number_hostcheck_worker: 1
+        number_servicecheck_worker: 1
+        number_misc_worker: 1
+
+        process_perfdata: 1
+        number_perfdata_worker: 1
+        perfdata_backend:
+          - mysql
+
+        check_for_commands: 1
+        command_check_interval: 15
+        external_command_file: /run/naemon/naemon.cmd
+        query_handler: /run/naemon/naemon.qh
+        submit_method: qh
+
+        syslog_enabled: 1
+        syslog_tag: statusengine-worker
+
+        # Archive age
+        age_hostchecks: 5
+        age_host_acknowledgements: 60
+        age_host_notifications: 60
+        age_host_statehistory: 365
+        age_host_downtimes: 60
+        age_servicechecks: 5
+        age_service_acknowledgements: 60
+        age_service_notifications: 60
+        age_service_statehistory: 365
+        age_service_downtimes: 60
+        age_logentries: 5
+        age_tasks: 1
+        age_perfdata: 90
+
+        disable_http_proxy: 1
+      '';
+    }];
+
+    services.redis = {
+      enable = true;
+      bind = "127.0.0.1";
+    };
+
+    systemd.services.status_engine_worker = {
+      description = "Status engine worker";
+      after = [ "network.target" "gearmand.service" ];
+      wantedBy = [ "multi-user.target" ];
+      serviceConfig = {
+        Type = "simple";
+        Restart = "on-failure";
+        User = "naemon";
+        ExecStart = "${package}/bin/StatusengineWorker.php";
+      };
+    };
+  };
+}
-- 
cgit v1.2.3