git.immae.eu Git - perso/Immae/Config/Nix.git/blobdiff - modules/private/monitoring/objects_common.nix
Use nix expressions to build monitoring list
[perso/Immae/Config/Nix.git] / modules / private / monitoring / objects_common.nix
diff --git a/modules/private/monitoring/objects_common.nix b/modules/private/monitoring/objects_common.nix
new file mode 100644 (file)
index 0000000..8466fdb
--- /dev/null
@@ -0,0 +1,179 @@
+{ hostFQDN # used as the host key, its alias and address, and as host_name of local-service
+, processWarn ? "250" # first argument of check_local_procs (its -w value)
+, processAlert ? "400" # second argument of check_local_procs (its -c value)
+, loadWarn ? "8.0" # repeated three times, comma-separated, as the -w value of check_local_load
+, loadAlert ? "10.0" # repeated three times, comma-separated, as the -c value of check_local_load
+, mdadm # interpolated as the prefix of "/bin/mdadm" in the mdadm check
+, sudo # interpolated in front of "-u <user>" in two check commands
+, ...
+}:
+{
+  # Single host entry keyed by the FQDN; alias and address both reuse it.
+  # `use` points at the linux-server template declared under templates.host.
+  host."${hostFQDN}" = {
+    use = "linux-server";
+    address = hostFQDN;
+    alias = hostFQDN;
+  };
+  # Service checks for this host. Each entry inherits from the local-service
+  # template; check_command is a list holding the command name followed by
+  # its arguments.
+  service = [
+    {
+      service_description = "Size on root partition";
+      use = "local-service";
+      check_command = ["check_local_disk" "20%" "10%" "/"];
+    }
+    {
+      service_description = "Total number of process";
+      use = "local-service";
+      check_command = [
+        "check_local_procs"
+        processWarn
+        processAlert
+        "RSZDT"
+      ];
+    }
+    {
+      service_description = "Average load";
+      use = "local-service";
+      check_command = [
+        "check_local_load"
+        "${loadWarn},${loadWarn},${loadWarn}"
+        "${loadAlert},${loadAlert},${loadAlert}"
+      ];
+    }
+    {
+      service_description = "Swap usage";
+      use = "local-service";
+      # Thresholds need the "%" suffix: check_swap reads bare integers as a
+      # number of free bytes, which would make the check practically never fire.
+      check_command = ["check_local_swap" "20%" "10%"];
+    }
+    {
+      service_description = "Memory usage";
+      use = "local-service";
+      check_command = ["check_memory" "80" "90"];
+    }
+    {
+      service_description = "NTP is activated and working";
+      use = "local-service";
+      check_command = ["check_ntp"];
+    }
+    {
+      service_description = "No mdadm array is degraded";
+      use = "local-service";
+      check_command = [
+        "check_command_output"
+        "${mdadm}/bin/mdadm --monitor --scan -1"
+        # presumably the expected-output pattern (empty output) - confirm against check_command
+        "^$"
+        # NOTE(review): check_command_output already passes "-s 0"; confirm the repeat is intended
+        "-s 0 -r root"
+      ];
+    }
+  ];
+  command = { # command lines by name; $USERn$ macros are not defined in this file
+    check_local_disk = "$USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$";
+    check_local_procs = "$USER1$/check_procs -w $ARG1$ -c $ARG2$ -s $ARG3$";
+    check_local_load = "$USER1$/check_load -w $ARG1$ -c $ARG2$";
+    check_local_swap = "$USER1$/check_swap -n ok -w $ARG1$ -c $ARG2$";
+    check_memory = "$USER2$/check_mem.sh -w $ARG1$ -c $ARG2$";
+    check_command_output = "$USER2$/check_command -c \"$ARG1$\" -s 0 -o \"$ARG2$\" $ARG3$"; # "-s 0" is fixed here; $ARG3$ appends extra options
+    check_ntp = "$USER1$/check_ntp_time -t 30 -q -H 0.arch.pool.ntp.org"; # NTP host is hard-coded, no arguments taken
+    check_postgresql_replication = "${sudo} -u postgres $USER2$/check_postgres_replication \"$ARG1$\" \"$ARG2$\" \"$ARG3$\""; # run as the postgres user
+    check_mailq = "$USER1$/check_mailq -s -w 1 -c 2";
+
+    check_host_alive = "$USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5"; # referenced by the linux-server host template
+    check_last_file_date = "${sudo} -u \"$ARG3$\" $USER2$/check_last_file_date \"$ARG1$\" \"$ARG2$\""; # third argument selects the user to run as
+
+    #  No notify commands, we go through master
+    #  notify_host_by_email = "SERVICENOTIFICATIONID=\"$SERVICENOTIFICATIONID$\" HOSTSTATE=\"$HOSTSTATE$\" HOSTOUTPUT=\"$HOSTOUTPUT$\" $USER2$/notify_by_email host \"$NOTIFICATIONTYPE$\" \"$HOSTALIAS$\" \"$LONGDATETIME$\" \"$CONTACTEMAIL$\" $OVE";
+    #  #$OVE is to force naemon to run via shell instead of execve which fails here
+    #  notify_service_by_email = "SERVICENOTIFICATIONID=\"$SERVICENOTIFICATIONID$\" SERVICEDESC=\"$SERVICEDESC$\" SERVICESTATE=\"$SERVICESTATE$\" SERVICEOUTPUT=\"$SERVICEOUTPUT$\" $USER2$/notify_by_email service \"$NOTIFICATIONTYPE$\" \"$HOSTALIAS$\" \"$LONGDATETIME$\" \"$CONTACTEMAIL$\" $OVE";
+    #  #sudo /usr/bin/strace -o /tmp/foo -vf -s 256 -u naemon $USER2$/notify_by_email
+    #  #$OVE is to force naemon to run via shell instead of execve which fails here
+
+    notify-master = "$USER2$/send_nrdp.sh -u \"$USER200$\" -t \"$USER201$\" -H \"$HOSTADDRESS$\" -s \"$SERVICEDESC$\" -S \"$SERVICESTATEID$\" -o \"$SERVICEOUTPUT$\""; # sends the service state and output through send_nrdp.sh
+  };
+  # Always-on time period: every weekday shares the same full-day range.
+  # This is the period named by check_period / notification_period in the templates.
+  timeperiod."24x7" = let fullDay = "00:00-24:00"; in {
+    alias = "24 Hours A Day, 7 Days A Week";
+    monday = fullDay;
+    tuesday = fullDay;
+    wednesday = fullDay;
+    thursday = fullDay;
+    friday = fullDay;
+    saturday = fullDay;
+    sunday = fullDay;
+  };
+  contactgroup = {
+    admins.alias = "Naemon Administrators"; # group named by contact_groups in the templates
+  };
+  # No contact, we go through master
+  # contact = {
+  #   immae = {
+  #     alias = "Immae";
+  #     use = "generic-contact";
+  #     email = "xxxxxxxxxxxxxxxx";
+  #   };
+  # };
+  templates = { # reusable settings, pulled in by objects through "use"
+    service = {
+      generic-service = { # base settings shared by every service
+        active_checks_enabled = "1";
+        check_freshness = "0";
+        check_interval = "10";
+        check_period = "24x7";
+        contact_groups = "admins";
+        event_handler_enabled = "1";
+        flap_detection_enabled = "1";
+        is_volatile = "0";
+        max_check_attempts = "3";
+        notification_interval = "60";
+        notification_options = "w,u,c,r,f,s";
+        notification_period = "24x7";
+        notifications_enabled = "0"; # no notification since we send them to master
+        obsess_over_service = "1"; # presumably what lets results be forwarded to the master - confirm
+        passive_checks_enabled = "1";
+        process_perf_data = "1";
+        retain_nonstatus_information = "1";
+        retain_status_information = "1";
+        retry_interval = "2";
+      };
+      local-service = { # generic-service bound to this host; overrides the three timing values below
+        use = "generic-service";
+        host_name = hostFQDN;
+        check_interval = "5";
+        max_check_attempts = "4";
+        retry_interval = "1";
+      };
+    };
+    # No contact, we go through master
+    # contact = {
+    #   generic-contact = {
+    #     host_notification_commands = "notify_host_by_email";
+    #     host_notification_options = "d,u,r,f,s";
+    #     host_notification_period = "24x7";
+    #     service_notification_commands = "notify_service_by_email";
+    #     service_notification_options = "w,u,c,r,f,s";
+    #     service_notification_period = "24x7";
+    #   };
+    # };
+    host = {
+      generic-host = { # NOTE(review): nothing in this file references generic-host via "use" - confirm it is still needed
+        event_handler_enabled = "1";
+        flap_detection_enabled = "1";
+        notification_period = "24x7";
+        notifications_enabled = "1"; # NOTE(review): generic-service sets this to "0" - confirm "1" is intended here
+        process_perf_data = "1";
+        retain_nonstatus_information = "1";
+        retain_status_information = "1";
+      };
+      linux-server = { # template named by the host entry's "use"
+        check_command = "check_host_alive";
+        check_interval = "5";
+        check_period = "24x7";
+        contact_groups = "admins";
+        max_check_attempts = "10";
+        notification_interval = "120";
+        notification_options = "d,u,r,f";
+        retry_interval = "1";
+      };
+    };
+  };
+}