]> git.immae.eu Git - perso/Immae/Config/Nix.git/blob - modules/private/monitoring/objects_common.nix
Adjust load warnings for monitoring
[perso/Immae/Config/Nix.git] / modules / private / monitoring / objects_common.nix
# Common Naemon monitoring objects for a single host.
#
# Arguments:
#   hostFQDN      name, alias and address of the host object; also the
#                 host_name of the local services and of the passive info.
#   hostName      exposed on the host object as _webstatus_name.
#   interface     network interface handed to check_local_bandwidth.
#   processWarn,  warning / critical thresholds handed to check_local_procs.
#   processAlert
#   loadWarn,     warning thresholds joined as "a,b,c" for check_local_load;
#   load5Warn,    each one defaults to the previous one.
#   load15Warn
#   loadAlert,    critical thresholds for check_local_load, chained the same
#   load5Alert,   way as the warning ones.
#   load15Alert
#   mdadm         its bin/mdadm is invoked by the mdadm array check.
#   sudo          command prefix interpolated in front of "-u <user>" checks.
#   master        when true, the generic-service template enables notifications.
#   lib           provides lib.attrsets.filterAttrs.
{
  hostFQDN,
  hostName,
  interface ? "eth0",
  processWarn ? "250",
  processAlert ? "400",
  loadWarn ? "8.0",
  load5Warn ? loadWarn,
  load15Warn ? load5Warn,
  loadAlert ? "10.0",
  load5Alert ? loadAlert,
  load15Alert ? load5Alert,
  mdadm,
  sudo,
  master,
  lib,
  ...
}:
let
  # Attribute predicate: keeps "service_description" and every attribute whose
  # name starts with an underscore.
  keptInPassiveService = attrName: _attrValue:
    attrName == "service_description" || builtins.substring 0 1 attrName == "_";

  # Value attached as `passiveInfo` to each local service below.
  # NOTE(review): the consumer of `passiveInfo` is not in this file; it
  # presumably builds the matching passive service from these attributes.
  defaultPassiveInfo = {
    host_name = hostFQDN;
    use = "external-passive-service";
    servicegroups = "webstatus-resources";
    freshness_threshold = "450";
    retry_interval = "1";
    # Partially applied: still expects the attribute set to filter.
    filter = lib.attrsets.filterAttrs keptInPassiveService;
  };
in
29 {
# Host object for this machine, keyed by its FQDN, which also serves as its
# alias and address.
host."${hostFQDN}" = {
  use = "linux-server";
  address = hostFQDN;
  alias = hostFQDN;
  hostgroups = "webstatus-hosts";
  # Custom variable holding the short host name.
  _webstatus_name = hostName;
};
# Local resource checks for this host. Every entry uses the "local-service"
# template and carries the shared `defaultPassiveInfo`.
#
# `check_command` is a list: the first element names an entry of the
# `command` attribute set, the remaining ones fill its $ARG1$, $ARG2$, ...
# placeholders in order.
service = [
  {
    passiveInfo = defaultPassiveInfo;
    service_description = "Size on root partition";
    use = "local-service";
    # check_disk -w 20% -c 10% -p /
    check_command = ["check_local_disk" "20%" "10%" "/"];
  }
  {
    passiveInfo = defaultPassiveInfo;
    service_description = "Total number of process";
    use = "local-service";
    check_command = [
      "check_local_procs"
      processWarn   # -w
      processAlert  # -c
      "RSZDT"       # -s
    ];
  }
  {
    passiveInfo = defaultPassiveInfo;
    service_description = "Network bandwidth";
    use = "local-service";
    # Overrides the intervals and attempts of the local-service template.
    check_interval = "2";
    max_check_attempts = "20";
    retry_interval = "2";
    check_command = [
      "check_local_bandwidth"
      interface
      "20480" # kb/s
      "51200" # kb/s
    ];
  }
  {
    passiveInfo = defaultPassiveInfo;
    service_description = "Average load";
    use = "local-service";
    check_command = [
      "check_local_load"
      "${loadWarn},${load5Warn},${load15Warn}"     # -w
      "${loadAlert},${load5Alert},${load15Alert}"  # -c
    ];
  }
  {
    passiveInfo = defaultPassiveInfo;
    service_description = "Swap usage";
    use = "local-service";
    # check_swap reads a bare integer as a number of bytes of free swap, so
    # the thresholds carry a "%" suffix: warn below 20% free, critical below
    # 10% free.
    check_command = ["check_local_swap" "20%" "10%"];
  }
  {
    passiveInfo = defaultPassiveInfo;
    service_description = "Memory usage";
    use = "local-service";
    # check_mem.sh -w 80 -c 90
    check_command = ["check_memory" "80" "90"];
  }
  {
    passiveInfo = defaultPassiveInfo;
    service_description = "NTP is activated and working";
    use = "local-service";
    check_command = ["check_ntp"];
  }
  {
    passiveInfo = defaultPassiveInfo;
    service_description = "No mdadm array is degraded";
    use = "local-service";
    check_command = [
      "check_command_output"
      "${mdadm}/bin/mdadm --monitor --scan -1"  # command to run
      "^$"                                      # pattern given to -o
      "-s 0 -r root"                            # extra check_command flags
    ];
  }
];
# Command definitions, grouped by purpose. $ARGn$ placeholders are filled from
# the `check_command` lists of the services; the other $...$ tokens are macros
# expanded by the monitoring daemon.
# NOTE(review): $USER1$ / $USER2$ look like plugin directories and
# $USER2xx$ like secrets defined in a resource file; confirm outside this file.
command = {
  # --- Local resources ---
  check_local_disk = "$USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$";
  check_local_procs = "$USER1$/check_procs -w $ARG1$ -c $ARG2$ -s $ARG3$";
  check_local_load = "$USER1$/check_load -w $ARG1$ -c $ARG2$";
  check_local_swap = "$USER1$/check_swap -n ok -w $ARG1$ -c $ARG2$";
  check_local_bandwidth = "$USER2$/check_bandwidth -i=$ARG1$ -w $ARG2$ -c $ARG3$";
  check_memory = "$USER2$/check_mem.sh -w $ARG1$ -c $ARG2$";
  check_ntp = "$USER1$/check_ntp_time -t 30 -q -H 0.arch.pool.ntp.org";

  # --- Generic wrappers around an arbitrary command ---
  check_command_match = "$USER2$/check_command -c \"$ARG1$\" -C \"$ARG2$\" $ARG3$";
  check_command_output = "$USER2$/check_command -c \"$ARG1$\" -s 0 -o \"$ARG2$\" $ARG3$";
  check_command_status = "$USER2$/check_command -c \"$ARG1$\" -s \"$ARG2$\" $ARG3$";

  # --- Host liveness and fixed-result checks ---
  check_host_alive = "$USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5";
  check_ok = "$USER1$/check_dummy 0 \"Dummy OK\"";
  check_critical = "$USER1$/check_dummy 2 \"Dummy CRITICAL\"";

  # --- DNS ---
  check_dns = "$USER1$/check_dns -H $ARG1$ -s $HOSTADDRESS$ $ARG2$";
  check_external_dns = "$USER1$/check_dns -H $ARG2$ -s $ARG1$ $ARG3$";

  # --- HTTP(S) ---
  check_http = "$USER1$/check_http --sni -f stickyport -H \"$ARG1$\" -u \"$ARG2$\" -r \"$ARG3$\"";
  check_https = "$USER1$/check_http --sni --ssl -f stickyport -H \"$ARG1$\" -u \"$ARG2$\" -r \"$ARG3$\"";
  check_https_auth = "$USER1$/check_http --sni --ssl -a \"$USER202$\" -f stickyport -H \"$ARG1$\" -u \"$ARG2$\" -r \"$ARG3$\"";
  check_https_certificate = "$USER1$/check_http --sni --ssl -H \"$ARG1$\" -C 21,15";
  check_https_code = "$USER1$/check_http --sni --ssl -f stickyport -H \"$ARG1$\" -u \"$ARG2$\" -e \"$ARG3$\" -r \"$ARG4$\"";

  # --- Raw TCP ---
  check_tcp = "$USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ -e \"$ARG2$\" -Mcrit";
  check_tcp_ssl = "$USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ -S -D 21,15";

  # --- E-mail ---
  check_emails = "$USER2$/check_emails -H $HOSTADDRESS$ -i $USER203$ -l $ARG1$ -p $ARG2$ -s $ARG3$ -f $ARG4$";
  check_emails_local = "$USER2$/check_emails -H $HOSTADDRESS$ -n $ARG1$ -r $ADMINEMAIL$ -s $ARG2$ -f $ARG3$";
  check_imap_connection = "$USER2$/check_imap_connection -u \"$USER204$\" -p \"$USER205$\" -H \"imap.immae.eu:143\"";
  check_mailq = "$USER1$/check_mailq -s -w 1 -c 2";
  check_smtp = "$USER1$/check_smtp -H $HOSTADDRESS$ -p 25 -S -D 21,15";

  # --- Backups ---
  check_backup_eriomem = "$USER2$/check_eriomem $USER208$";
  check_backup_eriomem_age = "$USER2$/check_backup_eriomem_age $ARG1$";
  check_backup_ovh_age = "$USER2$/check_backup_ovh_age $ARG1$";
  check_last_file_date = "${sudo} -u \"$ARG3$\" $USER2$/check_last_file_date \"$ARG1$\" \"$ARG2$\"";

  # --- Other remote services ---
  check_ftp_database = "$USER2$/check_ftp_database";
  check_git = "$USER2$/check_git $USER203$";
  check_ovh_sms = "$USER2$/check_ovh_sms \"$USER209$\"";

  # --- Database / directory replication (run through sudo as the service user) ---
  check_mysql_replication = "${sudo} -u mysql $USER2$/check_mysql_replication \"$ARG1$\" \"$ARG2$\"";
  check_postgresql_replication = "${sudo} -u postgres $USER2$/check_postgres_replication \"$ARG1$\" \"$ARG2$\" \"$ARG3$\"";
  check_openldap_replication = "${sudo} -u openldap $USER2$/check_openldap_replication \"$ARG1$\" \"$ARG2$\" \"$ARG3$\" \"$ARG4$\" \"$ARG5$\"";
  check_redis_replication = "${sudo} -u redis $USER2$/check_redis_replication \"$ARG1$\"";

  # --- Notifications ---
  # The trailing $OVE on the two e-mail commands forces naemon to run them
  # through a shell instead of execve, which fails here.
  notify-host-by-email = "ADMINEMAIL=\"$ADMINEMAIL$\" SERVICENOTIFICATIONID=\"$SERVICENOTIFICATIONID$\" HOSTSTATE=\"$HOSTSTATE$\" HOSTOUTPUT=\"$HOSTOUTPUT$\" $USER2$/notify_by_email host \"$NOTIFICATIONTYPE$\" \"$HOSTALIAS$\" \"$LONGDATETIME$\" \"$CONTACTEMAIL$\" $OVE";
  notify-service-by-email = "ADMINEMAIL=\"$ADMINEMAIL$\" SERVICENOTIFICATIONID=\"$SERVICENOTIFICATIONID$\" SERVICEDESC=\"$SERVICEDESC$\" SERVICESTATE=\"$SERVICESTATE$\" SERVICEOUTPUT=\"$SERVICEOUTPUT$\" $USER2$/notify_by_email service \"$NOTIFICATIONTYPE$\" \"$HOSTALIAS$\" \"$LONGDATETIME$\" \"$CONTACTEMAIL$\" $OVE";
  notify-by-slack = "HOST=\"$HOSTALIAS$\" SERVICESTATE=\"$SERVICESTATE$\" SERVICEDESC=\"$SERVICEDESC$\" SERVICEOUTPUT=\"$SERVICEOUTPUT$\" $USER2$/notify_by_slack \"$ARG1$\" \"$ARG2$\"";
  # Forwards a service result (output and perfdata) through send_nrdp.sh.
  notify-master = "$USER2$/send_nrdp.sh -u \"$USER200$\" -t \"$USER201$\" -H \"$HOSTADDRESS$\" -s \"$SERVICEDESC$\" -S \"$SERVICESTATEID$\" -o \"$SERVICEOUTPUT$ | $SERVICEPERFDATA$\"";
};
# Single always-on time period, referenced by name from the templates.
timeperiod."24x7" =
  let
    # Same range for every day of the week.
    wholeDay = "00:00-24:00";
  in
  {
    alias = "24 Hours A Day, 7 Days A Week";
    monday = wholeDay;
    tuesday = wholeDay;
    wednesday = wholeDay;
    thursday = wholeDay;
    friday = wholeDay;
    saturday = wholeDay;
    sunday = wholeDay;
  };
# Service groups; each one only carries a human-readable alias.
servicegroup = {
  webstatus-webapps.alias = "Web applications";
  webstatus-websites.alias = "Personal websites";
  webstatus-ssl.alias = "SSL certificates";
  webstatus-dns.alias = "DNS resolution";
  webstatus-remote-services.alias = "Other remote services";
  webstatus-local-services.alias = "Other local services";
  webstatus-email.alias = "E-mail services";
  webstatus-resources.alias = "Local resources";
  webstatus-databases.alias = "Databases resources";
  webstatus-backup.alias = "Backup resources";
};
# Host group that the host object joins through `hostgroups`.
hostgroup.webstatus-hosts.alias = "Hosts";
# Contact group referenced by `contact_groups` in the templates.
contactgroup.admins.alias = "Naemon Administrators";
# Object templates that concrete services, contacts and hosts inherit from
# through `use`.
templates = {
  service =
    let
      # Every concrete service template derives from generic-service.
      fromGeneric = overrides: { use = "generic-service"; } // overrides;

      # web-service and external-web-service share the same settings.
      webChecks = fromGeneric {
        check_interval = "20";
        max_check_attempts = "2";
        retry_interval = "1";
      };
    in
    {
      # Base settings shared by all service templates.
      generic-service = {
        active_checks_enabled = "1";
        check_freshness = "0";
        check_interval = "10";
        check_period = "24x7";
        contact_groups = "admins";
        event_handler_enabled = "1";
        flap_detection_enabled = "1";
        is_volatile = "0";
        max_check_attempts = "3";
        notification_interval = "60";
        notification_options = "w,u,c,r,f,s";
        notification_period = "24x7";
        # Notifications are only enabled when `master` is true.
        notifications_enabled = if master then "1" else "0";
        obsess_over_service = "1";
        passive_checks_enabled = "1";
        process_perf_data = "1";
        retain_nonstatus_information = "1";
        retain_status_information = "1";
        retry_interval = "2";
      };

      # Checks bound to this host, in the "Local resources" group.
      local-service = fromGeneric {
        host_name = hostFQDN;
        check_interval = "5";
        max_check_attempts = "4";
        retry_interval = "1";
        servicegroups = "webstatus-resources";
      };

      external-service = fromGeneric {
        check_interval = "5";
        max_check_attempts = "4";
        retry_interval = "1";
      };

      web-service = webChecks;
      external-web-service = webChecks;

      mail-service = fromGeneric {
        check_interval = "15";
        max_check_attempts = "1";
        retry_interval = "1";
      };

      dns-service = fromGeneric {
        check_interval = "120";
        notification_interval = "120";
        max_check_attempts = "5";
        retry_interval = "5";
      };
    };

  # Only a contact template is declared: there is no concrete contact here,
  # notifications go through master.
  contact.generic-contact = {
    host_notification_commands = "notify-host-by-email";
    host_notification_options = "d,u,r,f,s";
    host_notification_period = "24x7";
    service_notification_commands = "notify-service-by-email";
    service_notification_options = "w,u,c,r,f,s";
    service_notification_period = "24x7";
  };

  host = {
    generic-host = {
      event_handler_enabled = "1";
      flap_detection_enabled = "1";
      notification_period = "24x7";
      notifications_enabled = "1";
      process_perf_data = "1";
      retain_nonstatus_information = "1";
      retain_status_information = "1";
    };

    # NOTE(review): linux-server does not declare `use = "generic-host"`
    # here; confirm whether the inheritance is added elsewhere.
    linux-server = {
      check_command = "check_host_alive";
      check_interval = "5";
      check_period = "24x7";
      contact_groups = "admins";
      max_check_attempts = "10";
      notification_interval = "120";
      notification_options = "d,u,r,f";
      retry_interval = "1";
    };
  };
};
286 }