mox/prometheus.rules
Mechiel Lukkien afb182cb14
smtpserver: add prometheus metric for failing starttls handshakes for incoming deliveries
and add an alerting rule if the failure rate becomes >10% (e.g. expired
certificate).

the prometheus metric includes a reason, including potential tls alerts, if
remote smtp clients send those (openssl s_client -starttls does).

inspired by issue #237, where incoming connections were aborted by remote. such
errors would show up as "eof" in the metrics.
2024-11-29 12:43:21 +01:00

94 lines
3.4 KiB
Text

# example prometheus alerting rules file for mox.
#
# layout: a single rule group named "mox". each rule has an alert name, a
# promql expression that fires the alert when it returns a result, and a
# summary annotation describing the condition.
groups:
  - name: mox
    rules:
      # any increase of the panic counter during the past hour.
      - alert: mox-panic
        expr: increase(mox_panic_total[1h]) > 0
        annotations:
          summary: unhandled panic

      - alert: mox-ip-on-dns-blocklist
        expr: mox_dnsbl_ips_success < 1
        annotations:
          summary: ip is on dns blocklist

      # attempt!~"[123]" leaves out the first three attempts, so only a 4th or
      # later attempt with a result other than "ok" counts.
      - alert: mox-queue-failing-delivery
        expr: increase(mox_queue_delivery_duration_seconds_count{attempt!~"[123]",result!="ok"}[1h]) > 0
        annotations:
          summary: delivery from queue had a 4th or later attempt fail

      - alert: mox-smtpserver-errors
        expr: increase(mox_smtpserver_errors_total[1h]) > 0
        annotations:
          summary: errors in smtpserver operation

      # code=~"5.." selects the three-character status codes starting with 5.
      - alert: mox-webserver-errors
        expr: increase(mox_httpserver_request_duration_seconds_count{code=~"5.."}[1h]) > 0
        annotations:
          summary: http 5xx responses from webserver

      # "for" delays firing until the expression has held for two hours.
      - alert: mox-queue-hold
        expr: mox_queue_hold > 0
        for: 2h
        annotations:
          summary: messages on hold in queue for at least two hours

      # result=~".*error" selects every result value that ends in "error".
      - alert: mox-submission-errors
        expr: increase(mox_smtpserver_submission_total{result=~".*error"}[1h]) > 0
        annotations:
          summary: smtp submission errors

      - alert: mox-delivery-errors
        expr: increase(mox_smtpserver_delivery_total{result=~".*error"}[1h]) > 0
        annotations:
          summary: smtp delivery errors

      - alert: mox-webmail-errors
        expr: increase(mox_webmail_errors_total[1h]) > 0
        annotations:
          summary: errors in webmail operation

      - alert: mox-webmailsubmission-errors
        expr: increase(mox_webmail_submission_total{result=~".*error"}[1h]) > 0
        annotations:
          summary: webmail submission errors

      # limited to apis whose name starts with "mox" and to error codes that
      # start with "server:".
      - alert: mox-sherpa-server-errors
        expr: increase(sherpa_errors_total{api=~"mox.*",code=~"server:.*"}[1h]) > 0
        annotations:
          summary: sherpa web api server errors

      # the alerts below can be used to keep a closer eye or when starting to
      # use mox, but can be noisy, or you may not be able to prevent them.

      # per-instance ratio of failed starttls handshakes to all starttls
      # handshakes over the past hour. the folded scalar below is joined with
      # single spaces into one promql expression.
      - alert: mox-incoming-delivery-starttls-errors
        expr: >-
          sum by (instance) (increase(mox_smtpserver_delivery_starttls_errors_total[1h]))
          / sum by (instance) (increase(mox_smtpserver_delivery_starttls_total[1h]))
          > 0.1
        annotations:
          summary: starttls handshake errors for >10% of incoming smtp delivery connections

      # change period to match your expected incoming message rate.
      - alert: mox-no-deliveries
        expr: sum by (instance) (rate(mox_smtpserver_delivery_total{result="delivered"}[6h])) == 0
        annotations:
          summary: no mail delivered for 6 hours

      # may be noisy. anyone can send these reports. you may want to silence it.
      - alert: mox-tlsrpt-errors
        expr: increase(mox_tlsrptdb_session_total{type!="success"}[1h]) > 0
        annotations:
          summary: tls reports about unsuccessful tls connections

      # may be noisy. can be caused by someone trying to send email as you. and
      # anyone can send these reports. you are not in control over when this
      # fires, so you may want to silence it.
      - alert: mox-dmarc-rejects
        expr: increase(mox_dmarcdb_policy_evaluated_total{disposition!="none"}[1h]) > 0
        annotations:
          summary: dmarc reports about rejects/quarantines due to failing dmarc check

      # may be noisy
      - alert: mox-auth-ratelimited
        expr: increase(mox_authentication_ratelimited_total[1h]) > 0
        annotations:
          summary: authentication connections/requests were rate limited