mirror of
https://github.com/mjl-/mox.git
synced 2024-12-25 16:03:48 +03:00
afb182cb14
and add an alerting rule if the failure rate becomes >10% (e.g. due to an expired certificate). the prometheus metrics include a reason, including potential tls alerts, if remote smtp clients send those (openssl s_client -starttls does). inspired by issue #237, where incoming connections were aborted by the remote. such errors would show up as "eof" in the metrics.
94 lines
3.4 KiB
Text
94 lines
3.4 KiB
Text
# example prometheus alerting rules file for mox.
#
# layout: a single rule group named "mox". each rule has an alert name, a
# promql expression that fires the alert while it returns a result, and a
# one-line summary annotation.

groups:
- name: mox
  rules:
  - alert: mox-panic
    expr: increase(mox_panic_total[1h]) > 0
    annotations:
      summary: unhandled panic

  - alert: mox-ip-on-dns-blocklist
    expr: mox_dnsbl_ips_success < 1
    annotations:
      summary: ip is on dns blocklist

  # attempt!~"[123]" excludes attempts 1, 2 and 3, so only failures of a 4th
  # or later attempt are counted.
  - alert: mox-queue-failing-delivery
    expr: increase(mox_queue_delivery_duration_seconds_count{attempt!~"[123]",result!="ok"}[1h]) > 0
    annotations:
      summary: delivery from queue had a 4th or later attempt fail

  - alert: mox-smtpserver-errors
    expr: increase(mox_smtpserver_errors_total[1h]) > 0
    annotations:
      summary: errors in smtpserver operation

  - alert: mox-webserver-errors
    expr: increase(mox_httpserver_request_duration_seconds_count{code=~"5.."}[1h]) > 0
    annotations:
      summary: http 5xx responses from webserver

  # "for: 2h" delays firing until the expression has held for two hours.
  - alert: mox-queue-hold
    expr: mox_queue_hold > 0
    for: 2h
    annotations:
      summary: messages on hold in queue for at least two hours

  - alert: mox-submission-errors
    expr: increase(mox_smtpserver_submission_total{result=~".*error"}[1h]) > 0
    annotations:
      summary: smtp submission errors

  - alert: mox-delivery-errors
    expr: increase(mox_smtpserver_delivery_total{result=~".*error"}[1h]) > 0
    annotations:
      summary: smtp delivery errors

  - alert: mox-webmail-errors
    expr: increase(mox_webmail_errors_total[1h]) > 0
    annotations:
      summary: errors in webmail operation

  - alert: mox-webmailsubmission-errors
    expr: increase(mox_webmail_submission_total{result=~".*error"}[1h]) > 0
    annotations:
      summary: webmail submission errors

  - alert: mox-sherpa-server-errors
    expr: increase(sherpa_errors_total{api=~"mox.*",code=~"server:.*"}[1h]) > 0
    annotations:
      summary: sherpa web api server errors

  # the alerts below can be used to keep a closer eye or when starting to use mox,
  # but can be noisy, or you may not be able to prevent them.

  # ratio of failed starttls handshakes to all starttls attempts, per instance,
  # over the last hour.
  - alert: mox-incoming-delivery-starttls-errors
    expr: sum by (instance) (increase(mox_smtpserver_delivery_starttls_errors_total[1h])) / sum by (instance) (increase(mox_smtpserver_delivery_starttls_total[1h])) > 0.1
    annotations:
      summary: starttls handshake errors for >10% of incoming smtp delivery connections

  # change period to match your expected incoming message rate. keep the [6h]
  # window in the expression and the summary text in sync when you do.
  - alert: mox-no-deliveries
    expr: sum by (instance) (rate(mox_smtpserver_delivery_total{result="delivered"}[6h])) == 0
    annotations:
      summary: no mail delivered for 6 hours

  # may be noisy. anyone can send these reports. you may want to silence it.
  - alert: mox-tlsrpt-errors
    expr: increase(mox_tlsrptdb_session_total{type!="success"}[1h]) > 0
    annotations:
      summary: tls reports about unsuccessful tls connections

  # may be noisy. can be caused by someone trying to send email as you. and
  # anyone can send these reports. you are not in control over when this fires,
  # so you may want to silence it.
  - alert: mox-dmarc-rejects
    expr: increase(mox_dmarcdb_policy_evaluated_total{disposition!="none"}[1h]) > 0
    annotations:
      summary: dmarc reports about rejects/quarantines due to failing dmarc check

  # may be noisy
  - alert: mox-auth-ratelimited
    expr: increase(mox_authentication_ratelimited_total[1h]) > 0
    annotations:
      summary: authentication connections/requests were rate limited