mirror of
https://github.com/mjl-/mox.git
synced 2024-12-25 07:53:47 +03:00
smtpserver: add prometheus metric for failing starttls handshakes for incoming deliveries
and add an alerting rule if the failure rate becomes >10% (e.g. expired certificate). the prometheus metrics includes a reason, including potential tls alerts, if remote smtp clients would send those (openssl s_client -starttls does). inspired by issue #237, where incoming connections were aborted by remote. such errors would show up as "eof" in the metrics.
This commit is contained in:
parent
09e7ddba9e
commit
afb182cb14
5 changed files with 63 additions and 5 deletions
|
@ -62,9 +62,14 @@ groups:
|
|||
# the alerts below can be used to keep a closer eye or when starting to use mox,
|
||||
# but can be noisy, or you may not be able to prevent them.
|
||||
|
||||
- alert: mox-incoming-delivery-starttls-errors
|
||||
expr: sum by (instance) (increase(mox_smtpserver_delivery_starttls_errors_total[1h])) / sum by (instance) (increase(mox_smtpserver_delivery_starttls_total[1h])) > 0.1
|
||||
annotations:
|
||||
summary: starttls handshake errors for >10% of incoming smtp delivery connections
|
||||
|
||||
# change period to match your expected incoming message rate.
|
||||
- alert: mox-no-deliveries
|
||||
expr: sum(rate(mox_smtpserver_delivery_total{result="delivered"}[6h])) == 0
|
||||
expr: sum by (instance) (rate(mox_smtpserver_delivery_total{result="delivered"}[6h])) == 0
|
||||
annotations:
|
||||
summary: no mail delivered for 6 hours
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ import (
|
|||
"net"
|
||||
"net/textproto"
|
||||
"os"
|
||||
"reflect"
|
||||
"runtime/debug"
|
||||
"slices"
|
||||
"sort"
|
||||
|
@ -59,6 +60,7 @@ import (
|
|||
"github.com/mjl-/mox/smtp"
|
||||
"github.com/mjl-/mox/spf"
|
||||
"github.com/mjl-/mox/store"
|
||||
"github.com/mjl-/mox/tlsrpt"
|
||||
"github.com/mjl-/mox/tlsrptdb"
|
||||
)
|
||||
|
||||
|
@ -171,6 +173,21 @@ var (
|
|||
"error",
|
||||
},
|
||||
)
|
||||
metricDeliveryStarttls = promauto.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Name: "mox_smtpserver_delivery_starttls_total",
|
||||
Help: "Total number of STARTTLS handshakes for incoming deliveries.",
|
||||
},
|
||||
)
|
||||
metricDeliveryStarttlsErrors = promauto.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "mox_smtpserver_delivery_starttls_errors_total",
|
||||
Help: "Errors with TLS handshake during STARTTLS for incoming deliveries.",
|
||||
},
|
||||
[]string{
|
||||
"reason", // "eof", "sslv2", "unsupportedversions", "nottls", "alert-<num>-<msg>", "other"
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
var jitterRand = mox.NewPseudoRand()
|
||||
|
@ -955,7 +972,25 @@ func (c *conn) cmdStarttls(p *parser) {
|
|||
ctx, cancel := context.WithTimeout(cidctx, time.Minute)
|
||||
defer cancel()
|
||||
c.log.Debug("starting tls server handshake")
|
||||
metricDeliveryStarttls.Inc()
|
||||
if err := tlsConn.HandshakeContext(ctx); err != nil {
|
||||
// Errors from crypto/tls mostly aren't typed. We'll have to look for strings...
|
||||
reason := "other"
|
||||
if errors.Is(err, io.EOF) {
|
||||
reason = "eof"
|
||||
} else if alert, ok := asTLSAlert(err); ok {
|
||||
reason = tlsrpt.FormatAlert(alert)
|
||||
} else {
|
||||
s := err.Error()
|
||||
if strings.Contains(s, "tls: client offered only unsupported versions") {
|
||||
reason = "unsupportedversions"
|
||||
} else if strings.Contains(s, "tls: first record does not look like a TLS handshake") {
|
||||
reason = "nottls"
|
||||
} else if strings.Contains(s, "tls: unsupported SSLv2 handshake received") {
|
||||
reason = "sslv2"
|
||||
}
|
||||
}
|
||||
metricDeliveryStarttlsErrors.WithLabelValues(reason).Inc()
|
||||
panic(fmt.Errorf("starttls handshake: %s (%w)", err, errIO))
|
||||
}
|
||||
cancel()
|
||||
|
@ -971,6 +1006,22 @@ func (c *conn) cmdStarttls(p *parser) {
|
|||
c.tls = true
|
||||
}
|
||||
|
||||
func asTLSAlert(err error) (alert uint8, ok bool) {
|
||||
// If the remote client aborts the connection, it can send an alert indicating why.
|
||||
// crypto/tls gives us a net.OpError with "Op" set to "remote error", an an Err
|
||||
// with the unexported type "alert", a uint8. So we try to read it.
|
||||
|
||||
var opErr *net.OpError
|
||||
if !errors.As(err, &opErr) || opErr.Op != "remote error" || opErr.Err == nil {
|
||||
return
|
||||
}
|
||||
v := reflect.ValueOf(opErr.Err)
|
||||
if v.Kind() != reflect.Uint8 || v.Type().Name() != "alert" {
|
||||
return
|
||||
}
|
||||
return uint8(v.Uint()), true
|
||||
}
|
||||
|
||||
// ../rfc/4954:139
|
||||
func (c *conn) cmdAuth(p *parser) {
|
||||
c.xneedHello()
|
||||
|
|
|
@ -10,7 +10,8 @@ import (
|
|||
"strings"
|
||||
)
|
||||
|
||||
func formatAlert(alert uint8) string {
|
||||
// FormatAlert formats a TLS alert in the form "alert-<num>" or "alert-<num>-<shortcode>".
|
||||
func FormatAlert(alert uint8) string {
|
||||
s := fmt.Sprintf("alert-%d", alert)
|
||||
err := tls.AlertError(alert) // Since go1.21.0
|
||||
// crypto/tls returns messages like "tls: short message" or "tls: alert(321)".
|
||||
|
|
|
@ -8,6 +8,7 @@ import (
|
|||
"fmt"
|
||||
)
|
||||
|
||||
func formatAlert(alert uint8) string {
|
||||
// FormatAlert formats a TLS alert in the form "alert-<num>".
|
||||
func FormatAlert(alert uint8) string {
|
||||
return fmt.Sprintf("alert-%d", alert)
|
||||
}
|
||||
|
|
|
@ -394,7 +394,7 @@ func TLSFailureDetails(err error) (ResultType, string) {
|
|||
// todo: ideally, crypto/tls would let us check if this is an alert. it could be another uint8-typed error.
|
||||
v := reflect.ValueOf(netErr.Err)
|
||||
if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" {
|
||||
reasonCode = "tls-remote-" + formatAlert(uint8(v.Uint()))
|
||||
reasonCode = "tls-remote-" + FormatAlert(uint8(v.Uint()))
|
||||
}
|
||||
}
|
||||
return ResultValidationFailure, reasonCode
|
||||
|
@ -429,7 +429,7 @@ func TLSFailureDetails(err error) (ResultType, string) {
|
|||
}
|
||||
v := reflect.ValueOf(err)
|
||||
if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" {
|
||||
reasonCode = "tls-local-" + formatAlert(uint8(v.Uint()))
|
||||
reasonCode = "tls-local-" + FormatAlert(uint8(v.Uint()))
|
||||
}
|
||||
}
|
||||
return ResultValidationFailure, reasonCode
|
||||
|
|
Loading…
Reference in a new issue