From afb182cb14def32987f11200539806af83bb30c5 Mon Sep 17 00:00:00 2001 From: Mechiel Lukkien Date: Fri, 29 Nov 2024 12:43:21 +0100 Subject: [PATCH] smtpserver: add prometheus metric for failing starttls handshakes for incoming deliveries and add an alerting rule if the failure rate becomes >10% (e.g. expired certificate). the prometheus metrics includes a reason, including potential tls alerts, if remote smtp clients would send those (openssl s_client -starttls does). inspired by issue #237, where incoming connections were aborted by remote. such errors would show up as "eof" in the metrics. --- prometheus.rules | 7 +++++- smtpserver/server.go | 51 +++++++++++++++++++++++++++++++++++++++++++ tlsrpt/alert.go | 3 ++- tlsrpt/alert_go120.go | 3 ++- tlsrpt/report.go | 4 ++-- 5 files changed, 63 insertions(+), 5 deletions(-) diff --git a/prometheus.rules b/prometheus.rules index 1f75074..b134935 100644 --- a/prometheus.rules +++ b/prometheus.rules @@ -62,9 +62,14 @@ groups: # the alerts below can be used to keep a closer eye or when starting to use mox, # but can be noisy, or you may not be able to prevent them. + - alert: mox-incoming-delivery-starttls-errors + expr: sum by (instance) (increase(mox_smtpserver_delivery_starttls_errors_total[1h])) / sum by (instance) (increase(mox_smtpserver_delivery_starttls_total[1h])) > 0.1 + annotations: + summary: starttls handshake errors for >10% of incoming smtp delivery connections + # change period to match your expected incoming message rate. - alert: mox-no-deliveries - expr: sum(rate(mox_smtpserver_delivery_total{result="delivered"}[6h])) == 0 + expr: sum by (instance) (rate(mox_smtpserver_delivery_total{result="delivered"}[6h])) == 0 annotations: summary: no mail delivered for 6 hours diff --git a/smtpserver/server.go b/smtpserver/server.go index dd87ec7..6e11266 100644 --- a/smtpserver/server.go +++ b/smtpserver/server.go @@ -22,6 +22,7 @@ import ( "net" "net/textproto" "os" + "reflect" "runtime/debug" "slices" "sort" @@ -59,6 +60,7 @@ import ( "github.com/mjl-/mox/smtp" "github.com/mjl-/mox/spf" "github.com/mjl-/mox/store" + "github.com/mjl-/mox/tlsrpt" "github.com/mjl-/mox/tlsrptdb" ) @@ -171,6 +173,21 @@ var ( "error", }, ) + metricDeliveryStarttls = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "mox_smtpserver_delivery_starttls_total", + Help: "Total number of STARTTLS handshakes for incoming deliveries.", + }, + ) + metricDeliveryStarttlsErrors = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "mox_smtpserver_delivery_starttls_errors_total", + Help: "Errors with TLS handshake during STARTTLS for incoming deliveries.", + }, + []string{ + "reason", // "eof", "sslv2", "unsupportedversions", "nottls", "alert--", "other" + }, + ) ) var jitterRand = mox.NewPseudoRand() @@ -955,7 +972,25 @@ func (c *conn) cmdStarttls(p *parser) { ctx, cancel := context.WithTimeout(cidctx, time.Minute) defer cancel() c.log.Debug("starting tls server handshake") + metricDeliveryStarttls.Inc() if err := tlsConn.HandshakeContext(ctx); err != nil { + // Errors from crypto/tls mostly aren't typed. We'll have to look for strings... + reason := "other" + if errors.Is(err, io.EOF) { + reason = "eof" + } else if alert, ok := asTLSAlert(err); ok { + reason = tlsrpt.FormatAlert(alert) + } else { + s := err.Error() + if strings.Contains(s, "tls: client offered only unsupported versions") { + reason = "unsupportedversions" + } else if strings.Contains(s, "tls: first record does not look like a TLS handshake") { + reason = "nottls" + } else if strings.Contains(s, "tls: unsupported SSLv2 handshake received") { + reason = "sslv2" + } + } + metricDeliveryStarttlsErrors.WithLabelValues(reason).Inc() panic(fmt.Errorf("starttls handshake: %s (%w)", err, errIO)) } cancel() @@ -971,6 +1006,22 @@ func (c *conn) cmdStarttls(p *parser) { c.tls = true } +func asTLSAlert(err error) (alert uint8, ok bool) { + // If the remote client aborts the connection, it can send an alert indicating why. + // crypto/tls gives us a net.OpError with "Op" set to "remote error", an an Err + // with the unexported type "alert", a uint8. So we try to read it. + + var opErr *net.OpError + if !errors.As(err, &opErr) || opErr.Op != "remote error" || opErr.Err == nil { + return + } + v := reflect.ValueOf(opErr.Err) + if v.Kind() != reflect.Uint8 || v.Type().Name() != "alert" { + return + } + return uint8(v.Uint()), true +} + // ../rfc/4954:139 func (c *conn) cmdAuth(p *parser) { c.xneedHello() diff --git a/tlsrpt/alert.go b/tlsrpt/alert.go index 3f4ca8b..227bb2a 100644 --- a/tlsrpt/alert.go +++ b/tlsrpt/alert.go @@ -10,7 +10,8 @@ import ( "strings" ) -func formatAlert(alert uint8) string { +// FormatAlert formats a TLS alert in the form "alert-" or "alert--". +func FormatAlert(alert uint8) string { s := fmt.Sprintf("alert-%d", alert) err := tls.AlertError(alert) // Since go1.21.0 // crypto/tls returns messages like "tls: short message" or "tls: alert(321)". diff --git a/tlsrpt/alert_go120.go b/tlsrpt/alert_go120.go index c686a52..a2a396a 100644 --- a/tlsrpt/alert_go120.go +++ b/tlsrpt/alert_go120.go @@ -8,6 +8,7 @@ import ( "fmt" ) -func formatAlert(alert uint8) string { +// FormatAlert formats a TLS alert in the form "alert-". +func FormatAlert(alert uint8) string { return fmt.Sprintf("alert-%d", alert) } diff --git a/tlsrpt/report.go b/tlsrpt/report.go index 5e09d33..a25cc37 100644 --- a/tlsrpt/report.go +++ b/tlsrpt/report.go @@ -394,7 +394,7 @@ func TLSFailureDetails(err error) (ResultType, string) { // todo: ideally, crypto/tls would let us check if this is an alert. it could be another uint8-typed error. v := reflect.ValueOf(netErr.Err) if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" { - reasonCode = "tls-remote-" + formatAlert(uint8(v.Uint())) + reasonCode = "tls-remote-" + FormatAlert(uint8(v.Uint())) } } return ResultValidationFailure, reasonCode @@ -429,7 +429,7 @@ func TLSFailureDetails(err error) (ResultType, string) { } v := reflect.ValueOf(err) if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" { - reasonCode = "tls-local-" + formatAlert(uint8(v.Uint())) + reasonCode = "tls-local-" + FormatAlert(uint8(v.Uint())) } } return ResultValidationFailure, reasonCode