smtpserver: add prometheus metric for failing starttls handshakes for incoming deliveries

and add an alerting rule if the failure rate becomes >10% (e.g. expired
certificate).

the prometheus metric includes a reason label, which can hold the tls alert if
the remote smtp client sends one (openssl s_client -starttls does).

inspired by issue #237, where incoming connections were aborted by the remote.
such errors would show up with reason "eof" in the metrics.
This commit is contained in:
Mechiel Lukkien 2024-11-29 12:43:21 +01:00
parent 09e7ddba9e
commit afb182cb14
No known key found for this signature in database
5 changed files with 63 additions and 5 deletions

View file

@ -62,9 +62,14 @@ groups:
# the alerts below can be used to keep a closer eye or when starting to use mox, # the alerts below can be used to keep a closer eye or when starting to use mox,
# but can be noisy, or you may not be able to prevent them. # but can be noisy, or you may not be able to prevent them.
- alert: mox-incoming-delivery-starttls-errors
expr: sum by (instance) (increase(mox_smtpserver_delivery_starttls_errors_total[1h])) / sum by (instance) (increase(mox_smtpserver_delivery_starttls_total[1h])) > 0.1
annotations:
summary: starttls handshake errors for >10% of incoming smtp delivery connections
# change period to match your expected incoming message rate. # change period to match your expected incoming message rate.
- alert: mox-no-deliveries - alert: mox-no-deliveries
expr: sum(rate(mox_smtpserver_delivery_total{result="delivered"}[6h])) == 0 expr: sum by (instance) (rate(mox_smtpserver_delivery_total{result="delivered"}[6h])) == 0
annotations: annotations:
summary: no mail delivered for 6 hours summary: no mail delivered for 6 hours

View file

@ -22,6 +22,7 @@ import (
"net" "net"
"net/textproto" "net/textproto"
"os" "os"
"reflect"
"runtime/debug" "runtime/debug"
"slices" "slices"
"sort" "sort"
@ -59,6 +60,7 @@ import (
"github.com/mjl-/mox/smtp" "github.com/mjl-/mox/smtp"
"github.com/mjl-/mox/spf" "github.com/mjl-/mox/spf"
"github.com/mjl-/mox/store" "github.com/mjl-/mox/store"
"github.com/mjl-/mox/tlsrpt"
"github.com/mjl-/mox/tlsrptdb" "github.com/mjl-/mox/tlsrptdb"
) )
@ -171,6 +173,21 @@ var (
"error", "error",
}, },
) )
metricDeliveryStarttls = promauto.NewCounter(
prometheus.CounterOpts{
Name: "mox_smtpserver_delivery_starttls_total",
Help: "Total number of STARTTLS handshakes for incoming deliveries.",
},
)
metricDeliveryStarttlsErrors = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "mox_smtpserver_delivery_starttls_errors_total",
Help: "Errors with TLS handshake during STARTTLS for incoming deliveries.",
},
[]string{
"reason", // "eof", "sslv2", "unsupportedversions", "nottls", "alert-<num>-<msg>", "other"
},
)
) )
var jitterRand = mox.NewPseudoRand() var jitterRand = mox.NewPseudoRand()
@ -955,7 +972,25 @@ func (c *conn) cmdStarttls(p *parser) {
ctx, cancel := context.WithTimeout(cidctx, time.Minute) ctx, cancel := context.WithTimeout(cidctx, time.Minute)
defer cancel() defer cancel()
c.log.Debug("starting tls server handshake") c.log.Debug("starting tls server handshake")
metricDeliveryStarttls.Inc()
if err := tlsConn.HandshakeContext(ctx); err != nil { if err := tlsConn.HandshakeContext(ctx); err != nil {
// Errors from crypto/tls mostly aren't typed. We'll have to look for strings...
reason := "other"
if errors.Is(err, io.EOF) {
reason = "eof"
} else if alert, ok := asTLSAlert(err); ok {
reason = tlsrpt.FormatAlert(alert)
} else {
s := err.Error()
if strings.Contains(s, "tls: client offered only unsupported versions") {
reason = "unsupportedversions"
} else if strings.Contains(s, "tls: first record does not look like a TLS handshake") {
reason = "nottls"
} else if strings.Contains(s, "tls: unsupported SSLv2 handshake received") {
reason = "sslv2"
}
}
metricDeliveryStarttlsErrors.WithLabelValues(reason).Inc()
panic(fmt.Errorf("starttls handshake: %s (%w)", err, errIO)) panic(fmt.Errorf("starttls handshake: %s (%w)", err, errIO))
} }
cancel() cancel()
@ -971,6 +1006,22 @@ func (c *conn) cmdStarttls(p *parser) {
c.tls = true c.tls = true
} }
// asTLSAlert reports whether err carries a TLS alert sent by the remote peer,
// and if so returns the numeric alert value.
//
// A remote client that aborts the connection can send an alert indicating why.
// crypto/tls hands us that as a *net.OpError with Op set to "remote error" and
// an Err of the unexported type "alert", a uint8. The type cannot be named
// from outside crypto/tls, so the value is read through reflection.
func asTLSAlert(err error) (alert uint8, ok bool) {
	var netErr *net.OpError
	if !errors.As(err, &netErr) {
		return 0, false
	}
	if netErr.Op != "remote error" || netErr.Err == nil {
		return 0, false
	}

	// Match on both the underlying kind and the type name: other uint8-based
	// error types must not be mistaken for an alert.
	inner := reflect.ValueOf(netErr.Err)
	isAlert := inner.Kind() == reflect.Uint8 && inner.Type().Name() == "alert"
	if !isAlert {
		return 0, false
	}
	return uint8(inner.Uint()), true
}
// ../rfc/4954:139 // ../rfc/4954:139
func (c *conn) cmdAuth(p *parser) { func (c *conn) cmdAuth(p *parser) {
c.xneedHello() c.xneedHello()

View file

@ -10,7 +10,8 @@ import (
"strings" "strings"
) )
func formatAlert(alert uint8) string { // FormatAlert formats a TLS alert in the form "alert-<num>" or "alert-<num>-<shortcode>".
func FormatAlert(alert uint8) string {
s := fmt.Sprintf("alert-%d", alert) s := fmt.Sprintf("alert-%d", alert)
err := tls.AlertError(alert) // Since go1.21.0 err := tls.AlertError(alert) // Since go1.21.0
// crypto/tls returns messages like "tls: short message" or "tls: alert(321)". // crypto/tls returns messages like "tls: short message" or "tls: alert(321)".

View file

@ -8,6 +8,7 @@ import (
"fmt" "fmt"
) )
func formatAlert(alert uint8) string { // FormatAlert formats a TLS alert in the form "alert-<num>".
func FormatAlert(alert uint8) string {
return fmt.Sprintf("alert-%d", alert) return fmt.Sprintf("alert-%d", alert)
} }

View file

@ -394,7 +394,7 @@ func TLSFailureDetails(err error) (ResultType, string) {
// todo: ideally, crypto/tls would let us check if this is an alert. it could be another uint8-typed error. // todo: ideally, crypto/tls would let us check if this is an alert. it could be another uint8-typed error.
v := reflect.ValueOf(netErr.Err) v := reflect.ValueOf(netErr.Err)
if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" { if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" {
reasonCode = "tls-remote-" + formatAlert(uint8(v.Uint())) reasonCode = "tls-remote-" + FormatAlert(uint8(v.Uint()))
} }
} }
return ResultValidationFailure, reasonCode return ResultValidationFailure, reasonCode
@ -429,7 +429,7 @@ func TLSFailureDetails(err error) (ResultType, string) {
} }
v := reflect.ValueOf(err) v := reflect.ValueOf(err)
if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" { if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" {
reasonCode = "tls-local-" + formatAlert(uint8(v.Uint())) reasonCode = "tls-local-" + FormatAlert(uint8(v.Uint()))
} }
} }
return ResultValidationFailure, reasonCode return ResultValidationFailure, reasonCode