mirror of
https://github.com/mjl-/mox.git
synced 2024-12-25 16:03:48 +03:00
smtpserver: add prometheus metric for failing starttls handshakes for incoming deliveries
and add an alerting rule if the failure rate becomes >10% (e.g. expired certificate). the prometheus metric includes a reason, including potential tls alerts, if remote smtp clients would send those (openssl s_client -starttls does). inspired by issue #237, where incoming connections were aborted by remote. such errors would show up as "eof" in the metrics.
This commit is contained in:
parent
09e7ddba9e
commit
afb182cb14
5 changed files with 63 additions and 5 deletions
|
@ -62,9 +62,14 @@ groups:
|
||||||
# the alerts below can be used to keep a closer eye or when starting to use mox,
|
# the alerts below can be used to keep a closer eye or when starting to use mox,
|
||||||
# but can be noisy, or you may not be able to prevent them.
|
# but can be noisy, or you may not be able to prevent them.
|
||||||
|
|
||||||
|
- alert: mox-incoming-delivery-starttls-errors
|
||||||
|
expr: sum by (instance) (increase(mox_smtpserver_delivery_starttls_errors_total[1h])) / sum by (instance) (increase(mox_smtpserver_delivery_starttls_total[1h])) > 0.1
|
||||||
|
annotations:
|
||||||
|
summary: starttls handshake errors for >10% of incoming smtp delivery connections
|
||||||
|
|
||||||
# change period to match your expected incoming message rate.
|
# change period to match your expected incoming message rate.
|
||||||
- alert: mox-no-deliveries
|
- alert: mox-no-deliveries
|
||||||
expr: sum(rate(mox_smtpserver_delivery_total{result="delivered"}[6h])) == 0
|
expr: sum by (instance) (rate(mox_smtpserver_delivery_total{result="delivered"}[6h])) == 0
|
||||||
annotations:
|
annotations:
|
||||||
summary: no mail delivered for 6 hours
|
summary: no mail delivered for 6 hours
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,7 @@ import (
|
||||||
"net"
|
"net"
|
||||||
"net/textproto"
|
"net/textproto"
|
||||||
"os"
|
"os"
|
||||||
|
"reflect"
|
||||||
"runtime/debug"
|
"runtime/debug"
|
||||||
"slices"
|
"slices"
|
||||||
"sort"
|
"sort"
|
||||||
|
@ -59,6 +60,7 @@ import (
|
||||||
"github.com/mjl-/mox/smtp"
|
"github.com/mjl-/mox/smtp"
|
||||||
"github.com/mjl-/mox/spf"
|
"github.com/mjl-/mox/spf"
|
||||||
"github.com/mjl-/mox/store"
|
"github.com/mjl-/mox/store"
|
||||||
|
"github.com/mjl-/mox/tlsrpt"
|
||||||
"github.com/mjl-/mox/tlsrptdb"
|
"github.com/mjl-/mox/tlsrptdb"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -171,6 +173,21 @@ var (
|
||||||
"error",
|
"error",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
metricDeliveryStarttls = promauto.NewCounter(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Name: "mox_smtpserver_delivery_starttls_total",
|
||||||
|
Help: "Total number of STARTTLS handshakes for incoming deliveries.",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
metricDeliveryStarttlsErrors = promauto.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Name: "mox_smtpserver_delivery_starttls_errors_total",
|
||||||
|
Help: "Errors with TLS handshake during STARTTLS for incoming deliveries.",
|
||||||
|
},
|
||||||
|
[]string{
|
||||||
|
"reason", // "eof", "sslv2", "unsupportedversions", "nottls", "alert-<num>-<msg>", "other"
|
||||||
|
},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
var jitterRand = mox.NewPseudoRand()
|
var jitterRand = mox.NewPseudoRand()
|
||||||
|
@ -955,7 +972,25 @@ func (c *conn) cmdStarttls(p *parser) {
|
||||||
ctx, cancel := context.WithTimeout(cidctx, time.Minute)
|
ctx, cancel := context.WithTimeout(cidctx, time.Minute)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
c.log.Debug("starting tls server handshake")
|
c.log.Debug("starting tls server handshake")
|
||||||
|
metricDeliveryStarttls.Inc()
|
||||||
if err := tlsConn.HandshakeContext(ctx); err != nil {
|
if err := tlsConn.HandshakeContext(ctx); err != nil {
|
||||||
|
// Errors from crypto/tls mostly aren't typed. We'll have to look for strings...
|
||||||
|
reason := "other"
|
||||||
|
if errors.Is(err, io.EOF) {
|
||||||
|
reason = "eof"
|
||||||
|
} else if alert, ok := asTLSAlert(err); ok {
|
||||||
|
reason = tlsrpt.FormatAlert(alert)
|
||||||
|
} else {
|
||||||
|
s := err.Error()
|
||||||
|
if strings.Contains(s, "tls: client offered only unsupported versions") {
|
||||||
|
reason = "unsupportedversions"
|
||||||
|
} else if strings.Contains(s, "tls: first record does not look like a TLS handshake") {
|
||||||
|
reason = "nottls"
|
||||||
|
} else if strings.Contains(s, "tls: unsupported SSLv2 handshake received") {
|
||||||
|
reason = "sslv2"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
metricDeliveryStarttlsErrors.WithLabelValues(reason).Inc()
|
||||||
panic(fmt.Errorf("starttls handshake: %s (%w)", err, errIO))
|
panic(fmt.Errorf("starttls handshake: %s (%w)", err, errIO))
|
||||||
}
|
}
|
||||||
cancel()
|
cancel()
|
||||||
|
@ -971,6 +1006,22 @@ func (c *conn) cmdStarttls(p *parser) {
|
||||||
c.tls = true
|
c.tls = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func asTLSAlert(err error) (alert uint8, ok bool) {
|
||||||
|
// If the remote client aborts the connection, it can send an alert indicating why.
|
||||||
|
// crypto/tls gives us a net.OpError with "Op" set to "remote error", and an Err
|
||||||
|
// with the unexported type "alert", a uint8. So we try to read it.
|
||||||
|
|
||||||
|
var opErr *net.OpError
|
||||||
|
if !errors.As(err, &opErr) || opErr.Op != "remote error" || opErr.Err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
v := reflect.ValueOf(opErr.Err)
|
||||||
|
if v.Kind() != reflect.Uint8 || v.Type().Name() != "alert" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
return uint8(v.Uint()), true
|
||||||
|
}
|
||||||
|
|
||||||
// ../rfc/4954:139
|
// ../rfc/4954:139
|
||||||
func (c *conn) cmdAuth(p *parser) {
|
func (c *conn) cmdAuth(p *parser) {
|
||||||
c.xneedHello()
|
c.xneedHello()
|
||||||
|
|
|
@ -10,7 +10,8 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
func formatAlert(alert uint8) string {
|
// FormatAlert formats a TLS alert in the form "alert-<num>" or "alert-<num>-<shortcode>".
|
||||||
|
func FormatAlert(alert uint8) string {
|
||||||
s := fmt.Sprintf("alert-%d", alert)
|
s := fmt.Sprintf("alert-%d", alert)
|
||||||
err := tls.AlertError(alert) // Since go1.21.0
|
err := tls.AlertError(alert) // Since go1.21.0
|
||||||
// crypto/tls returns messages like "tls: short message" or "tls: alert(321)".
|
// crypto/tls returns messages like "tls: short message" or "tls: alert(321)".
|
||||||
|
|
|
@ -8,6 +8,7 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
)
|
)
|
||||||
|
|
||||||
func formatAlert(alert uint8) string {
|
// FormatAlert formats a TLS alert in the form "alert-<num>".
|
||||||
|
func FormatAlert(alert uint8) string {
|
||||||
return fmt.Sprintf("alert-%d", alert)
|
return fmt.Sprintf("alert-%d", alert)
|
||||||
}
|
}
|
||||||
|
|
|
@ -394,7 +394,7 @@ func TLSFailureDetails(err error) (ResultType, string) {
|
||||||
// todo: ideally, crypto/tls would let us check if this is an alert. it could be another uint8-typed error.
|
// todo: ideally, crypto/tls would let us check if this is an alert. it could be another uint8-typed error.
|
||||||
v := reflect.ValueOf(netErr.Err)
|
v := reflect.ValueOf(netErr.Err)
|
||||||
if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" {
|
if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" {
|
||||||
reasonCode = "tls-remote-" + formatAlert(uint8(v.Uint()))
|
reasonCode = "tls-remote-" + FormatAlert(uint8(v.Uint()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ResultValidationFailure, reasonCode
|
return ResultValidationFailure, reasonCode
|
||||||
|
@ -429,7 +429,7 @@ func TLSFailureDetails(err error) (ResultType, string) {
|
||||||
}
|
}
|
||||||
v := reflect.ValueOf(err)
|
v := reflect.ValueOf(err)
|
||||||
if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" {
|
if v.Kind() == reflect.Uint8 && v.Type().Name() == "alert" {
|
||||||
reasonCode = "tls-local-" + formatAlert(uint8(v.Uint()))
|
reasonCode = "tls-local-" + FormatAlert(uint8(v.Uint()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ResultValidationFailure, reasonCode
|
return ResultValidationFailure, reasonCode
|
||||||
|
|
Loading…
Reference in a new issue