mirror of
https://github.com/mjl-/mox.git
synced 2025-01-14 01:06:27 +03:00
893a6f8911
we were already accepting, processing and displaying incoming tls reports. now we start tracking TLS connection and security-policy-related errors for outgoing message deliveries as well. we send reports once a day, to the reporting addresses specified in TLSRPT records (rua) of a policy domain. these reports are about MTA-STS policies and/or DANE policies, and about STARTTLS-related failures. sending reports is enabled by default, but can be disabled through setting NoOutgoingTLSReports in mox.conf. only at the end of the implementation process came the realization that the TLSRPT policy domain for DANE (MX) hosts is separate from the TLSRPT policy for the recipient domain, and that MTA-STS and DANE TLS/policy results are typically delivered in separate reports. so MX hosts need their own TLSRPT policies. config for the per-host TLSRPT policy should be added to mox.conf for existing installs, in field HostTLSRPT. it is automatically configured by quickstart for new installs. with a HostTLSRPT config, the "dns records" and "dns check" admin pages now suggest the per-host TLSRPT record. by creating that record, you're requesting TLS reports about your MX host. gathering all the TLS/policy results is somewhat tricky. the tentacles go throughout the code. the positive result is that the TLS/policy-related code had to be cleaned up a bit. for example, the smtpclient TLS modes now reflect reality better, with independent settings about whether PKIX and/or DANE verification has to be done, and/or whether verification errors have to be ignored (e.g. for tls-required: no header). also, cached mtasts policies of mode "none" are now cleaned up once the MTA-STS DNS record goes away.
183 lines
5.4 KiB
Go
183 lines
5.4 KiB
Go
package mtastsdb
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
mathrand "math/rand"
|
|
"runtime/debug"
|
|
"time"
|
|
|
|
"github.com/mjl-/bstore"
|
|
|
|
"github.com/mjl-/mox/dns"
|
|
"github.com/mjl-/mox/metrics"
|
|
"github.com/mjl-/mox/mlog"
|
|
"github.com/mjl-/mox/mox-"
|
|
"github.com/mjl-/mox/mtasts"
|
|
)
|
|
|
|
func refresh() int {
|
|
interval := 24 * time.Hour
|
|
ticker := time.NewTicker(interval)
|
|
defer ticker.Stop()
|
|
|
|
var refreshed int
|
|
|
|
// Pro-actively refresh policies every 24 hours. ../rfc/8461:583
|
|
for {
|
|
ticker.Reset(interval)
|
|
|
|
ctx := context.WithValue(mox.Context, mlog.CidKey, mox.Cid())
|
|
n, err := refresh1(ctx, dns.StrictResolver{Pkg: "mtastsdb"}, time.Sleep)
|
|
if err != nil {
|
|
xlog.WithContext(ctx).Errorx("periodic refresh of cached mtasts policies", err)
|
|
}
|
|
if n > 0 {
|
|
refreshed += n
|
|
}
|
|
|
|
select {
|
|
case <-mox.Shutdown.Done():
|
|
return refreshed
|
|
case <-ticker.C:
|
|
}
|
|
}
|
|
}
|
|
|
|
// refresh policies that have not been updated in the past 12 hours and remove
|
|
// policies not used for 180 days. We start with the first domain immediately, so
|
|
// an admin can see any (configuration) issues that are logged. We spread the
|
|
// refreshes evenly over the next 3 hours, randomizing the domains, and we add some
|
|
// jitter to the timing. Each refresh is done in a new goroutine, so a single slow
|
|
// refresh doesn't mess up the timing.
|
|
func refresh1(ctx context.Context, resolver dns.Resolver, sleep func(d time.Duration)) (int, error) {
|
|
db, err := database(ctx)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
now := timeNow()
|
|
qdel := bstore.QueryDB[PolicyRecord](ctx, db)
|
|
qdel.FilterLess("LastUse", now.Add(-180*24*time.Hour))
|
|
if _, err := qdel.Delete(); err != nil {
|
|
return 0, fmt.Errorf("deleting old unused policies: %s", err)
|
|
}
|
|
|
|
qup := bstore.QueryDB[PolicyRecord](ctx, db)
|
|
qup.FilterLess("LastUpdate", now.Add(-12*time.Hour))
|
|
prs, err := qup.List()
|
|
if err != nil {
|
|
return 0, fmt.Errorf("querying policies to refresh: %s", err)
|
|
}
|
|
|
|
if len(prs) == 0 {
|
|
// Nothing to do.
|
|
return 0, nil
|
|
}
|
|
|
|
// Randomize list.
|
|
rand := mathrand.New(mathrand.NewSource(time.Now().UnixNano()))
|
|
for i := range prs {
|
|
if i == 0 {
|
|
continue
|
|
}
|
|
j := rand.Intn(i + 1)
|
|
prs[i], prs[j] = prs[j], prs[i]
|
|
}
|
|
|
|
// Launch goroutine with the refresh.
|
|
xlog.WithContext(ctx).Debug("will refresh mta-sts policies over next 3 hours", mlog.Field("count", len(prs)))
|
|
start := timeNow()
|
|
for i, pr := range prs {
|
|
go refreshDomain(ctx, db, resolver, pr)
|
|
if i < len(prs)-1 {
|
|
interval := 3 * int64(time.Hour) / int64(len(prs)-1)
|
|
extra := time.Duration(rand.Int63n(interval) - interval/2)
|
|
next := start.Add(time.Duration(int64(i+1)*interval) + extra)
|
|
d := next.Sub(timeNow())
|
|
if d > 0 {
|
|
sleep(d)
|
|
}
|
|
}
|
|
}
|
|
return len(prs), nil
|
|
}
|
|
|
|
func refreshDomain(ctx context.Context, db *bstore.DB, resolver dns.Resolver, pr PolicyRecord) {
|
|
log := xlog.WithContext(ctx)
|
|
defer func() {
|
|
x := recover()
|
|
if x != nil {
|
|
// Should not happen, but make sure errors don't take down the application.
|
|
log.Error("refresh1", mlog.Field("panic", x))
|
|
debug.PrintStack()
|
|
metrics.PanicInc(metrics.Mtastsdb)
|
|
}
|
|
}()
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, time.Minute)
|
|
defer cancel()
|
|
|
|
d, err := dns.ParseDomain(pr.Domain)
|
|
if err != nil {
|
|
log.Errorx("refreshing mta-sts policy: parsing policy domain", err, mlog.Field("domain", d))
|
|
return
|
|
}
|
|
log.Debug("refreshing mta-sts policy for domain", mlog.Field("domain", d))
|
|
record, _, err := mtasts.LookupRecord(ctx, resolver, d)
|
|
if err == nil && record.ID == pr.RecordID {
|
|
qup := bstore.QueryDB[PolicyRecord](ctx, db)
|
|
qup.FilterNonzero(PolicyRecord{Domain: pr.Domain, LastUpdate: pr.LastUpdate})
|
|
now := timeNow()
|
|
update := PolicyRecord{
|
|
LastUpdate: now,
|
|
ValidEnd: now.Add(time.Duration(pr.MaxAgeSeconds) * time.Second),
|
|
}
|
|
if n, err := qup.UpdateNonzero(update); err != nil {
|
|
log.Errorx("updating refreshed, unmodified policy in database", err)
|
|
} else if n != 1 {
|
|
log.Info("expected to update 1 policy after refresh", mlog.Field("count", n))
|
|
}
|
|
return
|
|
}
|
|
if err != nil && pr.Mode == mtasts.ModeNone {
|
|
if errors.Is(err, mtasts.ErrNoRecord) {
|
|
// Policy was in mode "none". Now it doesn't have a policy anymore. Remove from our
|
|
// database so we don't keep refreshing it.
|
|
err := db.Delete(ctx, &pr)
|
|
log.Check(err, "removing mta-sts policy with mode none, dns record is gone")
|
|
}
|
|
// Else, don't bother operator with temporary error about policy none.
|
|
// ../rfc/8461:587
|
|
return
|
|
} else if err != nil {
|
|
log.Errorx("looking up mta-sts record for domain", err, mlog.Field("domain", d))
|
|
// Try to fetch new policy. It could be just DNS that is down. We don't want to let our policy expire.
|
|
}
|
|
|
|
p, _, err := mtasts.FetchPolicy(ctx, d)
|
|
if err != nil {
|
|
if !errors.Is(err, mtasts.ErrNoPolicy) || pr.Mode != mtasts.ModeNone {
|
|
log.Errorx("refreshing mtasts policy for domain", err, mlog.Field("domain", d))
|
|
}
|
|
return
|
|
}
|
|
now := timeNow()
|
|
update := map[string]any{
|
|
"LastUpdate": now,
|
|
"ValidEnd": now.Add(time.Duration(p.MaxAgeSeconds) * time.Second),
|
|
"Backoff": false,
|
|
"Policy": *p,
|
|
}
|
|
if record != nil {
|
|
update["RecordID"] = record.ID
|
|
}
|
|
qup := bstore.QueryDB[PolicyRecord](ctx, db)
|
|
qup.FilterNonzero(PolicyRecord{Domain: pr.Domain, LastUpdate: pr.LastUpdate})
|
|
if n, err := qup.UpdateFields(update); err != nil {
|
|
log.Errorx("updating refreshed, modified policy in database", err)
|
|
} else if n != 1 {
|
|
log.Info("updating refreshed, did not update 1 policy", mlog.Field("count", n))
|
|
}
|
|
}
|