mox/mtastsdb/refresh.go

package mtastsdb

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	mathrand "math/rand"
	"runtime/debug"
	"time"

	"github.com/mjl-/bstore"

	"github.com/mjl-/mox/dns"
	"github.com/mjl-/mox/metrics"
	"github.com/mjl-/mox/mlog"
	"github.com/mjl-/mox/mox-"
	"github.com/mjl-/mox/mtasts"
)

// refresh loops until shutdown, running a refresh round of the cached MTA-STS
// policies immediately and then again each time the 24 hour ticker fires. It
// returns the sum of the counts reported by refresh1 over all rounds.
func refresh() int {
	const period = 24 * time.Hour

	tick := time.NewTicker(period)
	defer tick.Stop()

	total := 0

	// Pro-actively refresh policies every 24 hours. ../rfc/8461:583
	for {
		// Restart the period before the round begins, so the wait below is
		// measured from the start of this round rather than from its end.
		tick.Reset(period)

		log := mlog.New("mtastsdb", nil).WithCid(mox.Cid())
		count, err := refresh1(mox.Context, log, dns.StrictResolver{Pkg: "mtastsdb"}, time.Sleep)
		log.Check(err, "periodic refresh of cached mtasts policies")
		if count > 0 {
			total += count
		}

		// Wait for the next round, or stop when shutdown is signalled.
		select {
		case <-tick.C:
			continue
		case <-mox.Shutdown.Done():
			return total
		}
	}
}

// refresh1 removes policies that have not been used for 180 days and refreshes
// policies that have not been updated in the past 12 hours. We start with the
// first domain immediately, so an admin can see any (configuration) issues that
// are logged. We spread the refreshes evenly over the next 3 hours, randomizing
// the domains, and we add some jitter to the timing. Each refresh is done in a
// new goroutine, so a single slow refresh doesn't mess up the timing.
//
// The sleep function is called to wait between launching refreshes; it is a
// parameter so callers can substitute their own implementation for time.Sleep.
//
// refresh1 returns the number of policies for which a refresh was launched. It
// does not wait for the launched goroutines to finish. On a database error, it
// returns 0 and an error wrapping the underlying cause.
func refresh1(ctx context.Context, log mlog.Log, resolver dns.Resolver, sleep func(d time.Duration)) (int, error) {
	now := timeNow()

	// Drop policies that haven't been used in the past 180 days.
	qdel := bstore.QueryDB[PolicyRecord](ctx, DB)
	qdel.FilterLess("LastUse", now.Add(-180*24*time.Hour))
	if _, err := qdel.Delete(); err != nil {
		return 0, fmt.Errorf("deleting old unused policies: %w", err)
	}

	// Gather policies last updated more than 12 hours ago.
	qup := bstore.QueryDB[PolicyRecord](ctx, DB)
	qup.FilterLess("LastUpdate", now.Add(-12*time.Hour))
	prs, err := qup.List()
	if err != nil {
		return 0, fmt.Errorf("querying policies to refresh: %w", err)
	}

	if len(prs) == 0 {
		// Nothing to do.
		return 0, nil
	}

	// Randomize list.
	rand := mathrand.New(mathrand.NewSource(time.Now().UnixNano()))
	rand.Shuffle(len(prs), func(i, j int) {
		prs[i], prs[j] = prs[j], prs[i]
	})

	// Nominal spacing between consecutive launches, in nanoseconds. With a single
	// policy there is nothing to space out, and computing it would divide by zero.
	var interval int64
	if len(prs) > 1 {
		interval = 3 * int64(time.Hour) / int64(len(prs)-1)
	}

	// Launch goroutine with the refresh.
	log.Debug("will refresh mta-sts policies over next 3 hours", slog.Int("count", len(prs)))
	start := timeNow()
	for i, pr := range prs {
		go refreshDomain(ctx, log, DB, resolver, pr)
		if i == len(prs)-1 {
			// No need to wait after the last launch.
			break
		}

		// Schedule the next launch relative to start, not to the previous launch, so
		// delays don't accumulate. Jitter is in [-interval/2, interval/2).
		extra := time.Duration(rand.Int63n(interval) - interval/2)
		next := start.Add(time.Duration(int64(i+1)*interval) + extra)
		if d := next.Sub(timeNow()); d > 0 {
			sleep(d)
		}
	}
	return len(prs), nil
}

// refreshDomain refreshes the cached MTA-STS policy in pr, with an overall
// timeout of one minute. It returns nothing; problems are only logged.
//
// It first looks up the MTA-STS DNS record for the domain:
//   - If the lookup succeeds and the record ID equals the stored RecordID, only
//     LastUpdate and ValidEnd of the stored policy are bumped.
//   - If the lookup fails and the stored policy has mode "none", the stored
//     policy is deleted when the error is mtasts.ErrNoRecord, and otherwise left
//     alone without logging an error.
//   - In all other cases the policy is fetched again and, on success, stored
//     along with the new record ID (if the DNS lookup returned a record).
//
// Database updates are filtered on both Domain and the LastUpdate value from pr,
// so they only match a record that still has the LastUpdate we started with.
//
// A panic is recovered, logged and counted in metrics, so that a failing refresh
// cannot take down the application.
func refreshDomain(ctx context.Context, log mlog.Log, db *bstore.DB, resolver dns.Resolver, pr PolicyRecord) {
	defer func() {
		x := recover()
		if x != nil {
			// Should not happen, but make sure errors don't take down the application.
			log.Error("refresh1", slog.Any("panic", x))
			debug.PrintStack()
			metrics.PanicInc(metrics.Mtastsdb)
		}
	}()

	ctx, cancel := context.WithTimeout(ctx, time.Minute)
	defer cancel()

	d, err := dns.ParseDomain(pr.Domain)
	if err != nil {
		// Log the stored domain string: d is not a usable value after a failed parse.
		log.Errorx("refreshing mta-sts policy: parsing policy domain", err, slog.String("domain", pr.Domain))
		return
	}
	log.Debug("refreshing mta-sts policy for domain", slog.Any("domain", d))
	record, _, err := mtasts.LookupRecord(ctx, log.Logger, resolver, d)
	if err == nil && record.ID == pr.RecordID {
		// Record ID is unchanged, so we keep the policy we have and only extend its
		// validity, based on the max age of the stored policy.
		qup := bstore.QueryDB[PolicyRecord](ctx, db)
		qup.FilterNonzero(PolicyRecord{Domain: pr.Domain, LastUpdate: pr.LastUpdate})
		now := timeNow()
		update := PolicyRecord{
			LastUpdate: now,
			ValidEnd:   now.Add(time.Duration(pr.MaxAgeSeconds) * time.Second),
		}
		if n, err := qup.UpdateNonzero(update); err != nil {
			log.Errorx("updating refreshed, unmodified policy in database", err)
		} else if n != 1 {
			log.Info("expected to update 1 policy after refresh", slog.Int("count", n))
		}
		return
	}
	if err != nil && pr.Mode == mtasts.ModeNone {
		if errors.Is(err, mtasts.ErrNoRecord) {
			// Policy was in mode "none". Now it doesn't have a policy anymore. Remove from our
			// database so we don't keep refreshing it.
			err := db.Delete(ctx, &pr)
			log.Check(err, "removing mta-sts policy with mode none, dns record is gone")
		}
		// Else, don't bother operator with temporary error about policy none.
		// ../rfc/8461:587
		return
	} else if err != nil {
		log.Errorx("looking up mta-sts record for domain", err, slog.Any("domain", d))
		// Try to fetch new policy. It could be just DNS that is down. We don't want to let our policy expire.
	}

	p, _, err := mtasts.FetchPolicy(ctx, log.Logger, d)
	if err != nil {
		// Stay quiet only when there is no policy and our stored policy had mode "none".
		if !errors.Is(err, mtasts.ErrNoPolicy) || pr.Mode != mtasts.ModeNone {
			log.Errorx("refreshing mtasts policy for domain", err, slog.Any("domain", d))
		}
		return
	}

	// Store the freshly fetched policy. A map with UpdateFields is used, so that
	// Backoff is explicitly written as false.
	now := timeNow()
	update := map[string]any{
		"LastUpdate": now,
		"ValidEnd":   now.Add(time.Duration(p.MaxAgeSeconds) * time.Second),
		"Backoff":    false,
		"Policy":     *p,
	}
	if record != nil {
		// Only replace the record ID when the DNS lookup actually returned a record.
		update["RecordID"] = record.ID
	}
	qup := bstore.QueryDB[PolicyRecord](ctx, db)
	qup.FilterNonzero(PolicyRecord{Domain: pr.Domain, LastUpdate: pr.LastUpdate})
	if n, err := qup.UpdateFields(update); err != nil {
		log.Errorx("updating refreshed, modified policy in database", err)
	} else if n != 1 {
		log.Info("updating refreshed, did not update 1 policy", slog.Int("count", n))
	}
}
mox! 2023-01-30 16:27:06 +03:00			`package mtastsdb`

			`import (`
			`"context"`
			`"errors"`
			`"fmt"`
replace packages slog and slices from golang.org/x/exp with stdlib since we are now at go1.21 as minimum. 2024-02-08 16:49:01 +03:00			`"log/slog"`
mox! 2023-01-30 16:27:06 +03:00			`mathrand "math/rand"`
			`"runtime/debug"`
			`"time"`

			`"github.com/mjl-/bstore"`

			`"github.com/mjl-/mox/dns"`
			`"github.com/mjl-/mox/metrics"`
			`"github.com/mjl-/mox/mlog"`
			`"github.com/mjl-/mox/mox-"`
			`"github.com/mjl-/mox/mtasts"`
			`)`

			`func refresh() int {`
			`interval := 24 * time.Hour`
			`ticker := time.NewTicker(interval)`
			`defer ticker.Stop()`

			`var refreshed int`

			`// Pro-actively refresh policies every 24 hours. ../rfc/8461:583`
			`for {`
			`ticker.Reset(interval)`

switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`log := mlog.New("mtastsdb", nil).WithCid(mox.Cid())`
			`n, err := refresh1(mox.Context, log, dns.StrictResolver{Pkg: "mtastsdb"}, time.Sleep)`
			`log.Check(err, "periodic refresh of cached mtasts policies")`
mox! 2023-01-30 16:27:06 +03:00			`if n > 0 {`
			`refreshed += n`
			`}`

			`select {`
add functionality to import zip/tgz with maildirs/mboxes to account page so users can easily take their email out of somewhere else, and import it into mox. this goes a little way to give feedback as the import progresses: upload progress is shown (surprisingly, browsers aren't doing this...), imported mailboxes/messages are counted (batched) and import issues/warnings are displayed, all sent over an SSE connection. an import token is stored in sessionstorage. if you reload the page (e.g. after a connection error), the browser will reconnect to the running import and show its progress again. and you can just abort the import before it is finished and committed, and nothing will have changed. this also imports flags/keywords from mbox files. 2023-02-16 11:57:27 +03:00			`case <-mox.Shutdown.Done():`
mox! 2023-01-30 16:27:06 +03:00			`return refreshed`
			`case <-ticker.C:`
			`}`
			`}`
			`}`

			`// refresh policies that have not been updated in the past 12 hours and remove`
			`// policies not used for 180 days. We start with the first domain immediately, so`
			`// an admin can see any (configuration) issues that are logged. We spread the`
			`// refreshes evenly over the next 3 hours, randomizing the domains, and we add some`
			`// jitter to the timing. Each refresh is done in a new goroutine, so a single slow`
			`// refresh doesn't mess up the timing.`
switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`func refresh1(ctx context.Context, log mlog.Log, resolver dns.Resolver, sleep func(d time.Duration)) (int, error) {`
mox! 2023-01-30 16:27:06 +03:00			`now := timeNow()`
add debug logging about bstore db schema upgrades bstore was updated to v0.0.6 to add this logging. this simplifies some of the db-handling code in mtastsdb,tlsrptdb,dmarcdb. we now call the package-level Init() and Close() in all tests properly. 2024-05-10 15:44:37 +03:00			`qdel := bstore.QueryDB[PolicyRecord](ctx, DB)`
mox! 2023-01-30 16:27:06 +03:00			`qdel.FilterLess("LastUse", now.Add(-180*24*time.Hour))`
			`if _, err := qdel.Delete(); err != nil {`
			`return 0, fmt.Errorf("deleting old unused policies: %s", err)`
			`}`

add debug logging about bstore db schema upgrades bstore was updated to v0.0.6 to add this logging. this simplifies some of the db-handling code in mtastsdb,tlsrptdb,dmarcdb. we now call the package-level Init() and Close() in all tests properly. 2024-05-10 15:44:37 +03:00			`qup := bstore.QueryDB[PolicyRecord](ctx, DB)`
mox! 2023-01-30 16:27:06 +03:00			`qup.FilterLess("LastUpdate", now.Add(-12*time.Hour))`
			`prs, err := qup.List()`
			`if err != nil {`
			`return 0, fmt.Errorf("querying policies to refresh: %s", err)`
			`}`

			`if len(prs) == 0 {`
			`// Nothing to do.`
			`return 0, nil`
			`}`

			`// Randomize list.`
			`rand := mathrand.New(mathrand.NewSource(time.Now().UnixNano()))`
			`for i := range prs {`
			`if i == 0 {`
			`continue`
			`}`
			`j := rand.Intn(i + 1)`
			`prs[i], prs[j] = prs[j], prs[i]`
			`}`

			`// Launch goroutine with the refresh.`
switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`log.Debug("will refresh mta-sts policies over next 3 hours", slog.Int("count", len(prs)))`
mox! 2023-01-30 16:27:06 +03:00			`start := timeNow()`
			`for i, pr := range prs {`
add debug logging about bstore db schema upgrades bstore was updated to v0.0.6 to add this logging. this simplifies some of the db-handling code in mtastsdb,tlsrptdb,dmarcdb. we now call the package-level Init() and Close() in all tests properly. 2024-05-10 15:44:37 +03:00			`go refreshDomain(ctx, log, DB, resolver, pr)`
mox! 2023-01-30 16:27:06 +03:00			`if i < len(prs)-1 {`
			`interval := 3 * int64(time.Hour) / int64(len(prs)-1)`
			`extra := time.Duration(rand.Int63n(interval) - interval/2)`
			`next := start.Add(time.Duration(int64(i+1)*interval) + extra)`
			`d := next.Sub(timeNow())`
			`if d > 0 {`
			`sleep(d)`
			`}`
			`}`
			`}`
			`return len(prs), nil`
			`}`

switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`func refreshDomain(ctx context.Context, log mlog.Log, db *bstore.DB, resolver dns.Resolver, pr PolicyRecord) {`
mox! 2023-01-30 16:27:06 +03:00			`defer func() {`
			`x := recover()`
			`if x != nil {`
			`// Should not happen, but make sure errors don't take down the application.`
switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`log.Error("refresh1", slog.Any("panic", x))`
mox! 2023-01-30 16:27:06 +03:00			`debug.PrintStack()`
initialize metric mox_panic_total with 0, so the alerting rule also catches the first panic for a label increase() and rate() don't seem to assume a previous value of 0 when a vector gets a first value for a label. you would think that an increase() on a first-value mox_panic_total{"..."}=1 would return 1, and similar for rate(), but that doesn't appear to be the behaviour. so we just explicitly initialize the count to 0 for each possible label value. mox has more vector metrics, but panics feels like the most important, and it's too much code to initialize them all, for all combinations of label values. there is probably a better way that fixes this for all cases... 2023-09-15 17:47:17 +03:00			`metrics.PanicInc(metrics.Mtastsdb)`
mox! 2023-01-30 16:27:06 +03:00			`}`
			`}()`

			`ctx, cancel := context.WithTimeout(ctx, time.Minute)`
			`defer cancel()`

			`d, err := dns.ParseDomain(pr.Domain)`
			`if err != nil {`
switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`log.Errorx("refreshing mta-sts policy: parsing policy domain", err, slog.Any("domain", d))`
mox! 2023-01-30 16:27:06 +03:00			`return`
			`}`
switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`log.Debug("refreshing mta-sts policy for domain", slog.Any("domain", d))`
			`record, _, err := mtasts.LookupRecord(ctx, log.Logger, resolver, d)`
mox! 2023-01-30 16:27:06 +03:00			`if err == nil && record.ID == pr.RecordID {`
update to latest bstore (with support for an index on a []string: Message.DKIMDomains), and cyclic data types (to be used for Message.Part soon); also adds a context.Context to database operations. 2023-05-22 15:40:36 +03:00			`qup := bstore.QueryDB[PolicyRecord](ctx, db)`
mox! 2023-01-30 16:27:06 +03:00			`qup.FilterNonzero(PolicyRecord{Domain: pr.Domain, LastUpdate: pr.LastUpdate})`
			`now := timeNow()`
			`update := PolicyRecord{`
			`LastUpdate: now,`
			`ValidEnd: now.Add(time.Duration(pr.MaxAgeSeconds) * time.Second),`
			`}`
			`if n, err := qup.UpdateNonzero(update); err != nil {`
			`log.Errorx("updating refreshed, unmodified policy in database", err)`
			`} else if n != 1 {`
switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`log.Info("expected to update 1 policy after refresh", slog.Int("count", n))`
mox! 2023-01-30 16:27:06 +03:00			`}`
			`return`
			`}`
			`if err != nil && pr.Mode == mtasts.ModeNone {`
implement outgoing tls reports we were already accepting, processing and displaying incoming tls reports. now we start tracking TLS connection and security-policy-related errors for outgoing message deliveries as well. we send reports once a day, to the reporting addresses specified in TLSRPT records (rua) of a policy domain. these reports are about MTA-STS policies and/or DANE policies, and about STARTTLS-related failures. sending reports is enabled by default, but can be disabled through setting NoOutgoingTLSReports in mox.conf. only at the end of the implementation process came the realization that the TLSRPT policy domain for DANE (MX) hosts are separate from the TLSRPT policy for the recipient domain, and that MTA-STS and DANE TLS/policy results are typically delivered in separate reports. so MX hosts need their own TLSRPT policies. config for the per-host TLSRPT policy should be added to mox.conf for existing installs, in field HostTLSRPT. it is automatically configured by quickstart for new installs. with a HostTLSRPT config, the "dns records" and "dns check" admin pages now suggest the per-host TLSRPT record. by creating that record, you're requesting TLS reports about your MX host. gathering all the TLS/policy results is somewhat tricky. the tentacles go throughout the code. the positive result is that the TLS/policy-related code had to be cleaned up a bit. for example, the smtpclient TLS modes now reflect reality better, with independent settings about whether PKIX and/or DANE verification has to be done, and/or whether verification errors have to be ignored (e.g. for tls-required: no header). also, cached mtasts policies of mode "none" are now cleaned up once the MTA-STS DNS record goes away. 2023-11-09 19:40:46 +03:00			`if errors.Is(err, mtasts.ErrNoRecord) {`
			`// Policy was in mode "none". Now it doesn't have a policy anymore. Remove from our`
			`// database so we don't keep refreshing it.`
			`err := db.Delete(ctx, &pr)`
			`log.Check(err, "removing mta-sts policy with mode none, dns record is gone")`
			`}`
			`// Else, don't bother operator with temporary error about policy none.`
			`// ../rfc/8461:587`
mox! 2023-01-30 16:27:06 +03:00			`return`
			`} else if err != nil {`
switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`log.Errorx("looking up mta-sts record for domain", err, slog.Any("domain", d))`
mox! 2023-01-30 16:27:06 +03:00			`// Try to fetch new policy. It could be just DNS that is down. We don't want to let our policy expire.`
			`}`

switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`p, _, err := mtasts.FetchPolicy(ctx, log.Logger, d)`
mox! 2023-01-30 16:27:06 +03:00			`if err != nil {`
			`if !errors.Is(err, mtasts.ErrNoPolicy) || pr.Mode != mtasts.ModeNone {`
switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`log.Errorx("refreshing mtasts policy for domain", err, slog.Any("domain", d))`
mox! 2023-01-30 16:27:06 +03:00			`}`
			`return`
			`}`
			`now := timeNow()`
			`update := map[string]any{`
			`"LastUpdate": now,`
			`"ValidEnd": now.Add(time.Duration(p.MaxAgeSeconds) * time.Second),`
			`"Backoff": false,`
			`"Policy": *p,`
			`}`
			`if record != nil {`
			`update["RecordID"] = record.ID`
			`}`
update to latest bstore (with support for an index on a []string: Message.DKIMDomains), and cyclic data types (to be used for Message.Part soon); also adds a context.Context to database operations. 2023-05-22 15:40:36 +03:00			`qup := bstore.QueryDB[PolicyRecord](ctx, db)`
mox! 2023-01-30 16:27:06 +03:00			`qup.FilterNonzero(PolicyRecord{Domain: pr.Domain, LastUpdate: pr.LastUpdate})`
			`if n, err := qup.UpdateFields(update); err != nil {`
			`log.Errorx("updating refreshed, modified policy in database", err)`
			`} else if n != 1 {`
switch to slog.Logger for logging, for easier reuse of packages by external software we don't want external software to include internal details like mlog. slog.Logger is/will be the standard. we still have mlog for its helper functions, and its handler that logs in concise logfmt used by mox. packages that are not meant for reuse still pass around mlog.Log for convenience. we use golang.org/x/exp/slog because we also support the previous Go toolchain version. with the next Go release, we'll switch to the builtin slog. 2023-12-05 15:35:58 +03:00			`log.Info("updating refreshed, did not update 1 policy", slog.Int("count", n))`
mox! 2023-01-30 16:27:06 +03:00			`}`
			`}`