mirror of
https://github.com/mjl-/mox.git
synced 2024-12-25 16:03:48 +03:00
32b549b260
we add various information while analysing an incoming message. like dkim/spf/ip reputation. and content-based junk filter threshold/result and ham/spam words used. for issue #179 by Fell and #157 by mattfbacon
430 lines
16 KiB
Go
430 lines
16 KiB
Go
package smtpserver
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/mjl-/bstore"
|
|
|
|
"github.com/mjl-/mox/dns"
|
|
"github.com/mjl-/mox/mlog"
|
|
"github.com/mjl-/mox/smtp"
|
|
"github.com/mjl-/mox/store"
|
|
)
|
|
|
|
// reputationMethod names the kind of historic evidence that a reputation
// decision was based on. The string values are what ends up in the reason/log
// output, so they are kept short and stable.
type reputationMethod string

const (
	// History of messages with the same full "message from" address.
	methodMsgfromFull reputationMethod = "msgfromfull"
	// We previously sent a message to the full "message from" address.
	methodMsgtoFull reputationMethod = "msgtofull"
	// History of messages with the same "message from" domain.
	methodMsgfromDomain reputationMethod = "msgfromdomain"
	// History of messages with the same "message from" organizational domain.
	methodMsgfromOrgDomain reputationMethod = "msgfromorgdomain"
	// We previously sent a message to an address at the "message from" domain.
	methodMsgtoDomain reputationMethod = "msgtodomain"
	// We previously sent a message to an address at the "message from"
	// organizational domain.
	methodMsgtoOrgDomain reputationMethod = "msgtoorgdomain"
	// History of messages with the same DKIM domain(s), SPF mailfrom or EHLO domain.
	methodDKIMSPF reputationMethod = "dkimspf"
	// History of messages from the same remote IP, at increasingly wide masks.
	methodIP1 reputationMethod = "ip1"
	methodIP2 reputationMethod = "ip2"
	methodIP3 reputationMethod = "ip3"
	// No usable history was found.
	methodNone reputationMethod = "none"
)
|
|
|
|
// Reputation returns whether message m is likely junk.
|
|
//
|
|
// This function is called after checking for a manually configured spf mailfrom
|
|
// allow (e.g. for mailing lists), and after checking for a dmarc reject policy.
|
|
//
|
|
// The decision is made based on historic messages delivered to the same
|
|
// destination mailbox, MailboxOrigID. Because each mailbox may have a different
|
|
// accept policy. We only use messages that have been marked as either junk or
|
|
// non-junk. We help users by automatically marking them as non-junk when moving to
|
|
// certain folders in the default config (e.g. the archive folder). We expect users
|
|
// to mark junk messages as such when they read it. And to keep it in their inbox,
|
|
// regular trash or archive if it is not.
|
|
//
|
|
// The basic idea is to keep accepting messages that were accepted in the past, and
|
|
// keep rejecting those that were rejected. This is relatively easy to check if
|
|
// mail passes SPF and/or DKIM with Message-From alignment. Regular email from
|
|
// known people will be let in. But spammers are trickier. They will use new IPs,
|
|
// (sub)domains, no or newly created SPF and/or DKIM identifiers, new localparts,
|
|
// etc. This function likely ends up returning "inconclusive" for such emails. The
|
|
// junkfilter will have to take care of a final decision.
|
|
//
|
|
// In case of doubt, it doesn't hurt much to accept another mail that a user has
|
|
// communicated successfully with in the past. If the most recent message is marked
|
|
// as junk that could have happened accidentally. If another message is let in, and
|
|
// it is again junk, future messages will be rejected.
|
|
//
|
|
// Actual spammers will probably try to use identifiers, i.e. (sub)domain, dkim/spf
|
|
// identifiers and ip addresses for which we have no history. We may only have
|
|
// ip-based reputation, perhaps only an ip range, perhaps nothing.
|
|
//
|
|
// Some profiles of first-time senders:
|
|
//
|
|
// - Individuals. They can typically get past the junkfilter if needed.
|
|
// - Transactional emails. They should get past the junkfilter. If they use one of
|
|
// the larger email service providers, their reputation could help. If the
|
|
// junkfilter rejects the message, users can recover the message from the Rejects
|
|
// mailbox. The first message is typically initiated by a user, e.g. by registering.
|
|
// - Desired commercial email will have to get past the junkfilter based on its
|
|
// content. There will typically be earlier communication with the (organizational)
|
|
// domain that would let the message through.
|
|
// - Mailing list. May get past the junkfilter. If delivery is to a separate
|
|
// mailbox, the junkfilter will let it in because of little history. Long enough to
|
|
// build reputation based on DKIM/SPF signals. Users are best off to
|
|
// configure accept rules for messages from mailing lists.
|
|
//
|
|
// The decision-making process looks at historic messages. The following properties
|
|
// are checked until matching messages are found. If they are found, a decision is
|
|
// returned, which may be inconclusive. The next property on the list is only
|
|
// checked if a step did not match any messages.
|
|
//
|
|
// - Messages matching full "message from" address, either with strict/relaxed
|
|
// dkim/spf-verification, or without.
|
|
// - Messages the user sent to the "message from" address.
|
|
// - Messages matching only the domain of the "message from" address (different
|
|
// localpart), again with verification or without.
|
|
// - Messages sent to an address in the domain of the "message from" address.
|
|
// - The previous two checks again, but now checking against the organizational
|
|
// domain instead of the exact domain.
|
|
// - Matching DKIM domains and a matching SPF mailfrom, or mailfrom domain, or ehlo
|
|
// domain.
|
|
// - "Exact" IP, or nearby IPs.
|
|
//
|
|
// References:
|
|
// ../rfc/5863
|
|
// ../rfc/7960
|
|
// ../rfc/6376:1915
|
|
// ../rfc/6376:3716
|
|
// ../rfc/7208:2167
|
|
func reputation(tx *bstore.Tx, log mlog.Log, m *store.Message, smtputf8 bool) (rjunk *bool, rconclusive bool, rmethod reputationMethod, reasonText string, rerr error) {
|
|
boolptr := func(v bool) *bool {
|
|
return &v
|
|
}
|
|
xfalse := boolptr(false)
|
|
xtrue := boolptr(true)
|
|
|
|
type queryError string
|
|
|
|
defer func() {
|
|
x := recover()
|
|
if x == nil {
|
|
return
|
|
}
|
|
if xerr, ok := x.(queryError); ok {
|
|
rerr = errors.New(string(xerr))
|
|
return
|
|
}
|
|
panic(x)
|
|
}()
|
|
|
|
now := time.Now()
|
|
|
|
// messageQuery returns a base query for historic seen messages to the same
|
|
// mailbox, at most maxAge old, and at most maxCount messages.
|
|
messageQuery := func(fm *store.Message, maxAge time.Duration, maxCount int) *bstore.Query[store.Message] {
|
|
q := bstore.QueryTx[store.Message](tx)
|
|
q.FilterEqual("MailboxOrigID", m.MailboxID)
|
|
q.FilterEqual("Expunged", false)
|
|
q.FilterFn(func(m store.Message) bool {
|
|
return m.Junk || m.Notjunk
|
|
})
|
|
if fm != nil {
|
|
q.FilterNonzero(*fm)
|
|
}
|
|
q.FilterGreaterEqual("Received", now.Add(-maxAge))
|
|
q.Limit(maxCount)
|
|
q.SortDesc("Received")
|
|
return q
|
|
}
|
|
|
|
// Execute the query, returning messages or returning error through panic.
|
|
xmessageList := func(q *bstore.Query[store.Message], descr string) []store.Message {
|
|
t0 := time.Now()
|
|
l, err := q.List()
|
|
log.Debugx("querying messages for reputation", err,
|
|
slog.Int("msgs", len(l)),
|
|
slog.String("descr", descr),
|
|
slog.Duration("queryduration", time.Since(t0)))
|
|
if err != nil {
|
|
panic(queryError(fmt.Sprintf("listing messages: %v", err)))
|
|
}
|
|
return l
|
|
}
|
|
|
|
xrecipientExists := func(q *bstore.Query[store.Recipient]) bool {
|
|
exists, err := q.Exists()
|
|
if err != nil {
|
|
panic(queryError(fmt.Sprintf("checking for recipient: %v", err)))
|
|
}
|
|
return exists
|
|
}
|
|
|
|
const year = 365 * 24 * time.Hour
|
|
|
|
// Look for historic messages with same "message from" address. We'll
|
|
// treat any validation (strict/dmarc/relaxed) the same, but "none"
|
|
// separately.
|
|
//
|
|
// We only need 1 message, and sometimes look at a second message. If
|
|
// the last message or the message before was an accept, we accept. If
|
|
// the single last or last two were a reject, we reject.
|
|
//
|
|
// If there was no validation, any signal is inconclusive.
|
|
if m.MsgFromDomain != "" {
|
|
q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain}, 3*year, 2)
|
|
q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
|
|
msgs := xmessageList(q, "mgsfromfull")
|
|
if len(msgs) > 0 {
|
|
// todo: we may want to look at dkim/spf in this case.
|
|
spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
|
|
conclusive := m.MsgFromValidated
|
|
return &spam, conclusive, methodMsgfromFull, "reputation of exact message-from address", nil
|
|
}
|
|
if !m.MsgFromValidated {
|
|
// Look for historic messages that were validated. If present, this is likely spam.
|
|
// Only return as conclusively spam if history also says this From-address sent
|
|
// spam.
|
|
q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain, MsgFromValidated: true}, 3*year, 2)
|
|
msgs = xmessageList(q, "msgfromfull-validated")
|
|
if len(msgs) > 0 {
|
|
spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
|
|
return xtrue, spam, methodMsgfromFull, "unvalidated message with validated historic messages with exact message-from address", nil
|
|
}
|
|
}
|
|
|
|
// Look if we ever sent to this address. If so, we accept,
|
|
qr := bstore.QueryTx[store.Recipient](tx)
|
|
qr.FilterEqual("Localpart", m.MsgFromLocalpart)
|
|
qr.FilterEqual("Domain", m.MsgFromDomain)
|
|
qr.FilterGreaterEqual("Sent", now.Add(-3*year))
|
|
if xrecipientExists(qr) {
|
|
return xfalse, true, methodMsgtoFull, "exact message-from address was earlier message recipient", nil
|
|
}
|
|
|
|
// Look for domain match, then for organizational domain match.
|
|
for _, orgdomain := range []bool{false, true} {
|
|
qm := store.Message{}
|
|
var method reputationMethod
|
|
var source, descr string
|
|
if orgdomain {
|
|
qm.MsgFromOrgDomain = m.MsgFromOrgDomain
|
|
method = methodMsgfromOrgDomain
|
|
source = "organizational domain of message-from address"
|
|
descr = "msgfromorgdomain"
|
|
} else {
|
|
qm.MsgFromDomain = m.MsgFromDomain
|
|
method = methodMsgfromDomain
|
|
source = "exact domain of message-from address"
|
|
descr = "msgfromdomain"
|
|
}
|
|
|
|
q := messageQuery(&qm, 2*year, 20)
|
|
q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
|
|
msgs := xmessageList(q, descr)
|
|
if len(msgs) > 0 {
|
|
nonjunk := 0
|
|
for _, m := range msgs {
|
|
if !m.Junk {
|
|
nonjunk++
|
|
}
|
|
}
|
|
if 100*nonjunk/len(msgs) > 80 {
|
|
reasonText = fmt.Sprintf("positive reputation with %s based on %d messages", source, len(msgs))
|
|
return xfalse, true, method, reasonText, nil
|
|
}
|
|
if nonjunk == 0 {
|
|
// Only conclusive with at least 3 different localparts.
|
|
localparts := map[smtp.Localpart]struct{}{}
|
|
for _, m := range msgs {
|
|
localparts[m.MsgFromLocalpart] = struct{}{}
|
|
if len(localparts) == 3 {
|
|
reasonText = fmt.Sprintf("negative reputation of at least 3 addresses with %s based on %d messages", source, len(msgs))
|
|
return xtrue, true, method, reasonText, nil
|
|
}
|
|
}
|
|
reasonText = fmt.Sprintf("negative reputation with %s based on %d messages", source, len(msgs))
|
|
return xtrue, false, method, reasonText, nil
|
|
}
|
|
// Mixed signals from domain. We don't want to block a new sender.
|
|
reasonText = fmt.Sprintf("mixed signals with %s based on %d messages", source, len(msgs))
|
|
return nil, false, method, reasonText, nil
|
|
}
|
|
if !m.MsgFromValidated {
|
|
// Look for historic messages that were validated. If present, this is likely spam.
|
|
// Only return as conclusively spam if history also says this From-address sent
|
|
// spam.
|
|
q := messageQuery(&qm, 2*year, 2)
|
|
q.FilterEqual("MsgFromValidated", true)
|
|
msgs = xmessageList(q, descr+"-validated")
|
|
if len(msgs) > 0 {
|
|
spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
|
|
reasonText = fmt.Sprintf("unvalidated message with %s while we have validated messages from that source", source)
|
|
return xtrue, spam, method, reasonText, nil
|
|
}
|
|
}
|
|
|
|
// Look if we ever sent to this address. If so, we accept,
|
|
qr := bstore.QueryTx[store.Recipient](tx)
|
|
if orgdomain {
|
|
qr.FilterEqual("OrgDomain", m.MsgFromOrgDomain)
|
|
method = methodMsgtoOrgDomain
|
|
source = "organizational domain of message-from address"
|
|
} else {
|
|
qr.FilterEqual("Domain", m.MsgFromDomain)
|
|
method = methodMsgtoDomain
|
|
source = "exact domain of message-from address"
|
|
}
|
|
qr.FilterGreaterEqual("Sent", now.Add(-2*year))
|
|
if xrecipientExists(qr) {
|
|
reasonText = fmt.Sprintf("%s was recipient address", source)
|
|
return xfalse, true, method, reasonText, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
// DKIM and SPF.
|
|
// We only use identities that passed validation. Failed identities are ignored. ../rfc/6376:2447
|
|
// todo future: we could do something with the DKIM identity (i=) field if it is more specific than just the domain (d=).
|
|
dkimspfsignals := []float64{}
|
|
dkimspfreasondoms := []string{}
|
|
dkimspfmsgs := 0
|
|
for _, dom := range m.DKIMDomains {
|
|
q := messageQuery(nil, year/2, 50)
|
|
q.FilterIn("DKIMDomains", dom)
|
|
msgs := xmessageList(q, "dkimdomain")
|
|
if len(msgs) > 0 {
|
|
nspam := 0
|
|
for _, m := range msgs {
|
|
if m.Junk {
|
|
nspam++
|
|
}
|
|
}
|
|
pspam := float64(nspam) / float64(len(msgs))
|
|
dkimspfsignals = append(dkimspfsignals, pspam)
|
|
dkimspfreasondoms = append(dkimspfreasondoms, dom)
|
|
dkimspfmsgs = len(msgs)
|
|
}
|
|
}
|
|
if m.MailFromValidated || m.EHLOValidated {
|
|
var dom string
|
|
var msgs []store.Message
|
|
if m.MailFromValidated && m.MailFromDomain != "" {
|
|
dom = m.MailFromDomain
|
|
q := messageQuery(&store.Message{MailFromLocalpart: m.MailFromLocalpart, MailFromDomain: m.MailFromDomain}, year/2, 50)
|
|
msgs = xmessageList(q, "mailfrom")
|
|
if len(msgs) == 0 {
|
|
q := messageQuery(&store.Message{MailFromDomain: m.MailFromDomain}, year/2, 50)
|
|
msgs = xmessageList(q, "mailfromdomain")
|
|
}
|
|
}
|
|
if len(msgs) == 0 && m.EHLOValidated && m.EHLODomain != "" {
|
|
dom = m.EHLODomain
|
|
q := messageQuery(&store.Message{EHLODomain: m.EHLODomain}, year/2, 50)
|
|
msgs = xmessageList(q, "ehlodomain")
|
|
}
|
|
if len(msgs) > 0 {
|
|
nspam := 0
|
|
for _, m := range msgs {
|
|
if m.Junk {
|
|
nspam++
|
|
}
|
|
}
|
|
pspam := float64(nspam) / float64(len(msgs))
|
|
dkimspfsignals = append(dkimspfsignals, pspam)
|
|
dkimspfreasondoms = append(dkimspfreasondoms, dom)
|
|
if len(msgs) > dkimspfmsgs {
|
|
dkimspfmsgs = len(msgs)
|
|
}
|
|
}
|
|
}
|
|
if len(dkimspfsignals) > 0 {
|
|
var nham, nspam int
|
|
var hamdoms, spamdoms []string
|
|
for i, p := range dkimspfsignals {
|
|
d, _ := dns.ParseDomain(dkimspfreasondoms[i])
|
|
if p < .1 {
|
|
nham++
|
|
hamdoms = append(hamdoms, d.XName(smtputf8))
|
|
} else if p > .9 {
|
|
nspam++
|
|
spamdoms = append(spamdoms, d.XName(smtputf8))
|
|
}
|
|
}
|
|
if nham > 0 && nspam == 0 {
|
|
reasonText = fmt.Sprintf("positive dkim/spf reputation for domain(s) %s", strings.Join(hamdoms, ","))
|
|
return xfalse, true, methodDKIMSPF, reasonText, nil
|
|
}
|
|
if nspam > 0 && nham == 0 {
|
|
reasonText = fmt.Sprintf("negative dkim/spf reputation for domain(s) %s", strings.Join(hamdoms, ","))
|
|
return xtrue, dkimspfmsgs > 1, methodDKIMSPF, reasonText, nil
|
|
}
|
|
reasonText = fmt.Sprintf("mixed dkim/spf reputation, positive for %s, negative for %s", strings.Join(hamdoms, ","), strings.Join(spamdoms, ","))
|
|
return nil, false, methodDKIMSPF, reasonText, nil
|
|
}
|
|
|
|
// IP-based. A wider mask needs more messages to be conclusive.
|
|
// We require the resulting signal to be strong, i.e. likely ham or likely spam.
|
|
var msgs []store.Message
|
|
var need int
|
|
var method reputationMethod
|
|
var ip string
|
|
if m.RemoteIPMasked1 != "" {
|
|
q := messageQuery(&store.Message{RemoteIPMasked1: m.RemoteIPMasked1}, year/4, 50)
|
|
msgs = xmessageList(q, "ip1")
|
|
need = 2
|
|
method = methodIP1
|
|
ip = m.RemoteIPMasked1
|
|
}
|
|
if len(msgs) == 0 && m.RemoteIPMasked2 != "" {
|
|
q := messageQuery(&store.Message{RemoteIPMasked2: m.RemoteIPMasked2}, year/4, 50)
|
|
msgs = xmessageList(q, "ip2")
|
|
need = 5
|
|
method = methodIP2
|
|
ip = m.RemoteIPMasked2
|
|
}
|
|
if len(msgs) == 0 && m.RemoteIPMasked3 != "" {
|
|
q := messageQuery(&store.Message{RemoteIPMasked3: m.RemoteIPMasked3}, year/4, 50)
|
|
msgs = xmessageList(q, "ip3")
|
|
need = 10
|
|
method = methodIP3
|
|
ip = m.RemoteIPMasked3
|
|
}
|
|
if len(msgs) > 0 {
|
|
nspam := 0
|
|
for _, m := range msgs {
|
|
if m.Junk {
|
|
nspam++
|
|
}
|
|
}
|
|
pspam := float64(nspam) / float64(len(msgs))
|
|
var spam *bool
|
|
if pspam < .25 {
|
|
spam = xfalse
|
|
} else if pspam > .75 {
|
|
spam = xtrue
|
|
}
|
|
conclusive := len(msgs) >= need && (pspam <= 0.1 || pspam >= 0.9)
|
|
v6 := strings.Contains(m.RemoteIP, ":")
|
|
reasonText = fmt.Sprintf("reputation for ip %s%s, spam score %.2f", ip, maskclasses[classmask{v6, method}], pspam)
|
|
return spam, conclusive, method, reasonText, nil
|
|
}
|
|
|
|
return nil, false, methodNone, "no address/spf/dkim/ip reputation", nil
|
|
}
|
|
|
|
type classmask struct {
|
|
v6 bool
|
|
method reputationMethod
|
|
}
|
|
|
|
// maskclasses maps an IP family and IP-based reputation method to the prefix
// length suffix shown after the masked IP in the reputation reason text. A
// lookup for any other key yields the empty string.
//
// NOTE(review): these prefix lengths presumably mirror how the RemoteIPMasked1/2/3
// fields of store.Message are computed; keep them in sync with that code.
var maskclasses = map[classmask]string{
	{v6: false, method: methodIP1}: "/32",
	{v6: false, method: methodIP2}: "/26",
	{v6: false, method: methodIP3}: "/21",
	{v6: true, method: methodIP1}:  "/64",
	{v6: true, method: methodIP2}:  "/48",
	{v6: true, method: methodIP3}:  "/32",
}
|