From 3620d6f05e5e42984546b3a9078f05bd1403ca67 Mon Sep 17 00:00:00 2001 From: Mechiel Lukkien Date: Fri, 15 Sep 2023 16:47:17 +0200 Subject: [PATCH] initialize metric mox_panic_total with 0, so the alerting rule also catches the first panic for a label increase() and rate() don't seem to assume a previous value of 0 when a vector gets a first value for a label. you would think that an increase() on a first-value mox_panic_total{"..."}=1 would return 1, and similar for rate(), but that doesn't appear to be the behaviour. so we just explicitly initialize the count to 0 for each possible label value. mox has more vector metrics, but panics feels like the most important, and it's too much code to initialize them all, for all combinations of label values. there is probably a better way that fixes this for all cases... --- ctl.go | 2 +- dkim/dkim.go | 2 +- imapserver/server.go | 2 +- import.go | 2 +- metrics/panic.go | 57 ++++++++++++++++++++++++++++++++++++++++-- mtastsdb/refresh.go | 2 +- queue/queue.go | 2 +- serve.go | 2 +- smtpclient/client.go | 2 +- smtpserver/server.go | 6 ++--- store/account.go | 2 +- webaccount/import.go | 4 +-- webadmin/admin.go | 2 +- webmail/eventwriter.go | 2 +- webmail/view.go | 6 ++--- webmail/webmail.go | 2 +- 16 files changed, 75 insertions(+), 22 deletions(-) diff --git a/ctl.go b/ctl.go index 1283c0f..01985ef 100644 --- a/ctl.go +++ b/ctl.go @@ -282,7 +282,7 @@ func servectl(ctx context.Context, log *mlog.Log, conn net.Conn, shutdown func() } log.Error("servectl panic", mlog.Field("err", x), mlog.Field("cmd", ctl.cmd)) debug.PrintStack() - metrics.PanicInc("ctl") + metrics.PanicInc(metrics.Ctl) }() defer conn.Close() diff --git a/dkim/dkim.go b/dkim/dkim.go index 18f3856..1b60278 100644 --- a/dkim/dkim.go +++ b/dkim/dkim.go @@ -41,7 +41,7 @@ var ( metricDKIMSign = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "mox_dkim_sign_total", - Help: "DKIM messages signings.", + Help: "DKIM messages signings, label key is the type of key, rsa or ed25519.", }, []string{ "key", diff --git a/imapserver/server.go b/imapserver/server.go index e675033..539b254 100644 --- a/imapserver/server.go +++ b/imapserver/server.go @@ -682,7 +682,7 @@ func serve(listenerName string, cid int64, tlsConfig *tls.Config, nc net.Conn, x } else { c.log.Error("unhandled panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("imapserver") + metrics.PanicInc(metrics.Imapserver) } }() diff --git a/import.go b/import.go index 8383349..fb0b170 100644 --- a/import.go +++ b/import.go @@ -256,7 +256,7 @@ func importctl(ctx context.Context, ctl *ctl, mbox bool) { if x != ctl.x { ctl.log.Error("import error", mlog.Field("panic", fmt.Errorf("%v", x))) debug.PrintStack() - metrics.PanicInc("import") + metrics.PanicInc(metrics.Import) } else { ctl.log.Error("import error") } diff --git a/metrics/panic.go b/metrics/panic.go index 45717de..106bebc 100644 --- a/metrics/panic.go +++ b/metrics/panic.go @@ -15,6 +15,59 @@ var metricPanic = promauto.NewCounterVec( }, ) -func PanicInc(pkg string) { - metricPanic.WithLabelValues(pkg).Inc() +type Panic string + +const ( + Ctl Panic = "ctl" + Import Panic = "import" + Serve Panic = "serve" + Imapserver Panic = "imapserver" + Mtastsdb Panic = "mtastsdb" + Queue Panic = "queue" + Smtpclient Panic = "smtpclient" + Smtpserver Panic = "smtpserver" + Dkimverify Panic = "dkimverify" + Spfverify Panic = "spfverify" + Upgradethreads Panic = "upgradethreads" + Importmanage Panic = "importmanage" + Importmessages Panic = "importmessages" + Webadmin Panic = "webadmin" + Webmailsendevent Panic = "webmailsendevent" + Webmail Panic = "webmail" + Webmailrequest Panic = "webmailrequest" + Webmailquery Panic = "webmailquery" + Webmailhandle Panic = "webmailhandle" +) + +func init() { + // Ensure the panic counts are initialized to 0, so the query for change also picks + // up the first panic. + names := []Panic{ + Ctl, + Import, + Serve, + Imapserver, + Mtastsdb, + Queue, + Smtpclient, + Smtpserver, + Dkimverify, + Spfverify, + Upgradethreads, + Importmanage, + Importmessages, + Webadmin, + Webmailsendevent, + Webmail, + Webmailrequest, + Webmailquery, + Webmailhandle, + } + for _, name := range names { + metricPanic.WithLabelValues(string(name)).Add(0) + } +} + +func PanicInc(name Panic) { + metricPanic.WithLabelValues(string(name)).Inc() } diff --git a/mtastsdb/refresh.go b/mtastsdb/refresh.go index 158e0b5..cb7a91d 100644 --- a/mtastsdb/refresh.go +++ b/mtastsdb/refresh.go @@ -112,7 +112,7 @@ func refreshDomain(ctx context.Context, db *bstore.DB, resolver dns.Resolver, pr // Should not happen, but make sure errors don't take down the application. log.Error("refresh1", mlog.Field("panic", x)) debug.PrintStack() - metrics.PanicInc("mtastsdb") + metrics.PanicInc(metrics.Mtastsdb) } }() diff --git a/queue/queue.go b/queue/queue.go index c72aaa6..fb5fb65 100644 --- a/queue/queue.go +++ b/queue/queue.go @@ -497,7 +497,7 @@ func deliver(resolver dns.Resolver, m Msg) { if x != nil { qlog.Error("deliver panic", mlog.Field("panic", x)) debug.PrintStack() - metrics.PanicInc("queue") + metrics.PanicInc(metrics.Queue) } }() diff --git a/serve.go b/serve.go index 52f2f54..77a33e0 100644 --- a/serve.go +++ b/serve.go @@ -44,7 +44,7 @@ func monitorDNSBL(log *mlog.Log) { if x != nil { log.Error("monitordnsbl panic", mlog.Field("panic", x)) debug.PrintStack() - metrics.PanicInc("serve") + metrics.PanicInc(metrics.Serve) } }() diff --git a/smtpclient/client.go b/smtpclient/client.go index 0328b53..3430350 100644 --- a/smtpclient/client.go +++ b/smtpclient/client.go @@ -457,7 +457,7 @@ func (c *Client) recover(rerr *error) { } cerr, ok := x.(Error) if !ok { - metrics.PanicInc("smtpclient") + metrics.PanicInc(metrics.Smtpclient) panic(x) } *rerr = cerr diff --git a/smtpserver/server.go b/smtpserver/server.go index 500337f..8008ff4 100644 --- a/smtpserver/server.go +++ b/smtpserver/server.go @@ -595,7 +595,7 @@ func serve(listenerName string, cid int64, hostname dns.Domain, tlsConfig *tls.C } else { c.log.Error("unhandled panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("smtpserver") + metrics.PanicInc(metrics.Smtpserver) } }() @@ -1903,7 +1903,7 @@ func (c *conn) deliver(ctx context.Context, recvHdrFor func(string) string, msgW if x != nil { c.log.Error("dkim verify panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("dkimverify") + metrics.PanicInc(metrics.Dkimverify) } }() defer wg.Done() @@ -1939,7 +1939,7 @@ func (c *conn) deliver(ctx context.Context, recvHdrFor func(string) string, msgW if x != nil { c.log.Error("spf verify panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("spfverify") + metrics.PanicInc(metrics.Spfverify) } }() defer wg.Done() diff --git a/store/account.go b/store/account.go index 960e479..0a0e327 100644 --- a/store/account.go +++ b/store/account.go @@ -855,7 +855,7 @@ func OpenAccountDB(accountDir, accountName string) (a *Account, rerr error) { if x != nil { xlog.Error("upgradeThreads panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("upgradeThreads") + metrics.PanicInc(metrics.Upgradethreads) acc.threadsErr = fmt.Errorf("panic during upgradeThreads: %v", x) } diff --git a/webaccount/import.go b/webaccount/import.go index b7489a4..c848273 100644 --- a/webaccount/import.go +++ b/webaccount/import.go @@ -69,7 +69,7 @@ func ImportManage() { if x := recover(); x != nil { log.Error("import manage panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("importmanage") + metrics.PanicInc(metrics.Importmanage) } }() @@ -342,7 +342,7 @@ func importMessages(ctx context.Context, log *mlog.Log, token string, acc *store } else { log.Error("import panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("importmessages") + metrics.PanicInc(metrics.Importmessages) } }() diff --git a/webadmin/admin.go b/webadmin/admin.go index e4c3b78..379daaa 100644 --- a/webadmin/admin.go +++ b/webadmin/admin.go @@ -367,7 +367,7 @@ func logPanic(ctx context.Context) { log := xlog.WithContext(ctx) log.Error("recover from panic", mlog.Field("panic", x)) debug.PrintStack() - metrics.PanicInc("webadmin") + metrics.PanicInc(metrics.Webadmin) } // return IPs we may be listening on. diff --git a/webmail/eventwriter.go b/webmail/eventwriter.go index 5058f1e..715bdcf 100644 --- a/webmail/eventwriter.go +++ b/webmail/eventwriter.go @@ -84,7 +84,7 @@ func (ew *eventWriter) xsendEvent(ctx context.Context, log *mlog.Log, name strin if x != nil { log.WithContext(ctx).Error("writeEvent panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("webmail-sendEvent") + metrics.PanicInc(metrics.Webmailsendevent) } }() diff --git a/webmail/view.go b/webmail/view.go index 1c40e61..42a3883 100644 --- a/webmail/view.go +++ b/webmail/view.go @@ -568,7 +568,7 @@ func serveEvents(ctx context.Context, log *mlog.Log, w http.ResponseWriter, r *h } else { log.WithContext(ctx).Error("serveEvents panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("webmail") + metrics.PanicInc(metrics.Webmail) panic(x) } }() @@ -1216,7 +1216,7 @@ func viewRequestTx(ctx context.Context, log *mlog.Log, acc *store.Account, tx *b if x != nil { log.WithContext(ctx).Error("viewRequestTx panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("webmail-request") + metrics.PanicInc(metrics.Webmailrequest) } }() @@ -1300,7 +1300,7 @@ func queryMessages(ctx context.Context, log *mlog.Log, acc *store.Account, tx *b log.WithContext(ctx).Error("queryMessages panic", mlog.Field("err", x)) debug.PrintStack() mrc <- msgResp{err: fmt.Errorf("query failed")} - metrics.PanicInc("webmail-query") + metrics.PanicInc(metrics.Webmailquery) } close(mrc) diff --git a/webmail/webmail.go b/webmail/webmail.go index 0f4e4a6..ccd86e4 100644 --- a/webmail/webmail.go +++ b/webmail/webmail.go @@ -381,7 +381,7 @@ func handle(apiHandler http.Handler, w http.ResponseWriter, r *http.Request) { if !ok { log.WithContext(ctx).Error("handle panic", mlog.Field("err", x)) debug.PrintStack() - metrics.PanicInc("webmail-handle") + metrics.PanicInc(metrics.Webmailhandle) panic(x) } if strings.HasPrefix(err.Code, "user:") {