initialize metric mox_panic_total with 0, so the alerting rule also catches the first panic for a label

increase() and rate() don't seem to assume a previous value of 0 when a vector
gets a first value for a label. you would think that an increase() on a
first-value mox_panic_total{"..."}=1 would return 1, and similar for rate(), but
that doesn't appear to be the behaviour. so we just explicitly initialize the
count to 0 for each possible label value. mox has more vector metrics, but
panics feels like the most important, and it's too much code to initialize them
all, for all combinations of label values. there is probably a better way that
fixes this for all cases...
This commit is contained in:
Mechiel Lukkien 2023-09-15 16:47:17 +02:00
parent af71e9855b
commit 3620d6f05e
No known key found for this signature in database
16 changed files with 75 additions and 22 deletions

2
ctl.go
View file

@ -282,7 +282,7 @@ func servectl(ctx context.Context, log *mlog.Log, conn net.Conn, shutdown func()
}
log.Error("servectl panic", mlog.Field("err", x), mlog.Field("cmd", ctl.cmd))
debug.PrintStack()
metrics.PanicInc("ctl")
metrics.PanicInc(metrics.Ctl)
}()
defer conn.Close()

View file

@ -41,7 +41,7 @@ var (
metricDKIMSign = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "mox_dkim_sign_total",
Help: "DKIM messages signings.",
Help: "DKIM messages signings, label key is the type of key, rsa or ed25519.",
},
[]string{
"key",

View file

@ -682,7 +682,7 @@ func serve(listenerName string, cid int64, tlsConfig *tls.Config, nc net.Conn, x
} else {
c.log.Error("unhandled panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("imapserver")
metrics.PanicInc(metrics.Imapserver)
}
}()

View file

@ -256,7 +256,7 @@ func importctl(ctx context.Context, ctl *ctl, mbox bool) {
if x != ctl.x {
ctl.log.Error("import error", mlog.Field("panic", fmt.Errorf("%v", x)))
debug.PrintStack()
metrics.PanicInc("import")
metrics.PanicInc(metrics.Import)
} else {
ctl.log.Error("import error")
}

View file

@ -15,6 +15,59 @@ var metricPanic = promauto.NewCounterVec(
},
)
func PanicInc(pkg string) {
metricPanic.WithLabelValues(pkg).Inc()
type Panic string
const (
Ctl Panic = "ctl"
Import Panic = "import"
Serve Panic = "serve"
Imapserver Panic = "imapserver"
Mtastsdb Panic = "mtastsdb"
Queue Panic = "queue"
Smtpclient Panic = "smtpclient"
Smtpserver Panic = "smtpserver"
Dkimverify Panic = "dkimverify"
Spfverify Panic = "spfverify"
Upgradethreads Panic = "upgradethreads"
Importmanage Panic = "importmanage"
Importmessages Panic = "importmessages"
Webadmin Panic = "webadmin"
Webmailsendevent Panic = "webmailsendevent"
Webmail Panic = "webmail"
Webmailrequest Panic = "webmailrequest"
Webmailquery Panic = "webmailquery"
Webmailhandle Panic = "webmailhandle"
)
func init() {
// Ensure the panic counts are initialized to 0, so the query for change also picks
// up the first panic.
names := []Panic{
Ctl,
Import,
Serve,
Imapserver,
Mtastsdb,
Queue,
Smtpclient,
Smtpserver,
Dkimverify,
Spfverify,
Upgradethreads,
Importmanage,
Importmessages,
Webadmin,
Webmailsendevent,
Webmail,
Webmailrequest,
Webmailquery,
Webmailhandle,
}
for _, name := range names {
metricPanic.WithLabelValues(string(name)).Add(0)
}
}
func PanicInc(name Panic) {
metricPanic.WithLabelValues(string(name)).Inc()
}

View file

@ -112,7 +112,7 @@ func refreshDomain(ctx context.Context, db *bstore.DB, resolver dns.Resolver, pr
// Should not happen, but make sure errors don't take down the application.
log.Error("refresh1", mlog.Field("panic", x))
debug.PrintStack()
metrics.PanicInc("mtastsdb")
metrics.PanicInc(metrics.Mtastsdb)
}
}()

View file

@ -497,7 +497,7 @@ func deliver(resolver dns.Resolver, m Msg) {
if x != nil {
qlog.Error("deliver panic", mlog.Field("panic", x))
debug.PrintStack()
metrics.PanicInc("queue")
metrics.PanicInc(metrics.Queue)
}
}()

View file

@ -44,7 +44,7 @@ func monitorDNSBL(log *mlog.Log) {
if x != nil {
log.Error("monitordnsbl panic", mlog.Field("panic", x))
debug.PrintStack()
metrics.PanicInc("serve")
metrics.PanicInc(metrics.Serve)
}
}()

View file

@ -457,7 +457,7 @@ func (c *Client) recover(rerr *error) {
}
cerr, ok := x.(Error)
if !ok {
metrics.PanicInc("smtpclient")
metrics.PanicInc(metrics.Smtpclient)
panic(x)
}
*rerr = cerr

View file

@ -595,7 +595,7 @@ func serve(listenerName string, cid int64, hostname dns.Domain, tlsConfig *tls.C
} else {
c.log.Error("unhandled panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("smtpserver")
metrics.PanicInc(metrics.Smtpserver)
}
}()
@ -1903,7 +1903,7 @@ func (c *conn) deliver(ctx context.Context, recvHdrFor func(string) string, msgW
if x != nil {
c.log.Error("dkim verify panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("dkimverify")
metrics.PanicInc(metrics.Dkimverify)
}
}()
defer wg.Done()
@ -1939,7 +1939,7 @@ func (c *conn) deliver(ctx context.Context, recvHdrFor func(string) string, msgW
if x != nil {
c.log.Error("spf verify panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("spfverify")
metrics.PanicInc(metrics.Spfverify)
}
}()
defer wg.Done()

View file

@ -855,7 +855,7 @@ func OpenAccountDB(accountDir, accountName string) (a *Account, rerr error) {
if x != nil {
xlog.Error("upgradeThreads panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("upgradeThreads")
metrics.PanicInc(metrics.Upgradethreads)
acc.threadsErr = fmt.Errorf("panic during upgradeThreads: %v", x)
}

View file

@ -69,7 +69,7 @@ func ImportManage() {
if x := recover(); x != nil {
log.Error("import manage panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("importmanage")
metrics.PanicInc(metrics.Importmanage)
}
}()
@ -342,7 +342,7 @@ func importMessages(ctx context.Context, log *mlog.Log, token string, acc *store
} else {
log.Error("import panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("importmessages")
metrics.PanicInc(metrics.Importmessages)
}
}()

View file

@ -367,7 +367,7 @@ func logPanic(ctx context.Context) {
log := xlog.WithContext(ctx)
log.Error("recover from panic", mlog.Field("panic", x))
debug.PrintStack()
metrics.PanicInc("webadmin")
metrics.PanicInc(metrics.Webadmin)
}
// return IPs we may be listening on.

View file

@ -84,7 +84,7 @@ func (ew *eventWriter) xsendEvent(ctx context.Context, log *mlog.Log, name strin
if x != nil {
log.WithContext(ctx).Error("writeEvent panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("webmail-sendEvent")
metrics.PanicInc(metrics.Webmailsendevent)
}
}()

View file

@ -568,7 +568,7 @@ func serveEvents(ctx context.Context, log *mlog.Log, w http.ResponseWriter, r *h
} else {
log.WithContext(ctx).Error("serveEvents panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("webmail")
metrics.PanicInc(metrics.Webmail)
panic(x)
}
}()
@ -1216,7 +1216,7 @@ func viewRequestTx(ctx context.Context, log *mlog.Log, acc *store.Account, tx *b
if x != nil {
log.WithContext(ctx).Error("viewRequestTx panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("webmail-request")
metrics.PanicInc(metrics.Webmailrequest)
}
}()
@ -1300,7 +1300,7 @@ func queryMessages(ctx context.Context, log *mlog.Log, acc *store.Account, tx *b
log.WithContext(ctx).Error("queryMessages panic", mlog.Field("err", x))
debug.PrintStack()
mrc <- msgResp{err: fmt.Errorf("query failed")}
metrics.PanicInc("webmail-query")
metrics.PanicInc(metrics.Webmailquery)
}
close(mrc)

View file

@ -381,7 +381,7 @@ func handle(apiHandler http.Handler, w http.ResponseWriter, r *http.Request) {
if !ok {
log.WithContext(ctx).Error("handle panic", mlog.Field("err", x))
debug.PrintStack()
metrics.PanicInc("webmail-handle")
metrics.PanicInc(metrics.Webmailhandle)
panic(x)
}
if strings.HasPrefix(err.Code, "user:") {