From 3de6642b3a4778b31f90aaaad6761a46d6e79e90 Mon Sep 17 00:00:00 2001 From: Mechiel Lukkien Date: Mon, 13 Feb 2023 18:04:05 +0100 Subject: [PATCH] implement exporting of all mailboxes/messages as zip/tgz of mbox/maildir --- doc.go | 6 +- export.go | 6 +- http/account.go | 54 +++++- http/account.html | 9 + store/export.go | 383 +++++++++++++++++++++++++++++++++++++++++++ store/export_test.go | 96 +++++++++++ 6 files changed, 547 insertions(+), 7 deletions(-) create mode 100644 store/export.go create mode 100644 store/export_test.go diff --git a/doc.go b/doc.go index e87a777..3a94335 100644 --- a/doc.go +++ b/doc.go @@ -263,7 +263,8 @@ Export one or all mailboxes from an account in maildir format. Export bypasses a running mox instance. It opens the account mailbox/message database file directly. This may block if a running mox instance also has the -database open, e.g. for IMAP connections. +database open, e.g. for IMAP connections. To export from a running instance, use +the accounts web page. usage: mox export maildir dst-path account-path [mailbox] @@ -275,7 +276,8 @@ Using mbox is not recommended. Maildir is a better format. Export bypasses a running mox instance. It opens the account mailbox/message database file directly. This may block if a running mox instance also has the -database open, e.g. for IMAP connections. +database open, e.g. for IMAP connections. To export from a running instance, use +the accounts web page. For mbox export, we use "mboxrd" where message lines starting with the magic "From " string are escaped by prepending a >. We escape all ">*From ", diff --git a/export.go b/export.go index d3215a3..8795378 100644 --- a/export.go +++ b/export.go @@ -22,7 +22,8 @@ func cmdExportMaildir(c *cmd) { Export bypasses a running mox instance. It opens the account mailbox/message database file directly. This may block if a running mox instance also has the -database open, e.g. for IMAP connections. +database open, e.g. for IMAP connections. To export from a running instance, use +the accounts web page. ` args := c.Parse() xcmdExport(false, args, c) @@ -36,7 +37,8 @@ Using mbox is not recommended. Maildir is a better format. Export bypasses a running mox instance. It opens the account mailbox/message database file directly. This may block if a running mox instance also has the -database open, e.g. for IMAP connections. +database open, e.g. for IMAP connections. To export from a running instance, use +the accounts web page. For mbox export, we use "mboxrd" where message lines starting with the magic "From " string are escaped by prepending a >. We escape all ">*From ", diff --git a/http/account.go b/http/account.go index a739ebf..b686a9d 100644 --- a/http/account.go +++ b/http/account.go @@ -1,6 +1,9 @@ package http import ( + "archive/tar" + "archive/zip" + "compress/gzip" "context" "encoding/base64" "errors" @@ -109,7 +112,12 @@ func accountHandle(w http.ResponseWriter, r *http.Request) { return } - if r.Method == "GET" && r.URL.Path == "/" { + switch r.URL.Path { + case "/": + if r.Method != "GET" { + http.Error(w, "405 - method not allowed - post required", http.StatusMethodNotAllowed) + return + } w.Header().Set("Content-Type", "text/html; charset=utf-8") w.Header().Set("Cache-Control", "no-cache; max-age=0") // We typically return the embedded admin.html, but during development it's handy @@ -121,9 +129,49 @@ func accountHandle(w http.ResponseWriter, r *http.Request) { } else { w.Write(accountHTML) } - return + + case "/mail-export-maildir.tgz", "/mail-export-maildir.zip", "/mail-export-mbox.tgz", "/mail-export-mbox.zip": + maildir := strings.Contains(r.URL.Path, "maildir") + tgz := strings.Contains(r.URL.Path, ".tgz") + + acc, err := store.OpenAccount(accName) + if err != nil { + log.Errorx("open account for export", err) + http.Error(w, "500 - internal server error", http.StatusInternalServerError) + return + } + defer acc.Close() + + var archiver store.Archiver + if tgz { + // Don't tempt browsers to "helpfully" decompress. + w.Header().Set("Content-Type", "application/octet-stream") + + gzw := gzip.NewWriter(w) + defer func() { + gzw.Close() + }() + archiver = store.TarArchiver{Writer: tar.NewWriter(gzw)} + } else { + w.Header().Set("Content-Type", "application/zip") + archiver = store.ZipArchiver{Writer: zip.NewWriter(w)} + } + defer func() { + if err := archiver.Close(); err != nil { + log.Errorx("exporting mail close", err) + } + }() + if err := acc.ExportMessages(log, archiver, maildir, ""); err != nil { + log.Errorx("exporting mail", err) + } + + default: + if strings.HasPrefix(r.URL.Path, "/api/") { + accountSherpaHandler.ServeHTTP(w, r.WithContext(context.WithValue(ctx, authCtxKey, accName))) + return + } + http.NotFound(w, r) } - accountSherpaHandler.ServeHTTP(w, r.WithContext(context.WithValue(ctx, authCtxKey, accName))) } type ctxKey string diff --git a/http/account.html b/http/account.html index 3a6ba17..3b2c0e0 100644 --- a/http/account.html +++ b/http/account.html @@ -195,6 +195,15 @@ const index = async () => { } }, ), + dom.br(), + dom.h2('Export'), + dom.p('Export all messages in all mailboxes. Either in maildir format (with flags like Replied, Forwarded, Junk, etc) or in mbox format (without flags). And either as .zip file or .tgz file.'), + dom.ul( + dom.li(dom.a('mail-export-maildir.tgz', attr({href: 'mail-export-maildir.tgz'}))), + dom.li(dom.a('mail-export-maildir.zip', attr({href: 'mail-export-maildir.zip'}))), + dom.li(dom.a('mail-export-mbox.tgz', attr({href: 'mail-export-mbox.tgz'}))), + dom.li(dom.a('mail-export-mbox.zip', attr({href: 'mail-export-mbox.zip'}))), + ), footer, ) } diff --git a/store/export.go b/store/export.go new file mode 100644 index 0000000..2bbd7c0 --- /dev/null +++ b/store/export.go @@ -0,0 +1,383 @@ +package store + +import ( + "archive/tar" + "archive/zip" + "bufio" + "bytes" + "fmt" + "io" + "os" + "path/filepath" + "sort" + "time" + + "github.com/mjl-/bstore" + + "github.com/mjl-/mox/mlog" +) + +// Archiver can archive multiple mailboxes and their messages. +type Archiver interface { + Create(name string, size int64, mtime time.Time) (io.Writer, error) + Close() error +} + +// TarArchiver is an Archiver that writes to a tar ifle. +type TarArchiver struct { + *tar.Writer +} + +// Create adds a file header to the tar file. +func (a TarArchiver) Create(name string, size int64, mtime time.Time) (io.Writer, error) { + hdr := tar.Header{ + Name: name, + Size: size, + Mode: 0600, + ModTime: mtime, + Format: tar.FormatPAX, + } + if err := a.WriteHeader(&hdr); err != nil { + return nil, err + } + return a, nil +} + +// ZipArchiver is an Archiver that writes to a zip file. +type ZipArchiver struct { + *zip.Writer +} + +// Create adds a file header to the zip file. +func (a ZipArchiver) Create(name string, size int64, mtime time.Time) (io.Writer, error) { + hdr := zip.FileHeader{ + Name: name, + Method: zip.Deflate, + Modified: mtime, + UncompressedSize64: uint64(size), + } + return a.CreateHeader(&hdr) +} + +// ExportMessages writes messages to archiver. Either in maildir format, or otherwise in +// mbox. If mailboxOpt is empty, all mailboxes are exported, otherwise only the +// named mailbox. +// +// Some errors are not fatal and result in skipped messages. In that happens, a +// file "errors.txt" is added to the archive describing the errors. The goal is to +// let users export (hopefully) most messages even in the face of errors. +func (a *Account) ExportMessages(log *mlog.Log, archiver Archiver, maildir bool, mailboxOpt string) error { + // Start transaction without closure, we are going to close it early, but don't + // want to deal with declaring many variables now to be able to assign them in a + // closure and use them afterwards. + tx, err := a.DB.Begin(false) + if err != nil { + return fmt.Errorf("transaction: %v", err) + } + defer func() { + if tx != nil { + tx.Rollback() + } + }() + + start := time.Now() + + // Set up mailbox names and ids. + id2name := map[int64]string{} + name2id := map[string]int64{} + + mailboxes, err := bstore.QueryTx[Mailbox](tx).List() + xcheckf(err, "query mailboxes") + for _, mb := range mailboxes { + id2name[mb.ID] = mb.Name + name2id[mb.Name] = mb.ID + } + + var mailboxID int64 + if mailboxOpt != "" { + var ok bool + mailboxID, ok = name2id[mailboxOpt] + if !ok { + return fmt.Errorf("mailbox not found") + } + } + + var names []string + for _, name := range id2name { + if mailboxOpt != "" && name != mailboxOpt { + continue + } + names = append(names, name) + } + // We need to sort the names because maildirs can create subdirs. Ranging over + // id2name directly would randomize the directory names, we would create a sub + // maildir before the parent, and fail with "dir exists" when creating the parent + // dir. + sort.Slice(names, func(i, j int) bool { + return names[i] < names[j] + }) + + mailboxOrder := map[int64]int{} + for i, name := range names { + mbID := name2id[name] + mailboxOrder[mbID] = i + } + + // Fetch all messages. This can take quite a bit of memory if the mailbox is large. + q := bstore.QueryTx[Message](tx) + if mailboxID > 0 { + q.FilterNonzero(Message{MailboxID: mailboxID}) + } + msgs, err := q.List() + if err != nil { + return fmt.Errorf("listing messages: %v", err) + } + + // Close transaction. We don't want to hold it for too long. We are now at risk + // that a message is be removed while we export, or flags changed. At least the + // size won't change. If we cannot open the message later on, we'll skip it and add + // an error message to an errors.txt file in the output archive. + if err := tx.Rollback(); err != nil { + return fmt.Errorf("closing transaction: %v", err) + } + tx = nil + + // Order the messages by mailbox, received time and finally message ID. + sort.Slice(msgs, func(i, j int) bool { + iid := msgs[i].MailboxID + jid := msgs[j].MailboxID + if iid != jid { + return mailboxOrder[iid] < mailboxOrder[jid] + } + t := msgs[i].Received.Compare(msgs[j].Received) + if t != 0 { + return t < 0 + } + return msgs[i].ID < msgs[j].ID + }) + + // We keep track of errors reading message files. We continue exporting and add an + // errors.txt file to the archive. In case of errors, the user can get (hopefully) + // most of their emails, and see something went wrong. For other errors, like + // writing to the archiver (e.g. a browser), we abort, because we don't want to + // continue with useless work. + var errors string + + var curMailboxID int64 // Used to set curMailbox and finish a previous mbox file. + var curMailbox string + + var mboxtmp *os.File + var mboxwriter *bufio.Writer + defer func() { + if mboxtmp != nil { + mboxtmp.Close() + } + }() + + finishMbox := func() error { + if mboxtmp == nil { + return nil + } + + if err := mboxwriter.Flush(); err != nil { + return fmt.Errorf("flush mbox writer: %v", err) + } + fi, err := mboxtmp.Stat() + if err != nil { + return fmt.Errorf("stat temporary mbox file: %v", err) + } + if _, err := mboxtmp.Seek(0, 0); err != nil { + return fmt.Errorf("seek to start of temporary mbox file") + } + w, err := archiver.Create(curMailbox+".mbox", fi.Size(), fi.ModTime()) + if err != nil { + return fmt.Errorf("add mbox to archive: %v", err) + } + if _, err := io.Copy(w, mboxtmp); err != nil { + return fmt.Errorf("copying temp mbox file to archive: %v", err) + } + if err := mboxtmp.Close(); err != nil { + log.Errorx("closing temporary mbox file", err) + // Continue, not fatal. + } + mboxwriter = nil + mboxtmp = nil + return nil + } + + exportMessage := func(m Message) error { + mp := a.MessagePath(m.ID) + var mr io.ReadCloser + if m.Size == int64(len(m.MsgPrefix)) { + mr = io.NopCloser(bytes.NewReader(m.MsgPrefix)) + } else { + mpf, err := os.Open(mp) + if err != nil { + errors += fmt.Sprintf("open message file for id %d, path %s: %v (message skipped)\n", m.ID, mp, err) + return nil + } + defer mpf.Close() + st, err := mpf.Stat() + if err != nil { + errors += fmt.Sprintf("stat message file for id %d, path %s: %v (message skipped)\n", m.ID, mp, err) + return nil + } + size := st.Size() + int64(len(m.MsgPrefix)) + if size != m.Size { + errors += fmt.Sprintf("message size mismatch for message id %d, database has %d, size is %d+%d=%d, using calculated size\n", m.ID, m.Size, len(m.MsgPrefix), st.Size(), size) + } + mr = FileMsgReader(m.MsgPrefix, mpf) + } + + if maildir { + p := curMailbox + if m.Flags.Seen { + p = filepath.Join(p, "cur") + } else { + p = filepath.Join(p, "new") + } + name := fmt.Sprintf("%d.%d.mox:2,", m.Received.Unix(), m.ID) + // todo: more flags? forwarded, (non)junk, phishing, mdnsent would be nice. but what is the convention. dovecot-keywords sounds non-standard. + if m.Flags.Seen { + name += "S" + } + if m.Flags.Answered { + name += "R" + } + if m.Flags.Flagged { + name += "F" + } + if m.Flags.Draft { + name += "D" + } + p = filepath.Join(p, name) + + // We store messages with \r\n, maildir needs without. But we need to know the + // final size. So first convert, then create file with size, and write from buffer. + // todo: for large messages, we should go through a temporary file instead of memory. + var dst bytes.Buffer + r := bufio.NewReader(mr) + for { + line, rerr := r.ReadBytes('\n') + if rerr != io.EOF && rerr != nil { + errors += fmt.Sprintf("reading from message for id %d: %v (message skipped)\n", m.ID, err) + return nil + } + if len(line) > 0 { + if bytes.HasSuffix(line, []byte("\r\n")) { + line = line[:len(line)-1] + line[len(line)-1] = '\n' + } + if _, err = dst.Write(line); err != nil { + return fmt.Errorf("writing message: %v", err) + } + } + if rerr == io.EOF { + break + } + } + size := int64(dst.Len()) + w, err := archiver.Create(p, size, m.Received) + if err != nil { + return fmt.Errorf("adding message to archive: %v", err) + } + if _, err := io.Copy(w, &dst); err != nil { + return fmt.Errorf("copying message to archive: %v", err) + } + return nil + } + + // todo: should we put status flags in Status or X-Status header inside the message? + // todo: should we do anything with Content-Length headers? changing the escaping could invalidate those. is anything checking that field? + mailfrom := "mox" + if m.MailFrom != "" { + mailfrom = m.MailFrom + } + if _, err := fmt.Fprintf(mboxwriter, "From %s %s\n", mailfrom, m.Received.Format(time.ANSIC)); err != nil { + return fmt.Errorf("write message line to mbox temp file: %v", err) + } + r := bufio.NewReader(mr) + for { + line, rerr := r.ReadBytes('\n') + if rerr != io.EOF && rerr != nil { + return fmt.Errorf("reading message: %v", err) + } + if len(line) > 0 { + if bytes.HasSuffix(line, []byte("\r\n")) { + line = line[:len(line)-1] + line[len(line)-1] = '\n' + } + if bytes.HasPrefix(bytes.TrimLeft(line, ">"), []byte("From ")) { + if _, err := fmt.Fprint(mboxwriter, ">"); err != nil { + return fmt.Errorf("writing escaping >: %v", err) + } + } + if _, err := mboxwriter.Write(line); err != nil { + return fmt.Errorf("writing line: %v", err) + } + } + if rerr == io.EOF { + break + } + } + if _, err := fmt.Fprint(mboxwriter, "\n"); err != nil { + return fmt.Errorf("writing end of message newline: %v", err) + } + return nil + } + + for _, m := range msgs { + if m.MailboxID != curMailboxID { + if err := finishMbox(); err != nil { + return err + } + + curMailbox = id2name[m.MailboxID] + curMailboxID = m.MailboxID + if maildir { + // Create the directories that show this is a maildir. + if _, err := archiver.Create(curMailbox+"/new/", 0, start); err != nil { + return fmt.Errorf("adding maildir new directory: %v", err) + } + if _, err := archiver.Create(curMailbox+"/cur/", 0, start); err != nil { + return fmt.Errorf("adding maildir cur directory: %v", err) + } + if _, err := archiver.Create(curMailbox+"/tmp/", 0, start); err != nil { + return fmt.Errorf("adding maildir tmp directory: %v", err) + } + } else { + + mboxtmp, err = os.CreateTemp("", "mox-mail-export-mbox") + if err != nil { + return fmt.Errorf("creating temp mbox file: %v", err) + } + // Remove file immediately, so we are sure we don't leave it around. + if err := os.Remove(mboxtmp.Name()); err != nil { + return fmt.Errorf("removing temp file just created: %v", err) + } + mboxwriter = bufio.NewWriter(mboxtmp) + } + } + + if err := exportMessage(m); err != nil { + return err + } + } + if err := finishMbox(); err != nil { + return err + } + + if errors != "" { + w, err := archiver.Create("errors.txt", int64(len(errors)), time.Now()) + if err != nil { + log.Errorx("adding errors.txt to archive", err) + return err + } + if _, err := w.Write([]byte(errors)); err != nil { + log.Errorx("writing errors.txt to archive", err) + return err + } + } + + return nil +} diff --git a/store/export_test.go b/store/export_test.go new file mode 100644 index 0000000..da9d647 --- /dev/null +++ b/store/export_test.go @@ -0,0 +1,96 @@ +package store + +import ( + "archive/tar" + "archive/zip" + "bytes" + "io" + "os" + "testing" + "time" + + "github.com/mjl-/mox/mlog" + "github.com/mjl-/mox/mox-" +) + +func TestExport(t *testing.T) { + // Set up an account, add 2 messages to different 2 mailboxes. export as tar/zip + // and maildir/mbox. check there are 2 files in the repo, no errors.txt. + + os.RemoveAll("../testdata/store/data") + mox.ConfigStaticPath = "../testdata/store/mox.conf" + mox.MustLoadConfig() + acc, err := OpenAccount("mjl") + tcheck(t, err, "open account") + defer acc.Close() + switchDone := Switchboard() + defer close(switchDone) + + log := mlog.New("export") + + msgFile, err := os.CreateTemp("", "mox-test-export") + tcheck(t, err, "create temp") + defer os.Remove(msgFile.Name()) // To be sure. + const msg = "test: test\r\n\r\ntest\r\n" + _, err = msgFile.Write([]byte(msg)) + tcheck(t, err, "write message") + + m := Message{Received: time.Now(), Size: int64(len(msg))} + err = acc.DeliverMailbox(xlog, "Inbox", &m, msgFile, false) + tcheck(t, err, "deliver") + + m = Message{Received: time.Now(), Size: int64(len(msg))} + err = acc.DeliverMailbox(xlog, "Trash", &m, msgFile, true) + tcheck(t, err, "deliver") + + var maildirZip, maildirTar, mboxZip, mboxTar bytes.Buffer + + archive := func(archiver Archiver, maildir bool) { + t.Helper() + err = acc.ExportMessages(log, archiver, maildir, "") + tcheck(t, err, "export messages") + err = archiver.Close() + tcheck(t, err, "archiver close") + } + + archive(ZipArchiver{zip.NewWriter(&maildirZip)}, true) + archive(ZipArchiver{zip.NewWriter(&mboxZip)}, false) + archive(TarArchiver{tar.NewWriter(&maildirTar)}, true) + archive(TarArchiver{tar.NewWriter(&mboxTar)}, false) + + if r, err := zip.NewReader(bytes.NewReader(maildirZip.Bytes()), int64(maildirZip.Len())); err != nil { + t.Fatalf("reading maildir zip: %v", err) + } else if len(r.File) != 2*3+2 { + t.Fatalf("maildir zip, expected 2*3 dirs, and 2 files, got %d files", len(r.File)) + } + + if r, err := zip.NewReader(bytes.NewReader(mboxZip.Bytes()), int64(mboxZip.Len())); err != nil { + t.Fatalf("reading mbox zip: %v", err) + } else if len(r.File) != 2 { + t.Fatalf("maildir zip, 2 files, got %d files", len(r.File)) + } + + checkTarFiles := func(r io.Reader, n int) { + t.Helper() + tr := tar.NewReader(r) + have := 0 + for { + h, err := tr.Next() + if err == io.EOF { + break + } + have++ + if h.Name == "errors.txt" { + t.Fatalf("got errors.txt") + } + _, err = io.Copy(io.Discard, tr) + tcheck(t, err, "copy") + } + if n != have { + t.Fatalf("got %d files, expected %d", n, have) + } + } + + checkTarFiles(&maildirTar, 2*3+2) + checkTarFiles(&mboxTar, 2) +}