~emersion/public-inbox

go-maildir: Improve performance with large maildirs v1 PROPOSED

I have a maildir with around 50k messages in it. When I open it in aerc,
the performance is very poor. I discovered that this is because in such
large directories, the Glob function takes a while to execute. I've
spent quite some time digging through the Go runtime trying to profile
this and discover why it is so slow, but unfortunately I've not been
able to figure it out. There seems to be some sort of resource
contention surrounding the getdents64 syscall, but I have not been able
to precisely identify it. In any case, I think the answer may simply be
that globbing is inherently expensive with lots of files.

To remedy this, I've added a heuristic approach to the Filename(key)
function where, before calling Glob, we just try to guess the filename
based on some common flags and os.Stat our guesses. The benchmarks with
5k messages show dramatic improvement, and I've recorded a before/after
asciicast to show how this looks in aerc as well:

Before: https://asciinema.org/a/257969
After: https://asciinema.org/a/257970

Ben Burwell (2):
  Add Filename benchmark
  Guess likely filename before resorting to Glob

 maildir.go      | 31 +++++++++++++++++
 maildir_test.go | 92 +++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 109 insertions(+), 14 deletions(-)

-- 
2.22.0
Excellent! I have nothing to say, those patches look very good.

To github.com:emersion/go-maildir.git
   2ed358496c11..c01dc1254320  master -> master

Thanks!
Export patchset (mbox)
How do I use this?

Copy & paste the following snippet into your terminal to import this patchset into git:

curl -s https://lists.sr.ht/~emersion/public-inbox/patches/6980/mbox | git am -3
Learn more about email & git

[PATCH go-maildir 1/2] Add Filename benchmark Export this patch

This benchmark helps understand how expensive the Filename lookup is
when working with a moderately large maildir (5k messages).
---
 maildir_test.go | 92 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 78 insertions(+), 14 deletions(-)

diff --git a/maildir_test.go b/maildir_test.go
index 2fe7672..3164df3 100644
--- a/maildir_test.go
+++ b/maildir_test.go
@@ -1,16 +1,18 @@
 package maildir
 
 import (
+	"fmt"
 	"io/ioutil"
+	"math/rand"
 	"os"
 	"testing"
 )
 
 // cleanup removes a Dir's directory structure
-func cleanup(t *testing.T, d Dir) {
+func cleanup(errorfn func(...interface{}), d Dir) {
 	err := os.RemoveAll(string(d))
 	if err != nil {
-		t.Error(err)
+		errorfn(err)
 	}
 }
 
@@ -41,18 +43,18 @@ func cat(t *testing.T, path string) string {
 }
 
 // makeDelivery creates a new message
-func makeDelivery(t *testing.T, d Dir, msg string) {
+func makeDelivery(fatalfn func(...interface{}), d Dir, msg string) {
 	del, err := d.NewDelivery()
 	if err != nil {
-		t.Fatal(err)
+		fatalfn(err)
 	}
 	_, err = del.Write([]byte(msg))
 	if err != nil {
-		t.Fatal(err)
+		fatalfn(err)
 	}
 	err = del.Close()
 	if err != nil {
-		t.Fatal(err)
+		fatalfn(err)
 	}
 }
 
@@ -97,7 +99,7 @@ func TestCreate(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	defer cleanup(t, d)
+	defer cleanup(t.Error, d)
 }
 
 func TestDelivery(t *testing.T) {
@@ -108,10 +110,10 @@ func TestDelivery(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	defer cleanup(t, d)
+	defer cleanup(t.Error, d)
 
 	var msg = "this is a message"
-	makeDelivery(t, d, msg)
+	makeDelivery(t.Fatal, d, msg)
 
 	keys, err := d.Unseen()
 	if err != nil {
@@ -138,9 +140,9 @@ func TestPurge(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	defer cleanup(t, d)
+	defer cleanup(t.Error, d)
 
-	makeDelivery(t, d, "foo")
+	makeDelivery(t.Fatal, d, "foo")
 
 	keys, err := d.Unseen()
 	if err != nil {
@@ -168,16 +170,16 @@ func TestMove(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	defer cleanup(t, d1)
+	defer cleanup(t.Error, d1)
 	var d2 Dir = "test_move2"
 	err = d2.Create()
 	if err != nil {
 		t.Fatal(err)
 	}
-	defer cleanup(t, d2)
+	defer cleanup(t.Error, d2)
 
 	const msg = "a moving message"
-	makeDelivery(t, d1, msg)
+	makeDelivery(t.Fatal, d1, msg)
 	keys, err := d1.Unseen()
 	if err != nil {
 		t.Fatal(err)
@@ -200,3 +202,65 @@ func TestMove(t *testing.T) {
 	}
 
 }
+
+func BenchmarkFilename(b *testing.B) {
+	// set up test maildir
+	d := Dir("benchmark_filename")
+	if err := d.Create(); err != nil {
+		b.Fatalf("could not set up benchmark: %v", err)
+	}
+	defer cleanup(b.Error, d)
+
+	// make 5000 deliveries
+	for i := 0; i < 5000; i++ {
+		makeDelivery(b.Fatal, d, fmt.Sprintf("here is message number %d", i))
+	}
+
+	// grab keys
+	keys, err := d.Unseen()
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	// shuffle keys
+	rand.Shuffle(len(keys), func(i, j int) {
+		keys[i], keys[j] = keys[j], keys[i]
+	})
+
+	// set some flags
+	for i, key := range keys {
+		var err error
+		switch i % 5 {
+		case 0:
+			// no flags
+			fallthrough
+		case 1:
+			err = d.SetFlags(key, []Flag{FlagSeen})
+		case 2:
+			err = d.SetFlags(key, []Flag{FlagSeen, FlagReplied})
+		case 3:
+			err = d.SetFlags(key, []Flag{FlagReplied})
+		case 4:
+			err = d.SetFlags(key, []Flag{FlagFlagged})
+		}
+		if err != nil {
+			b.Fatal(err)
+		}
+	}
+
+	// run benchmark for the first N shuffled keys
+	keyIdx := 0
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		b.StartTimer()
+		_, err := d.Filename(keys[keyIdx])
+		b.StopTimer()
+		if err != nil {
+			b.Errorf("could not get filename for key %s", keys[keyIdx])
+		}
+		keyIdx++
+		if keyIdx >= len(keys) {
+			keyIdx = 0
+		}
+	}
+}
-- 
2.22.0

[PATCH go-maildir 2/2] Guess likely filename before resorting to Glob Export this patch

Before, when we needed to look up the filename for a key in
the maildir (as we do before just about anything else), we used a call
to path/filepath.Glob to find a file which matches the key plus whatever
flags may be present.

However, when working with large maildirs, globbing becomes quite an
expensive operation as evidenced in the benchmark with 5000 files:

    $ go test -bench=Filename
    goos: darwin
    goarch: amd64
    pkg: github.com/emersion/go-maildir
    BenchmarkFilename-8          200           7317082 ns/op
    PASS
    ok      github.com/emersion/go-maildir  124.146s

This patch implements a heuristic approach to guess
the likely file path based on common sets of flags key before resorting
to the more expensive call to Glob, resulting in a dramatic performance
improvement:

    $ go test -bench=Filename
    goos: darwin
    goarch: amd64
    pkg: github.com/emersion/go-maildir
    BenchmarkFilename-8        50000             22797 ns/op
    PASS
    ok      github.com/emersion/go-maildir  15.898s

The time to find the filename has reduced from 7.32 milliseconds to
0.023 milliseconds.
---
 maildir.go | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/maildir.go b/maildir.go
index fe0adba..8cab16d 100644
--- a/maildir.go
+++ b/maildir.go
@@ -133,8 +133,39 @@ func (d Dir) Keys() ([]string, error) {
 	return keys, nil
 }
 
+func (d Dir) filenameGuesses(key string) []string {
+	basename := filepath.Join(string(d), "cur", key+string(separator)+"2,")
+	return []string{
+		basename,
+
+		basename + string(FlagPassed),
+		basename + string(FlagReplied),
+		basename + string(FlagSeen),
+		basename + string(FlagDraft),
+		basename + string(FlagFlagged),
+
+		basename + string(FlagPassed),
+		basename + string(FlagPassed) + string(FlagSeen),
+		basename + string(FlagPassed) + string(FlagSeen) + string(FlagFlagged),
+		basename + string(FlagPassed) + string(FlagFlagged),
+
+		basename + string(FlagReplied) + string(FlagSeen),
+		basename + string(FlagReplied) + string(FlagSeen) + string(FlagFlagged),
+		basename + string(FlagReplied) + string(FlagFlagged),
+
+		basename + string(FlagSeen) + string(FlagFlagged),
+	}
+}
+
 // Filename returns the path to the file corresponding to the key.
 func (d Dir) Filename(key string) (string, error) {
+	// before doing an expensive Glob, see if we can guess the path based on some
+	// common flags
+	for _, guess := range d.filenameGuesses(key) {
+		if _, err := os.Stat(guess); err == nil {
+			return guess, nil
+		}
+	}
 	matches, err := filepath.Glob(filepath.Join(string(d), "cur", key+"*"))
 	if err != nil {
 		return "", err
-- 
2.22.0
View this thread in the archives