caddyhttp: Optimize large host matchers

2025-03-28 17:05:34 +03:00 · 2020-12-02 13:26:28 -07:00 · 2020-12-02 13:26:28 -07:00 · 9157051f45
commit 9157051f45
parent 4cff36d731
2 changed files with 95 additions and 0 deletions
--- a/modules/caddyhttp/matchers.go
+++ b/modules/caddyhttp/matchers.go
@ -23,6 +23,7 @@ import (
 	"net/url"
 	"path/filepath"
 	"regexp"
+	"sort"
 	"strings"

 	"github.com/caddyserver/caddy/v2"
@ -51,6 +52,8 @@ type (
 	//
 	// The wildcard can be useful for matching all subdomains, for example:
 	// `*.example.com` matches `foo.example.com` but not `foo.bar.example.com`.
+	//
+	// Duplicate entries will return an error.
 	MatchHost []string

 	// MatchPath matches requests by the URI's path (case-insensitive). Path
@ -167,6 +170,40 @@ func (m *MatchHost) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
 	return nil
 }

+// Provision sets up and validates m, including making it more efficient for large lists.
+func (m MatchHost) Provision(_ caddy.Context) error {
+	// check for duplicates; they are nonsensical and reduce efficiency
+	// (we could just remove them, but the user should know their config is erroneous)
+	seen := make(map[string]int)
+	for i, h := range m {
+		h = strings.ToLower(h)
+		if firstI, ok := seen[h]; ok {
+			return fmt.Errorf("host at index %d is repeated at index %d: %s", firstI, i, h)
+		}
+		seen[h] = i
+	}
+
+	if m.large() {
+		// sort the slice lexicographically, grouping "fuzzy" entries (wildcards and placeholders)
+		// at the front of the list; this allows us to use binary search for exact matches, which
+		// we have seen from experience is the most common kind of value in large lists; and any
+		// other kinds of values (wildcards and placeholders) are grouped in front so the linear
+		// search should find a match fairly quickly
+		sort.Slice(m, func(i, j int) bool {
+			iInexact, jInexact := m.fuzzy(m[i]), m.fuzzy(m[j])
+			if iInexact && !jInexact {
+				return true
+			}
+			if !iInexact && jInexact {
+				return false
+			}
+			return m[i] < m[j]
+		})
+	}
+
+	return nil
+}
+
 // Match returns true if r matches m.
 func (m MatchHost) Match(r *http.Request) bool {
 	reqHost, _, err := net.SplitHostPort(r.Host)
@ -179,10 +216,31 @@ func (m MatchHost) Match(r *http.Request) bool {
 		reqHost = strings.TrimSuffix(reqHost, "]")
 	}

+	if m.large() {
+		// fast path: locate exact match using binary search (about 100-1000x faster for large lists)
+		pos := sort.Search(len(m), func(i int) bool {
+			if m.fuzzy(m[i]) {
+				return false
+			}
+			return m[i] >= reqHost
+		})
+		if pos < len(m) && m[pos] == reqHost {
+			return true
+		}
+	}
+
 	repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer)

 outer:
 	for _, host := range m {
+		// fast path: if matcher is large, we already know we don't have an exact
+		// match, so we're only looking for fuzzy match now, which should be at the
+		// front of the list; if we have reached a value that is not fuzzy, there
+		// will be no match and we can short-circuit for efficiency
+		if m.large() && !m.fuzzy(host) {
+			break
+		}
+
 		host = repl.ReplaceAll(host, "")
 		if strings.Contains(host, "*") {
 			patternParts := strings.Split(host, ".")
@ -207,6 +265,15 @@ outer:
 	return false
 }

+// fuzzy returns true if the given hostname h is not a specific
+// hostname, e.g. has placeholders or wildcards.
+func (MatchHost) fuzzy(h string) bool { return strings.ContainsAny(h, "{*") }
+
+// large returns true if m is considered to be large. Optimizing
+// the matcher for smaller lists has diminishing returns.
+// See related benchmark function in test file to conduct experiments.
+func (m MatchHost) large() bool { return len(m) > 100 }
+
 // CaddyModule returns the Caddy module information.
 func (MatchPath) CaddyModule() caddy.ModuleInfo {
 	return caddy.ModuleInfo{
@ -909,6 +976,7 @@ const regexpPlaceholderPrefix = "http.regexp"
 // Interface guards
 var (
 	_ RequestMatcher    = (*MatchHost)(nil)
+	_ caddy.Provisioner = (*MatchHost)(nil)
 	_ RequestMatcher    = (*MatchPath)(nil)
 	_ RequestMatcher    = (*MatchPathRE)(nil)
 	_ caddy.Provisioner = (*MatchPathRE)(nil)
--- a/modules/caddyhttp/matchers_test.go
+++ b/modules/caddyhttp/matchers_test.go
@ -1018,6 +1018,33 @@ func TestNotMatcher(t *testing.T) {
 		}
 	}
 }
+func BenchmarkLargeHostMatcher(b *testing.B) {
+	// this benchmark simulates a large host matcher (thousands of entries) where each
+	// value is an exact hostname (not a placeholder or wildcard) - compare the results
+	// of this with and without the binary search (comment out the various fast path
+	// sections in Match) to conduct experiments
+
+	const n = 10000
+	lastHost := fmt.Sprintf("%d.example.com", n-1)
+	req := &http.Request{Host: lastHost}
+	repl := caddy.NewReplacer()
+	ctx := context.WithValue(req.Context(), caddy.ReplacerCtxKey, repl)
+	req = req.WithContext(ctx)
+
+	matcher := make(MatchHost, n)
+	for i := 0; i < n; i++ {
+		matcher[i] = fmt.Sprintf("%d.example.com", i)
+	}
+	err := matcher.Provision(caddy.Context{})
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		matcher.Match(req)
+	}
+}

 func BenchmarkHostMatcherWithoutPlaceholder(b *testing.B) {
 	req := &http.Request{Host: "localhost"}