From 9157051f45b8c1354b6c8432457ca4930ba90d9e Mon Sep 17 00:00:00 2001 From: Matthew Holt Date: Wed, 2 Dec 2020 13:26:28 -0700 Subject: [PATCH] caddyhttp: Optimize large host matchers --- modules/caddyhttp/matchers.go | 68 ++++++++++++++++++++++++++++++ modules/caddyhttp/matchers_test.go | 27 ++++++++++++ 2 files changed, 95 insertions(+) diff --git a/modules/caddyhttp/matchers.go b/modules/caddyhttp/matchers.go index d641cb68..c5cf21d8 100644 --- a/modules/caddyhttp/matchers.go +++ b/modules/caddyhttp/matchers.go @@ -23,6 +23,7 @@ import ( "net/url" "path/filepath" "regexp" + "sort" "strings" "github.com/caddyserver/caddy/v2" @@ -51,6 +52,8 @@ type ( // // The wildcard can be useful for matching all subdomains, for example: // `*.example.com` matches `foo.example.com` but not `foo.bar.example.com`. + // + // Duplicate entries will return an error. MatchHost []string // MatchPath matches requests by the URI's path (case-insensitive). Path @@ -167,6 +170,40 @@ func (m *MatchHost) UnmarshalCaddyfile(d *caddyfile.Dispenser) error { return nil } +// Provision sets up and validates m, including making it more efficient for large lists. +func (m MatchHost) Provision(_ caddy.Context) error { + // check for duplicates; they are nonsensical and reduce efficiency + // (we could just remove them, but the user should know their config is erroneous) + seen := make(map[string]int) + for i, h := range m { + h = strings.ToLower(h) + if firstI, ok := seen[h]; ok { + return fmt.Errorf("host at index %d is repeated at index %d: %s", firstI, i, h) + } + seen[h] = i + } + + if m.large() { + // sort the slice lexicographically, grouping "fuzzy" entries (wildcards and placeholders) + // at the front of the list; this allows us to use binary search for exact matches, which + // we have seen from experience is the most common kind of value in large lists; and any + // other kinds of values (wildcards and placeholders) are grouped in front so the linear + // search should find a match fairly quickly + sort.Slice(m, func(i, j int) bool { + iInexact, jInexact := m.fuzzy(m[i]), m.fuzzy(m[j]) + if iInexact && !jInexact { + return true + } + if !iInexact && jInexact { + return false + } + return m[i] < m[j] + }) + } + + return nil +} + // Match returns true if r matches m. func (m MatchHost) Match(r *http.Request) bool { reqHost, _, err := net.SplitHostPort(r.Host) @@ -179,10 +216,31 @@ func (m MatchHost) Match(r *http.Request) bool { reqHost = strings.TrimSuffix(reqHost, "]") } + if m.large() { + // fast path: locate exact match using binary search (about 100-1000x faster for large lists) + pos := sort.Search(len(m), func(i int) bool { + if m.fuzzy(m[i]) { + return false + } + return m[i] >= reqHost + }) + if pos < len(m) && m[pos] == reqHost { + return true + } + } + repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer) outer: for _, host := range m { + // fast path: if matcher is large, we already know we don't have an exact + // match, so we're only looking for fuzzy match now, which should be at the + // front of the list; if we have reached a value that is not fuzzy, there + // will be no match and we can short-circuit for efficiency + if m.large() && !m.fuzzy(host) { + break + } + host = repl.ReplaceAll(host, "") if strings.Contains(host, "*") { patternParts := strings.Split(host, ".") @@ -207,6 +265,15 @@ outer: return false } +// fuzzy returns true if the given hostname h is not a specific +// hostname, e.g. has placeholders or wildcards. +func (MatchHost) fuzzy(h string) bool { return strings.ContainsAny(h, "{*") } + +// large returns true if m is considered to be large. Optimizing +// the matcher for smaller lists has diminishing returns. +// See related benchmark function in test file to conduct experiments. +func (m MatchHost) large() bool { return len(m) > 100 } + // CaddyModule returns the Caddy module information. func (MatchPath) CaddyModule() caddy.ModuleInfo { return caddy.ModuleInfo{ @@ -909,6 +976,7 @@ const regexpPlaceholderPrefix = "http.regexp" // Interface guards var ( _ RequestMatcher = (*MatchHost)(nil) + _ caddy.Provisioner = (*MatchHost)(nil) _ RequestMatcher = (*MatchPath)(nil) _ RequestMatcher = (*MatchPathRE)(nil) _ caddy.Provisioner = (*MatchPathRE)(nil) diff --git a/modules/caddyhttp/matchers_test.go b/modules/caddyhttp/matchers_test.go index 51db67d8..ab4487d1 100644 --- a/modules/caddyhttp/matchers_test.go +++ b/modules/caddyhttp/matchers_test.go @@ -1018,6 +1018,33 @@ func TestNotMatcher(t *testing.T) { } } } +func BenchmarkLargeHostMatcher(b *testing.B) { + // this benchmark simulates a large host matcher (thousands of entries) where each + // value is an exact hostname (not a placeholder or wildcard) - compare the results + // of this with and without the binary search (comment out the various fast path + // sections in Match) to conduct experiments + + const n = 10000 + lastHost := fmt.Sprintf("%d.example.com", n-1) + req := &http.Request{Host: lastHost} + repl := caddy.NewReplacer() + ctx := context.WithValue(req.Context(), caddy.ReplacerCtxKey, repl) + req = req.WithContext(ctx) + + matcher := make(MatchHost, n) + for i := 0; i < n; i++ { + matcher[i] = fmt.Sprintf("%d.example.com", i) + } + err := matcher.Provision(caddy.Context{}) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + matcher.Match(req) + } +} func BenchmarkHostMatcherWithoutPlaceholder(b *testing.B) { req := &http.Request{Host: "localhost"}