caddyhttp: Optimize large host matchers

This commit is contained in:
Matthew Holt 2020-12-02 13:26:28 -07:00
parent 4cff36d731
commit 9157051f45
No known key found for this signature in database
GPG key ID: 2A349DD577D586A5
2 changed files with 95 additions and 0 deletions

View file

@ -23,6 +23,7 @@ import (
"net/url"
"path/filepath"
"regexp"
"sort"
"strings"
"github.com/caddyserver/caddy/v2"
@ -51,6 +52,8 @@ type (
//
// The wildcard can be useful for matching all subdomains, for example:
// `*.example.com` matches `foo.example.com` but not `foo.bar.example.com`.
//
// Duplicate entries will return an error.
MatchHost []string
// MatchPath matches requests by the URI's path (case-insensitive). Path
@ -167,6 +170,40 @@ func (m *MatchHost) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
return nil
}
// Provision sets up and validates m, including making it more efficient for large lists.
func (m MatchHost) Provision(_ caddy.Context) error {
// check for duplicates; they are nonsensical and reduce efficiency
// (we could just remove them, but the user should know their config is erroneous)
seen := make(map[string]int)
for i, h := range m {
h = strings.ToLower(h)
if firstI, ok := seen[h]; ok {
return fmt.Errorf("host at index %d is repeated at index %d: %s", firstI, i, h)
}
seen[h] = i
}
if m.large() {
// sort the slice lexicographically, grouping "fuzzy" entries (wildcards and placeholders)
// at the front of the list; this allows us to use binary search for exact matches, which
// we have seen from experience is the most common kind of value in large lists; and any
// other kinds of values (wildcards and placeholders) are grouped in front so the linear
// search should find a match fairly quickly
sort.Slice(m, func(i, j int) bool {
iInexact, jInexact := m.fuzzy(m[i]), m.fuzzy(m[j])
if iInexact && !jInexact {
return true
}
if !iInexact && jInexact {
return false
}
return m[i] < m[j]
})
}
return nil
}
// Match returns true if r matches m.
func (m MatchHost) Match(r *http.Request) bool {
reqHost, _, err := net.SplitHostPort(r.Host)
@ -179,10 +216,31 @@ func (m MatchHost) Match(r *http.Request) bool {
reqHost = strings.TrimSuffix(reqHost, "]")
}
if m.large() {
// fast path: locate exact match using binary search (about 100-1000x faster for large lists)
pos := sort.Search(len(m), func(i int) bool {
if m.fuzzy(m[i]) {
return false
}
return m[i] >= reqHost
})
if pos < len(m) && m[pos] == reqHost {
return true
}
}
repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer)
outer:
for _, host := range m {
// fast path: if matcher is large, we already know we don't have an exact
// match, so we're only looking for fuzzy match now, which should be at the
// front of the list; if we have reached a value that is not fuzzy, there
// will be no match and we can short-circuit for efficiency
if m.large() && !m.fuzzy(host) {
break
}
host = repl.ReplaceAll(host, "")
if strings.Contains(host, "*") {
patternParts := strings.Split(host, ".")
@ -207,6 +265,15 @@ outer:
return false
}
// fuzzy returns true if the given hostname h is not a specific
// hostname, e.g. has placeholders or wildcards.
func (MatchHost) fuzzy(h string) bool { return strings.ContainsAny(h, "{*") }
// large returns true if m is considered to be large. Optimizing
// the matcher for smaller lists has diminishing returns.
// See related benchmark function in test file to conduct experiments.
func (m MatchHost) large() bool { return len(m) > 100 }
// CaddyModule returns the Caddy module information.
func (MatchPath) CaddyModule() caddy.ModuleInfo {
return caddy.ModuleInfo{
@ -909,6 +976,7 @@ const regexpPlaceholderPrefix = "http.regexp"
// Interface guards
var (
_ RequestMatcher = (*MatchHost)(nil)
_ caddy.Provisioner = (*MatchHost)(nil)
_ RequestMatcher = (*MatchPath)(nil)
_ RequestMatcher = (*MatchPathRE)(nil)
_ caddy.Provisioner = (*MatchPathRE)(nil)

View file

@ -1018,6 +1018,33 @@ func TestNotMatcher(t *testing.T) {
}
}
}
func BenchmarkLargeHostMatcher(b *testing.B) {
// this benchmark simulates a large host matcher (thousands of entries) where each
// value is an exact hostname (not a placeholder or wildcard) - compare the results
// of this with and without the binary search (comment out the various fast path
// sections in Match) to conduct experiments
const n = 10000
lastHost := fmt.Sprintf("%d.example.com", n-1)
req := &http.Request{Host: lastHost}
repl := caddy.NewReplacer()
ctx := context.WithValue(req.Context(), caddy.ReplacerCtxKey, repl)
req = req.WithContext(ctx)
matcher := make(MatchHost, n)
for i := 0; i < n; i++ {
matcher[i] = fmt.Sprintf("%d.example.com", i)
}
err := matcher.Provision(caddy.Context{})
if err != nil {
b.Fatal(err)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
matcher.Match(req)
}
}
func BenchmarkHostMatcherWithoutPlaceholder(b *testing.B) {
req := &http.Request{Host: "localhost"}