mirror of
https://github.com/caddyserver/caddy.git
synced 2024-12-27 06:03:48 +03:00
caddyhttp: Optimize large host matchers
This commit is contained in:
parent
4cff36d731
commit
9157051f45
2 changed files with 95 additions and 0 deletions
|
@ -23,6 +23,7 @@ import (
|
|||
"net/url"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/caddyserver/caddy/v2"
|
||||
|
@ -51,6 +52,8 @@ type (
|
|||
//
|
||||
// The wildcard can be useful for matching all subdomains, for example:
|
||||
// `*.example.com` matches `foo.example.com` but not `foo.bar.example.com`.
|
||||
//
|
||||
// Duplicate entries will return an error.
|
||||
MatchHost []string
|
||||
|
||||
// MatchPath matches requests by the URI's path (case-insensitive). Path
|
||||
|
@ -167,6 +170,40 @@ func (m *MatchHost) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// Provision sets up and validates m, including making it more efficient for large lists.
|
||||
func (m MatchHost) Provision(_ caddy.Context) error {
|
||||
// check for duplicates; they are nonsensical and reduce efficiency
|
||||
// (we could just remove them, but the user should know their config is erroneous)
|
||||
seen := make(map[string]int)
|
||||
for i, h := range m {
|
||||
h = strings.ToLower(h)
|
||||
if firstI, ok := seen[h]; ok {
|
||||
return fmt.Errorf("host at index %d is repeated at index %d: %s", firstI, i, h)
|
||||
}
|
||||
seen[h] = i
|
||||
}
|
||||
|
||||
if m.large() {
|
||||
// sort the slice lexicographically, grouping "fuzzy" entries (wildcards and placeholders)
|
||||
// at the front of the list; this allows us to use binary search for exact matches, which
|
||||
// we have seen from experience is the most common kind of value in large lists; and any
|
||||
// other kinds of values (wildcards and placeholders) are grouped in front so the linear
|
||||
// search should find a match fairly quickly
|
||||
sort.Slice(m, func(i, j int) bool {
|
||||
iInexact, jInexact := m.fuzzy(m[i]), m.fuzzy(m[j])
|
||||
if iInexact && !jInexact {
|
||||
return true
|
||||
}
|
||||
if !iInexact && jInexact {
|
||||
return false
|
||||
}
|
||||
return m[i] < m[j]
|
||||
})
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Match returns true if r matches m.
|
||||
func (m MatchHost) Match(r *http.Request) bool {
|
||||
reqHost, _, err := net.SplitHostPort(r.Host)
|
||||
|
@ -179,10 +216,31 @@ func (m MatchHost) Match(r *http.Request) bool {
|
|||
reqHost = strings.TrimSuffix(reqHost, "]")
|
||||
}
|
||||
|
||||
if m.large() {
|
||||
// fast path: locate exact match using binary search (about 100-1000x faster for large lists)
|
||||
pos := sort.Search(len(m), func(i int) bool {
|
||||
if m.fuzzy(m[i]) {
|
||||
return false
|
||||
}
|
||||
return m[i] >= reqHost
|
||||
})
|
||||
if pos < len(m) && m[pos] == reqHost {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
repl := r.Context().Value(caddy.ReplacerCtxKey).(*caddy.Replacer)
|
||||
|
||||
outer:
|
||||
for _, host := range m {
|
||||
// fast path: if matcher is large, we already know we don't have an exact
|
||||
// match, so we're only looking for fuzzy match now, which should be at the
|
||||
// front of the list; if we have reached a value that is not fuzzy, there
|
||||
// will be no match and we can short-circuit for efficiency
|
||||
if m.large() && !m.fuzzy(host) {
|
||||
break
|
||||
}
|
||||
|
||||
host = repl.ReplaceAll(host, "")
|
||||
if strings.Contains(host, "*") {
|
||||
patternParts := strings.Split(host, ".")
|
||||
|
@ -207,6 +265,15 @@ outer:
|
|||
return false
|
||||
}
|
||||
|
||||
// fuzzy returns true if the given hostname h is not a specific
|
||||
// hostname, e.g. has placeholders or wildcards.
|
||||
func (MatchHost) fuzzy(h string) bool { return strings.ContainsAny(h, "{*") }
|
||||
|
||||
// large returns true if m is considered to be large. Optimizing
|
||||
// the matcher for smaller lists has diminishing returns.
|
||||
// See related benchmark function in test file to conduct experiments.
|
||||
func (m MatchHost) large() bool { return len(m) > 100 }
|
||||
|
||||
// CaddyModule returns the Caddy module information.
|
||||
func (MatchPath) CaddyModule() caddy.ModuleInfo {
|
||||
return caddy.ModuleInfo{
|
||||
|
@ -909,6 +976,7 @@ const regexpPlaceholderPrefix = "http.regexp"
|
|||
// Interface guards
|
||||
var (
|
||||
_ RequestMatcher = (*MatchHost)(nil)
|
||||
_ caddy.Provisioner = (*MatchHost)(nil)
|
||||
_ RequestMatcher = (*MatchPath)(nil)
|
||||
_ RequestMatcher = (*MatchPathRE)(nil)
|
||||
_ caddy.Provisioner = (*MatchPathRE)(nil)
|
||||
|
|
|
@ -1018,6 +1018,33 @@ func TestNotMatcher(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
func BenchmarkLargeHostMatcher(b *testing.B) {
|
||||
// this benchmark simulates a large host matcher (thousands of entries) where each
|
||||
// value is an exact hostname (not a placeholder or wildcard) - compare the results
|
||||
// of this with and without the binary search (comment out the various fast path
|
||||
// sections in Match) to conduct experiments
|
||||
|
||||
const n = 10000
|
||||
lastHost := fmt.Sprintf("%d.example.com", n-1)
|
||||
req := &http.Request{Host: lastHost}
|
||||
repl := caddy.NewReplacer()
|
||||
ctx := context.WithValue(req.Context(), caddy.ReplacerCtxKey, repl)
|
||||
req = req.WithContext(ctx)
|
||||
|
||||
matcher := make(MatchHost, n)
|
||||
for i := 0; i < n; i++ {
|
||||
matcher[i] = fmt.Sprintf("%d.example.com", i)
|
||||
}
|
||||
err := matcher.Provision(caddy.Context{})
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
matcher.Match(req)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkHostMatcherWithoutPlaceholder(b *testing.B) {
|
||||
req := &http.Request{Host: "localhost"}
|
||||
|
|
Loading…
Reference in a new issue