diff --git a/go.mod b/go.mod
index b44c040274..499e215fd1 100644
--- a/go.mod
+++ b/go.mod
@@ -11,10 +11,10 @@ require (
 	gitea.com/macaron/captcha v0.0.0-20190822015246-daa973478bae
 	gitea.com/macaron/cors v0.0.0-20190826180238-95aec09ea8b4
 	gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439
-	gitea.com/macaron/gzip v0.0.0-20191118041502-506895b47aae
+	gitea.com/macaron/gzip v0.0.0-20200827120000-efa5e8477cf5
 	gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223
 	gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a
-	gitea.com/macaron/macaron v1.4.0
+	gitea.com/macaron/macaron v1.5.0
 	gitea.com/macaron/session v0.0.0-20191207215012-613cebf0674d
 	gitea.com/macaron/toolbox v0.0.0-20190822013122-05ff0fc766b7
 	github.com/BurntSushi/toml v0.3.1
@@ -61,7 +61,7 @@ require (
 	github.com/jmhodges/levigo v1.0.0 // indirect
 	github.com/kballard/go-shellquote v0.0.0-20170619183022-cd60e84ee657
 	github.com/keybase/go-crypto v0.0.0-20200123153347-de78d2cb44f4
-	github.com/klauspost/compress v1.10.2
+	github.com/klauspost/compress v1.10.11
 	github.com/lafriks/xormstore v1.3.2
 	github.com/lib/pq v1.7.0
 	github.com/lunny/dingtalk_webhook v0.0.0-20171025031554-e3534c89ef96
@@ -104,7 +104,7 @@ require (
 	github.com/yuin/goldmark v1.2.1
 	github.com/yuin/goldmark-highlighting v0.0.0-20200307114337-60d527fdb691
 	github.com/yuin/goldmark-meta v0.0.0-20191126180153-f0638e958b60
-	golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de
+	golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a
 	golang.org/x/net v0.0.0-20200707034311-ab3426394381
 	golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d
 	golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae
@@ -115,7 +115,7 @@ require (
 	gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect
 	gopkg.in/asn1-ber.v1 v1.0.0-20150924051756-4e86f4367175 // indirect
 	gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df
-	gopkg.in/ini.v1 v1.57.0
+	gopkg.in/ini.v1 v1.60.2
 	gopkg.in/ldap.v3 v3.0.2
 	gopkg.in/yaml.v2 v2.3.0
 	mvdan.cc/xurls/v2 v2.1.0
diff --git a/go.sum b/go.sum
index 7d4ca0ffb5..3ea4c6e220 100644
--- a/go.sum
+++ b/go.sum
@@ -23,8 +23,8 @@ gitea.com/macaron/cors v0.0.0-20190826180238-95aec09ea8b4 h1:e2rAFDejB0qN8OrY4xP
 gitea.com/macaron/cors v0.0.0-20190826180238-95aec09ea8b4/go.mod h1:rtOK4J20kpMD9XcNsnO5YA843YSTe/MUMbDj/TJ/Q7A=
 gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439 h1:88c34YM29a1GlWLrLBaG/GTT2htDdJz1u3n9+lmPolg=
 gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439/go.mod h1:IsQPHx73HnnqFBYiVHjg87q4XBZyGXXu77xANukvZuk=
-gitea.com/macaron/gzip v0.0.0-20191118041502-506895b47aae h1:OXxYwGmGNfYrC0/sUUL9KSvr2Sfvmzwgd2YD65vIjGE=
-gitea.com/macaron/gzip v0.0.0-20191118041502-506895b47aae/go.mod h1:jGHtoovArcQj+sw7NJxyPgjuRxOSG9a/oFu3VkLRTKQ=
+gitea.com/macaron/gzip v0.0.0-20200827120000-efa5e8477cf5 h1:6rbhThlqfOb+sSmhrsVFz3bZoAeoloe7TZqyeiPbbWI=
+gitea.com/macaron/gzip v0.0.0-20200827120000-efa5e8477cf5/go.mod h1:z8vCjuhqDfvzPUJDowGqbsgoeYBvDbl95S5k6y43Pxo=
 gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223 h1:iZWwQif/LHMjBgfY/ua8CFVa4XMDfbbs7EZ0Q1dYguU=
 gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223/go.mod h1:+qsc10s4hBsHKU/9luGGumFh4m5FFVc7uih+8/mM1NY=
 gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591/go.mod h1:h6E4kLao1Yko6DOU6QDnQPcuoNzvbZqzj2mtPcEn1aM=
@@ -33,8 +33,8 @@ gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a/go.mod h1:h6E4kLao1Y
 gitea.com/macaron/macaron v1.3.3-0.20190803174002-53e005ff4827/go.mod h1:/rvxMjIkOq4BM8uPUb+VHuU02ZfAO6R4+wD//tiCiRw=
 gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb h1:amL0md6orTj1tXY16ANzVU9FmzQB+W7aJwp8pVDbrmA=
 gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb/go.mod h1:0coI+mSPSwbsyAbOuFllVS38awuk9mevhLD52l50Gjs=
-gitea.com/macaron/macaron v1.4.0 h1:FY1QDGqyuUzs21K6ChkbYbRUfwL7v2aUrhNEJ0IgsAw=
-gitea.com/macaron/macaron v1.4.0/go.mod h1:P7hfDbQjcW22lkYkXlxdRIfWOXxH2+K4EogN4Q0UlLY=
+gitea.com/macaron/macaron v1.5.0 h1:TvWEcHw1/zaHlo0GTuKEukLh3A99+QsU2mjBrXLXjVQ=
+gitea.com/macaron/macaron v1.5.0/go.mod h1:P7hfDbQjcW22lkYkXlxdRIfWOXxH2+K4EogN4Q0UlLY=
 gitea.com/macaron/session v0.0.0-20190821211443-122c47c5f705 h1:mvkQGAlON1Z6Y8pqa/+FpYIskk54mazuECUfZK5oTg0=
 gitea.com/macaron/session v0.0.0-20190821211443-122c47c5f705/go.mod h1:1ujH0jD6Ca4iK9NL0Q2a7fG2chvXx5hVa7hBfABwpkA=
 gitea.com/macaron/session v0.0.0-20191207215012-613cebf0674d h1:XLww3CvnFZkXVwauN67fniDaIpIqsE+9KVcxlZKlvLU=
@@ -489,8 +489,8 @@ github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvW
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
 github.com/klauspost/compress v1.9.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
-github.com/klauspost/compress v1.10.2 h1:Znfn6hXZAHaLPNnlqUYRrBSReFHYybslgv4PTiyz6P0=
-github.com/klauspost/compress v1.10.2/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
+github.com/klauspost/compress v1.10.11 h1:K9z59aO18Aywg2b/WSgBaUX99mHy2BES18Cr5lBKZHk=
+github.com/klauspost/compress v1.10.11/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
 github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
 github.com/klauspost/cpuid v1.2.3 h1:CCtW0xUnWGVINKvE/WWOYKdsPV6mawAtvQuSl8guwQs=
 github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
@@ -819,8 +819,8 @@ golang.org/x/crypto v0.0.0-20200302210943-78000ba7a073/go.mod h1:LzIPMQfyMNhhGPh
 golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20200709230013-948cd5f35899/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de h1:ikNHVSjEfnvz6sxdSPCaPt572qowuyMDMJLLm3Db3ig=
-golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a h1:vclmkQCjlDX5OydZ9wv8rBCcS0QyQY66Mpf/7BZbInM=
+golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
 golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
@@ -1014,6 +1014,10 @@ gopkg.in/ini.v1 v1.44.2/go.mod h1:M3Cogqpuv0QCi3ExAY5V4uOt4qb/R3xZubo9m8lK5wg=
 gopkg.in/ini.v1 v1.46.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/ini.v1 v1.57.0 h1:9unxIsFcTt4I55uWluz+UmL95q4kdJ0buvQ1ZIqVQww=
 gopkg.in/ini.v1 v1.57.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
+gopkg.in/ini.v1 v1.60.1 h1:P5y5shSkb0CFe44qEeMBgn8JLow09MP17jlJHanke5g=
+gopkg.in/ini.v1 v1.60.1/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
+gopkg.in/ini.v1 v1.60.2 h1:7i8mqModL63zqi8nQn8Q3+0zvSCZy1AxhBgthKfi4WU=
+gopkg.in/ini.v1 v1.60.2/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/ldap.v3 v3.0.2 h1:R6RBtabK6e1GO0eQKtkyOFbAHO73QesLzI2w2DZ6b9w=
 gopkg.in/ldap.v3 v3.0.2/go.mod h1:oxD7NyBuxchC+SgJDE1Q5Od05eGt29SDQVBmV+HYbzw=
 gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo=
diff --git a/modules/test/context_tests.go b/modules/test/context_tests.go
index af47369ee1..874d7db196 100644
--- a/modules/test/context_tests.go
+++ b/modules/test/context_tests.go
@@ -117,6 +117,10 @@ func (rw *mockResponseWriter) Before(b macaron.BeforeFunc) {
+func (rw *mockResponseWriter) Push(target string, opts *http.PushOptions) error {
+	return nil
 type mockRender struct {
diff --git a/vendor/gitea.com/macaron/gzip/.drone.yml b/vendor/gitea.com/macaron/gzip/.drone.yml
index e55afaac04..087a19664c 100644
--- a/vendor/gitea.com/macaron/gzip/.drone.yml
+++ b/vendor/gitea.com/macaron/gzip/.drone.yml
@@ -1,9 +1,9 @@
 kind: pipeline
-name: go1-1-2
+name: go1-14
 - name: test
-  image: golang:1.12
+  image: golang:1.14
     GOPROXY: https://goproxy.cn
@@ -12,11 +12,11 @@ steps:
 kind: pipeline
-name: go1-1-3
+name: go1-15
 - name: test
-  image: golang:1.13
+  image: golang:1.15
     GOPROXY: https://goproxy.cn
diff --git a/vendor/gitea.com/macaron/gzip/go.mod b/vendor/gitea.com/macaron/gzip/go.mod
index e66cacaa30..877ee7e37a 100644
--- a/vendor/gitea.com/macaron/gzip/go.mod
+++ b/vendor/gitea.com/macaron/gzip/go.mod
@@ -3,7 +3,9 @@ module gitea.com/macaron/gzip
 go 1.12
 require (
-	gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb
+	gitea.com/macaron/macaron v1.5.0
 	github.com/klauspost/compress v1.9.2
 	github.com/stretchr/testify v1.4.0
+	golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a // indirect
+	gopkg.in/ini.v1 v1.60.1 // indirect
diff --git a/vendor/gitea.com/macaron/gzip/go.sum b/vendor/gitea.com/macaron/gzip/go.sum
index 292be5f957..b34055da3e 100644
--- a/vendor/gitea.com/macaron/gzip/go.sum
+++ b/vendor/gitea.com/macaron/gzip/go.sum
@@ -1,7 +1,7 @@
-gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591 h1:UbCTjPcLrNxR9LzKDjQBMT2zoxZuEnca1pZCpgeMuhQ=
-gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591/go.mod h1:h6E4kLao1Yko6DOU6QDnQPcuoNzvbZqzj2mtPcEn1aM=
-gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb h1:amL0md6orTj1tXY16ANzVU9FmzQB+W7aJwp8pVDbrmA=
-gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb/go.mod h1:0coI+mSPSwbsyAbOuFllVS38awuk9mevhLD52l50Gjs=
+gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a h1:aOKEXkDTnh4euoH0so/THLXeHtQuqHmDPb1xEk6Ehok=
+gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a/go.mod h1:h6E4kLao1Yko6DOU6QDnQPcuoNzvbZqzj2mtPcEn1aM=
+gitea.com/macaron/macaron v1.5.0 h1:TvWEcHw1/zaHlo0GTuKEukLh3A99+QsU2mjBrXLXjVQ=
+gitea.com/macaron/macaron v1.5.0/go.mod h1:P7hfDbQjcW22lkYkXlxdRIfWOXxH2+K4EogN4Q0UlLY=
 github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
@@ -17,17 +17,21 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN
 github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
 github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304 h1:Jpy1PXuP99tXNrhbq2BaPz9B+jNAvH1JPQQpG/9GCXY=
 github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
+github.com/smartystreets/assertions v1.0.1 h1:voD4ITNjPL5jjBfgR/r8fPIIBrliWrWHeiJApdr3r4w=
+github.com/smartystreets/assertions v1.0.1/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM=
 github.com/smartystreets/goconvey v0.0.0-20181108003508-044398e4856c/go.mod h1:XDJAKZRPZ1CvBcN2aX5YOUTYGHki24fSF0Iv48Ibg0s=
 github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337 h1:WN9BUFbdyOsSH/XohnWpXOlq9NBD5sGAB2FciQMUEe8=
 github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
 github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
-github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e h1:GSGeB9EAKY2spCABz6xOX5DbxZEXolK+nBSvmsQwRjM=
-github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
+github.com/unknwon/com v1.0.1 h1:3d1LTxD+Lnf3soQiD4Cp/0BRB+Rsa/+RTvz8GMMzIXs=
+github.com/unknwon/com v1.0.1/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 h1:HuIa8hRrWRSrqYzx1qI49NNxhdi2PrY7gxVSq1JjLDc=
 golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a h1:vclmkQCjlDX5OydZ9wv8rBCcS0QyQY66Mpf/7BZbInM=
+golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -38,5 +42,7 @@ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/ini.v1 v1.44.0 h1:YRJzTUp0kSYWUVFF5XAbDFfyiqwsl0Vb9R8TVP5eRi0=
 gopkg.in/ini.v1 v1.44.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
+gopkg.in/ini.v1 v1.60.1 h1:P5y5shSkb0CFe44qEeMBgn8JLow09MP17jlJHanke5g=
+gopkg.in/ini.v1 v1.60.1/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/vendor/gitea.com/macaron/gzip/gzip.go b/vendor/gitea.com/macaron/gzip/gzip.go
index 9573d167ab..cd93e03f68 100644
--- a/vendor/gitea.com/macaron/gzip/gzip.go
+++ b/vendor/gitea.com/macaron/gzip/gzip.go
@@ -6,6 +6,7 @@ package gzip
 import (
+	"errors"
@@ -331,6 +332,15 @@ func (proxy *ProxyResponseWriter) Flush() {
+// Push implements http.Pusher for HTTP/2 Push purposes
+func (proxy *ProxyResponseWriter) Push(target string, opts *http.PushOptions) error {
+	pusher, ok := proxy.internal.(http.Pusher)
+	if !ok {
+		return errors.New("the ResponseWriter doesn't support the Pusher interface")
+	}
+	return pusher.Push(target, opts)
 // Hijack implements http.Hijacker. If the underlying ResponseWriter is a
 // Hijacker, its Hijack method is returned. Otherwise an error is returned.
 func (proxy *ProxyResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
diff --git a/vendor/gitea.com/macaron/macaron/.drone.yml b/vendor/gitea.com/macaron/macaron/.drone.yml
index 06ecc018e3..06a3e7d853 100644
--- a/vendor/gitea.com/macaron/macaron/.drone.yml
+++ b/vendor/gitea.com/macaron/macaron/.drone.yml
@@ -3,7 +3,9 @@ name: default
 - name: test
-  image: golang:1.12
+  image: golang:1.13
+  environment:
+    GOPROXY: https://goproxy.cn
   - go get -u
   - go build -v
diff --git a/vendor/gitea.com/macaron/macaron/README.md b/vendor/gitea.com/macaron/macaron/README.md
index 97b782ed87..34830c8b17 100644
--- a/vendor/gitea.com/macaron/macaron/README.md
+++ b/vendor/gitea.com/macaron/macaron/README.md
@@ -1,5 +1,9 @@
-Macaron [![Build Status](https://travis-ci.org/go-macaron/macaron.svg?branch=v1)](https://travis-ci.org/go-macaron/macaron)
+# Macaron
+[![GitHub Workflow Status](https://img.shields.io/github/workflow/status/go-macaron/macaron/Go?logo=github&style=for-the-badge)](https://github.com/go-macaron/macaron/actions?query=workflow%3AGo)
 ![Macaron Logo](https://raw.githubusercontent.com/go-macaron/macaron/v1/macaronlogo.png)
diff --git a/vendor/gitea.com/macaron/macaron/context.go b/vendor/gitea.com/macaron/macaron/context.go
index a3f2c69e03..35a7e7017c 100644
--- a/vendor/gitea.com/macaron/macaron/context.go
+++ b/vendor/gitea.com/macaron/macaron/context.go
@@ -193,9 +193,9 @@ func (ctx *Context) parseForm() {
 	contentType := ctx.Req.Header.Get(_CONTENT_TYPE)
 	if (ctx.Req.Method == "POST" || ctx.Req.Method == "PUT") &&
 		len(contentType) > 0 && strings.Contains(contentType, "multipart/form-data") {
-		ctx.Req.ParseMultipartForm(MaxMemory)
+		_ = ctx.Req.ParseMultipartForm(MaxMemory)
 	} else {
-		ctx.Req.ParseForm()
+		_ = ctx.Req.ParseForm()
@@ -260,6 +260,11 @@ func (ctx *Context) Params(name string) string {
 	return ctx.params[name]
+// AllParams returns all params.
+func (ctx *Context) AllParams() Params {
+	return ctx.params
 // SetParams sets value of param with given name.
 func (ctx *Context) SetParams(name, val string) {
 	if name != "*" && !strings.HasPrefix(name, ":") {
diff --git a/vendor/gitea.com/macaron/macaron/recovery.go b/vendor/gitea.com/macaron/macaron/recovery.go
index 49c3096d64..1d301f7218 100644
--- a/vendor/gitea.com/macaron/macaron/recovery.go
+++ b/vendor/gitea.com/macaron/macaron/recovery.go
@@ -153,7 +153,7 @@ func Recovery() Handler {
 				if nil != body {
-					res.Write(body)
+					_, _ = res.Write(body)
diff --git a/vendor/gitea.com/macaron/macaron/render.go b/vendor/gitea.com/macaron/macaron/render.go
index 6f2e93f21b..04687c4f40 100644
--- a/vendor/gitea.com/macaron/macaron/render.go
+++ b/vendor/gitea.com/macaron/macaron/render.go
@@ -36,7 +36,6 @@ import (
 const (
 	_CONTENT_TYPE    = "Content-Type"
-	_CONTENT_LENGTH  = "Content-Length"
 	_CONTENT_BINARY  = "application/octet-stream"
 	_CONTENT_JSON    = "application/json"
 	_CONTENT_HTML    = "text/html"
@@ -200,7 +199,7 @@ func NewTemplateFileSystem(opt RenderOptions, omitData bool) TplFileSystem {
 	lastDir := dirs[len(dirs)-1]
 	// We still walk the last (original) directory because it's non-sense we load templates not exist in original directory.
-	if err = filepath.Walk(lastDir, func(path string, info os.FileInfo, err error) error {
+	if err = filepath.Walk(lastDir, func(path string, info os.FileInfo, _ error) error {
 		r, err := filepath.Rel(lastDir, path)
 		if err != nil {
 			return err
@@ -458,9 +457,9 @@ func (r *TplRender) JSON(status int, v interface{}) {
 	r.Header().Set(_CONTENT_TYPE, _CONTENT_JSON+r.CompiledCharset)
 	if len(r.Opt.PrefixJSON) > 0 {
-		r.Write(r.Opt.PrefixJSON)
+		_, _ = r.Write(r.Opt.PrefixJSON)
-	r.Write(result)
+	_, _ = r.Write(result)
 func (r *TplRender) JSONString(v interface{}) (string, error) {
@@ -494,9 +493,9 @@ func (r *TplRender) XML(status int, v interface{}) {
 	r.Header().Set(_CONTENT_TYPE, _CONTENT_XML+r.CompiledCharset)
 	if len(r.Opt.PrefixXML) > 0 {
-		r.Write(r.Opt.PrefixXML)
+		_, _ = r.Write(r.Opt.PrefixXML)
-	r.Write(result)
+	_, _ = r.Write(result)
 func (r *TplRender) data(status int, contentType string, v []byte) {
@@ -504,7 +503,7 @@ func (r *TplRender) data(status int, contentType string, v []byte) {
 		r.Header().Set(_CONTENT_TYPE, contentType)
-	r.Write(v)
+	_, _ = r.Write(v)
 func (r *TplRender) RawData(status int, v []byte) {
@@ -612,7 +611,7 @@ func (r *TplRender) HTMLString(name string, data interface{}, htmlOpt ...HTMLOpt
 func (r *TplRender) Error(status int, message ...string) {
 	if len(message) > 0 {
-		r.Write([]byte(message[0]))
+		_, _ = r.Write([]byte(message[0]))
diff --git a/vendor/gitea.com/macaron/macaron/response_writer.go b/vendor/gitea.com/macaron/macaron/response_writer.go
index 9133948f9b..eeb35f642e 100644
--- a/vendor/gitea.com/macaron/macaron/response_writer.go
+++ b/vendor/gitea.com/macaron/macaron/response_writer.go
@@ -16,7 +16,7 @@ package macaron
 import (
-	"fmt"
+	"errors"
@@ -27,6 +27,7 @@ import (
 type ResponseWriter interface {
+	http.Pusher
 	// Status returns the status code of the response or 0 if the response has not been written.
 	Status() int
 	// Written returns whether or not the ResponseWriter has been written.
@@ -91,11 +92,12 @@ func (rw *responseWriter) Before(before BeforeFunc) {
 func (rw *responseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
 	hijacker, ok := rw.ResponseWriter.(http.Hijacker)
 	if !ok {
-		return nil, nil, fmt.Errorf("the ResponseWriter doesn't support the Hijacker interface")
+		return nil, nil, errors.New("the ResponseWriter doesn't support the Hijacker interface")
 	return hijacker.Hijack()
 func (rw *responseWriter) CloseNotify() <-chan bool {
 	return rw.ResponseWriter.(http.CloseNotifier).CloseNotify()
@@ -112,3 +114,11 @@ func (rw *responseWriter) Flush() {
+func (rw *responseWriter) Push(target string, opts *http.PushOptions) error {
+	pusher, ok := rw.ResponseWriter.(http.Pusher)
+	if !ok {
+		return errors.New("the ResponseWriter doesn't support the Pusher interface")
+	}
+	return pusher.Push(target, opts)
diff --git a/vendor/gitea.com/macaron/macaron/return_handler.go b/vendor/gitea.com/macaron/macaron/return_handler.go
index 2e37c78a32..10b2c2283f 100644
--- a/vendor/gitea.com/macaron/macaron/return_handler.go
+++ b/vendor/gitea.com/macaron/macaron/return_handler.go
@@ -68,9 +68,9 @@ func defaultReturnHandler() ReturnHandler {
 			respVal = respVal.Elem()
 		if isByteSlice(respVal) {
-			resp.Write(respVal.Bytes())
+			_, _ = resp.Write(respVal.Bytes())
 		} else {
-			resp.Write([]byte(respVal.String()))
+			_, _ = resp.Write([]byte(respVal.String()))
diff --git a/vendor/gitea.com/macaron/macaron/static.go b/vendor/gitea.com/macaron/macaron/static.go
index 60c521110e..04d91d8c9a 100644
--- a/vendor/gitea.com/macaron/macaron/static.go
+++ b/vendor/gitea.com/macaron/macaron/static.go
@@ -17,6 +17,7 @@ package macaron
 import (
+	"fmt"
@@ -148,9 +149,15 @@ func staticHandler(ctx *Context, log *log.Logger, opt StaticOptions) bool {
 	// Try to serve index file
 	if fi.IsDir() {
+		redirPath := path.Clean(ctx.Req.URL.Path)
+		// path.Clean removes the trailing slash, so we need to add it back when
+		// the original path has it.
+		if strings.HasSuffix(ctx.Req.URL.Path, "/") {
+			redirPath = redirPath + "/"
+		}
 		// Redirect if missing trailing slash.
-		if !strings.HasSuffix(ctx.Req.URL.Path, "/") {
-			http.Redirect(ctx.Resp, ctx.Req.Request, ctx.Req.URL.Path+"/", http.StatusFound)
+		if !strings.HasSuffix(redirPath, "/") {
+			http.Redirect(ctx.Resp, ctx.Req.Request, redirPath+"/", http.StatusFound)
 			return true
@@ -177,8 +184,12 @@ func staticHandler(ctx *Context, log *log.Logger, opt StaticOptions) bool {
 	if opt.ETag {
-		tag := GenerateETag(string(fi.Size()), fi.Name(), fi.ModTime().UTC().Format(http.TimeFormat))
+		tag := `"` + GenerateETag(fmt.Sprintf("%d", fi.Size()), fi.Name(), fi.ModTime().UTC().Format(http.TimeFormat)) + `"`
 		ctx.Resp.Header().Set("ETag", tag)
+		if ctx.Req.Header.Get("If-None-Match") == tag {
+			ctx.Resp.WriteHeader(http.StatusNotModified)
+			return true
+		}
 	http.ServeContent(ctx.Resp, ctx.Req.Request, file, fi.ModTime(), f)
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
index 2b101d26b2..25dbe3e15f 100644
--- a/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -80,9 +80,7 @@ type advancedState struct {
 	// deflate state
 	length         int
 	offset         int
-	hash           uint32
 	maxInsertIndex int
-	ii             uint16 // position of last match, intended to overflow to reset.
 	// Input hash chains
 	// hashHead[hashValue] contains the largest inputIndex with the specified hash value
@@ -97,6 +95,9 @@ type advancedState struct {
 	// input window: unprocessed data is window[index:windowEnd]
 	index     int
 	hashMatch [maxMatchLength + minMatchLength]uint32
+	hash uint32
+	ii   uint16 // position of last match, intended to overflow to reset.
 type compressor struct {
@@ -107,18 +108,19 @@ type compressor struct {
 	// compression algorithm
 	fill func(*compressor, []byte) int // copy data to window
 	step func(*compressor)             // process window
-	sync bool                          // requesting flush
-	window        []byte
-	windowEnd     int
-	blockStart    int  // window index where current tokens start
-	byteAvailable bool // if true, still need to process window[index-1].
-	err           error
+	window     []byte
+	windowEnd  int
+	blockStart int // window index where current tokens start
+	err        error
 	// queued output tokens
 	tokens tokens
 	fast   fastEnc
 	state  *advancedState
+	sync          bool // requesting flush
+	byteAvailable bool // if true, still need to process window[index-1].
 func (d *compressor) fillDeflate(b []byte) int {
diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go
index 7f175a4ec2..3e4259f157 100644
--- a/vendor/github.com/klauspost/compress/flate/inflate.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@@ -295,10 +295,6 @@ type decompressor struct {
 	r       Reader
 	roffset int64
-	// Input bits, in top of b.
-	b  uint32
-	nb uint
 	// Huffman decoders for literal/length, distance.
 	h1, h2 huffmanDecoder
@@ -309,19 +305,24 @@ type decompressor struct {
 	// Output history, buffer.
 	dict dictDecoder
-	// Temporary buffer (avoids repeated allocation).
-	buf [4]byte
 	// Next step in the decompression,
 	// and decompression state.
 	step      func(*decompressor)
 	stepState int
-	final     bool
 	err       error
 	toRead    []byte
 	hl, hd    *huffmanDecoder
 	copyLen   int
 	copyDist  int
+	// Temporary buffer (avoids repeated allocation).
+	buf [4]byte
+	// Input bits, in top of b.
+	b uint32
+	nb    uint
+	final bool
 func (f *decompressor) nextBlock() {
diff --git a/vendor/github.com/klauspost/compress/fse/bitreader.go b/vendor/github.com/klauspost/compress/fse/bitreader.go
index b9db204f59..f65eb3909c 100644
--- a/vendor/github.com/klauspost/compress/fse/bitreader.go
+++ b/vendor/github.com/klauspost/compress/fse/bitreader.go
@@ -6,6 +6,7 @@
 package fse
 import (
+	"encoding/binary"
@@ -34,8 +35,12 @@ func (b *bitReader) init(in []byte) error {
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
-	b.fill()
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
 	b.bitsRead += 8 - uint8(highBits(uint32(v)))
 	return nil
@@ -63,8 +68,9 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
-	// Do single re-slice to avoid bounds checks.
-	v := b.in[b.off-4 : b.off]
+	// 2 bounds checks.
+	v := b.in[b.off-4:]
+	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
@@ -77,7 +83,8 @@ func (b *bitReader) fill() {
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
+		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@@ -91,9 +98,17 @@ func (b *bitReader) fill() {
+// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
+func (b *bitReader) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
 // finished returns true if all bits have been read from the bit stream.
 func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
+	return b.bitsRead >= 64 && b.off == 0
 // close the bitstream and returns an error if out-of-buffer reads occurred.
diff --git a/vendor/github.com/klauspost/compress/fse/bytereader.go b/vendor/github.com/klauspost/compress/fse/bytereader.go
index f228a46cdf..abade2d605 100644
--- a/vendor/github.com/klauspost/compress/fse/bytereader.go
+++ b/vendor/github.com/klauspost/compress/fse/bytereader.go
@@ -25,19 +25,10 @@ func (b *byteReader) advance(n uint) {
 	b.off += int(n)
-// Int32 returns a little endian int32 starting at current offset.
-func (b byteReader) Int32() int32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
-	v3 := int32(b2[3])
-	v2 := int32(b2[2])
-	v1 := int32(b2[1])
-	v0 := int32(b2[0])
-	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
 // Uint32 returns a little endian uint32 starting at current offset.
 func (b byteReader) Uint32() uint32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
 	v3 := uint32(b2[3])
 	v2 := uint32(b2[2])
 	v1 := uint32(b2[1])
diff --git a/vendor/github.com/klauspost/compress/fse/fse.go b/vendor/github.com/klauspost/compress/fse/fse.go
index 075357b5b1..535cbadfde 100644
--- a/vendor/github.com/klauspost/compress/fse/fse.go
+++ b/vendor/github.com/klauspost/compress/fse/fse.go
@@ -44,18 +44,14 @@ var (
 // Scratch provides temporary storage for compression and decompression.
 type Scratch struct {
 	// Private
-	count          [maxSymbolValue + 1]uint32
-	norm           [maxSymbolValue + 1]int16
-	symbolLen      uint16 // Length of active part of the symbol table.
-	actualTableLog uint8  // Selected tablelog.
-	br             byteReader
-	bits           bitReader
-	bw             bitWriter
-	ct             cTable      // Compression tables.
-	decTable       []decSymbol // Decompression table.
-	zeroBits       bool        // no bits has prob > 50%.
-	clearCount     bool        // clear count
-	maxCount       int         // count of the most probable symbol
+	count    [maxSymbolValue + 1]uint32
+	norm     [maxSymbolValue + 1]int16
+	br       byteReader
+	bits     bitReader
+	bw       bitWriter
+	ct       cTable      // Compression tables.
+	decTable []decSymbol // Decompression table.
+	maxCount int         // count of the most probable symbol
 	// Per block parameters.
 	// These can be used to override compression parameters of the block.
@@ -68,17 +64,22 @@ type Scratch struct {
 	// and allocation will be avoided.
 	Out []byte
-	// MaxSymbolValue will override the maximum symbol value of the next block.
-	MaxSymbolValue uint8
-	// TableLog will attempt to override the tablelog for the next block.
-	TableLog uint8
 	// DecompressLimit limits the maximum decoded size acceptable.
 	// If > 0 decompression will stop when approximately this many bytes
 	// has been decoded.
 	// If 0, maximum size will be 2GB.
 	DecompressLimit int
+	symbolLen      uint16 // Length of active part of the symbol table.
+	actualTableLog uint8  // Selected tablelog.
+	zeroBits       bool   // no bits has prob > 50%.
+	clearCount     bool   // clear count
+	// MaxSymbolValue will override the maximum symbol value of the next block.
+	MaxSymbolValue uint8
+	// TableLog will attempt to override the tablelog for the next block.
+	TableLog uint8
 // Histogram allows to populate the histogram and skip that step in the compression,
diff --git a/vendor/github.com/klauspost/compress/gzip/gzip.go b/vendor/github.com/klauspost/compress/gzip/gzip.go
index 6794cf48f4..26203851bd 100644
--- a/vendor/github.com/klauspost/compress/gzip/gzip.go
+++ b/vendor/github.com/klauspost/compress/gzip/gzip.go
@@ -37,13 +37,13 @@ type Writer struct {
 	Header      // written at first call to Write, Flush, or Close
 	w           io.Writer
 	level       int
-	wroteHeader bool
+	err         error
 	compressor  *flate.Writer
 	digest      uint32 // CRC-32, IEEE polynomial (section 8)
 	size        uint32 // Uncompressed size (section 2.3.1)
+	wroteHeader bool
 	closed      bool
 	buf         [10]byte
-	err         error
 // NewWriter returns a new Writer.
diff --git a/vendor/github.com/klauspost/compress/huff0/README.md b/vendor/github.com/klauspost/compress/huff0/README.md
index 0a8448ce9f..e12da4db2f 100644
--- a/vendor/github.com/klauspost/compress/huff0/README.md
+++ b/vendor/github.com/klauspost/compress/huff0/README.md
@@ -12,8 +12,6 @@ but it can be used as a secondary step to compressors (like Snappy) that does no
 * [Godoc documentation](https://godoc.org/github.com/klauspost/compress/huff0)
 ## News
  * Mar 2018: First implementation released. Consider this beta software for now.
@@ -75,6 +73,8 @@ which can be given to the decompressor.
 Decompressing is done by calling the [`Decompress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress1X) 
 or [`Decompress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress4X) function.
+For concurrently decompressing content with a fixed table a stateless [`Decoder`](https://godoc.org/github.com/klauspost/compress/huff0#Decoder) can be requested which will remain correct as long as the scratch is unchanged. The capacity of the provided slice indicates the expected output size.
 You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
 your input was likely corrupted. 
@@ -84,4 +84,4 @@ There are no integrity checks, so relying on errors from the decompressor does n
 # Contributing
 Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
-changes will likely not be accepted. If in doubt open an issue before writing the PR.
\ No newline at end of file
+changes will likely not be accepted. If in doubt open an issue before writing the PR.
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index 7d0903c701..a4979e8868 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -6,6 +6,7 @@
 package huff0
 import (
+	"encoding/binary"
@@ -34,29 +35,16 @@ func (b *bitReader) init(in []byte) error {
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
-	b.fill()
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
 	b.bitsRead += 8 - uint8(highBit32(uint32(v)))
 	return nil
-// getBits will return n bits. n can be 0.
-func (b *bitReader) getBits(n uint8) uint16 {
-	if n == 0 || b.bitsRead >= 64 {
-		return 0
-	}
-	return b.getBitsFast(n)
-// getBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) getBitsFast(n uint8) uint16 {
-	const regMask = 64 - 1
-	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
-	b.bitsRead += n
-	return v
 // peekBitsFast requires that at least one bit is requested every time.
 // There are no checks if the buffer is filled.
 func (b *bitReader) peekBitsFast(n uint8) uint16 {
@@ -71,21 +59,36 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
-	// Do single re-slice to avoid bounds checks.
+	// 2 bounds checks.
 	v := b.in[b.off-4 : b.off]
+	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
 	b.off -= 4
+func (b *bitReader) advance(n uint8) {
+	b.bitsRead += n
+// fillFastStart() assumes the bitreader is empty and there are at least 8 bytes to read.
+func (b *bitReader) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
 // fill() will make sure at least 32 bits are available.
 func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
+		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@@ -113,3 +116,214 @@ func (b *bitReader) close() error {
 	return nil
+// bitReaderBytes reads a bitstream in reverse.
+// The last set bit indicates the start of the stream and is used
+// for aligning the input.
+type bitReaderBytes struct {
+	in       []byte
+	off      uint // next byte to read is at in[off - 1]
+	value    uint64
+	bitsRead uint8
+// init initializes and resets the bit reader.
+func (b *bitReaderBytes) init(in []byte) error {
+	if len(in) < 1 {
+		return errors.New("corrupt stream: too short")
+	}
+	b.in = in
+	b.off = uint(len(in))
+	// The highest bit of the last byte indicates where to start
+	v := in[len(in)-1]
+	if v == 0 {
+		return errors.New("corrupt stream, did not find end of stream")
+	}
+	b.bitsRead = 64
+	b.value = 0
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
+	b.advance(8 - uint8(highBit32(uint32(v))))
+	return nil
+// peekByteFast returns the top 8 bits of the bit buffer without advancing.
+// There are no checks if the buffer is filled.
+func (b *bitReaderBytes) peekByteFast() uint8 {
+	got := uint8(b.value >> 56)
+	return got
+func (b *bitReaderBytes) advance(n uint8) {
+	b.bitsRead += n
+	b.value <<= n & 63
+// fillFast() will make sure at least 32 bits are available.
+// There must be at least 4 bytes available.
+func (b *bitReaderBytes) fillFast() {
+	if b.bitsRead < 32 {
+		return
+	}
+	// 2 bounds checks.
+	v := b.in[b.off-4 : b.off]
+	v = v[:4]
+	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	b.value |= uint64(low) << (b.bitsRead - 32)
+	b.bitsRead -= 32
+	b.off -= 4
+// fillFastStart() assumes the bitReaderBytes is empty and there are at least 8 bytes to read.
+func (b *bitReaderBytes) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+// fill() will make sure at least 32 bits are available.
+func (b *bitReaderBytes) fill() {
+	if b.bitsRead < 32 {
+		return
+	}
+	if b.off > 4 {
+		v := b.in[b.off-4:]
+		v = v[:4]
+		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		b.value |= uint64(low) << (b.bitsRead - 32)
+		b.bitsRead -= 32
+		b.off -= 4
+		return
+	}
+	for b.off > 0 {
+		b.value |= uint64(b.in[b.off-1]) << (b.bitsRead - 8)
+		b.bitsRead -= 8
+		b.off--
+	}
+// finished returns true if all bits have been read from the bit stream.
+func (b *bitReaderBytes) finished() bool {
+	return b.off == 0 && b.bitsRead >= 64
+// close closes the bitstream and returns an error if out-of-buffer reads occurred.
+func (b *bitReaderBytes) close() error {
+	// Release reference.
+	b.in = nil
+	if b.bitsRead > 64 {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+// bitReaderShifted reads a bitstream in reverse.
+// The last set bit indicates the start of the stream and is used
+// for aligning the input.
+type bitReaderShifted struct {
+	in       []byte
+	off      uint // next byte to read is at in[off - 1]
+	value    uint64
+	bitsRead uint8
+// init initializes and resets the bit reader.
+func (b *bitReaderShifted) init(in []byte) error {
+	if len(in) < 1 {
+		return errors.New("corrupt stream: too short")
+	}
+	b.in = in
+	b.off = uint(len(in))
+	// The highest bit of the last byte indicates where to start
+	v := in[len(in)-1]
+	if v == 0 {
+		return errors.New("corrupt stream, did not find end of stream")
+	}
+	b.bitsRead = 64
+	b.value = 0
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
+	b.advance(8 - uint8(highBit32(uint32(v))))
+	return nil
+// peekBitsFast requires that at least one bit is requested every time.
+// There are no checks if the buffer is filled.
+func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
+	return uint16(b.value >> ((64 - n) & 63))
+func (b *bitReaderShifted) advance(n uint8) {
+	b.bitsRead += n
+	b.value <<= n & 63
+// fillFast() will make sure at least 32 bits are available.
+// There must be at least 4 bytes available.
+func (b *bitReaderShifted) fillFast() {
+	if b.bitsRead < 32 {
+		return
+	}
+	// 2 bounds checks.
+	v := b.in[b.off-4 : b.off]
+	v = v[:4]
+	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
+	b.bitsRead -= 32
+	b.off -= 4
+// fillFastStart() assumes the bitReaderShifted is empty and there are at least 8 bytes to read.
+func (b *bitReaderShifted) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
+// fill() will make sure at least 32 bits are available.
+func (b *bitReaderShifted) fill() {
+	if b.bitsRead < 32 {
+		return
+	}
+	if b.off > 4 {
+		v := b.in[b.off-4:]
+		v = v[:4]
+		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
+		b.bitsRead -= 32
+		b.off -= 4
+		return
+	}
+	for b.off > 0 {
+		b.value |= uint64(b.in[b.off-1]) << ((b.bitsRead - 8) & 63)
+		b.bitsRead -= 8
+		b.off--
+	}
+// finished returns true if all bits have been read from the bit stream.
+func (b *bitReaderShifted) finished() bool {
+	return b.off == 0 && b.bitsRead >= 64
+// close closes the bitstream and returns an error if out-of-buffer reads occurred.
+func (b *bitReaderShifted) close() error {
+	// Release reference.
+	b.in = nil
+	if b.bitsRead > 64 {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
index bda4021efd..6bce4e87d4 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@@ -43,6 +43,11 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
 func (b *bitWriter) encSymbol(ct cTable, symbol byte) {
 	enc := ct[symbol]
 	b.bitContainer |= uint64(enc.val) << (b.nBits & 63)
+	if false {
+		if enc.nBits == 0 {
+			panic("nbits 0")
+		}
+	}
 	b.nBits += enc.nBits
@@ -54,6 +59,14 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
 	sh := b.nBits & 63
 	combined := uint64(encA.val) | (uint64(encB.val) << (encA.nBits & 63))
 	b.bitContainer |= combined << sh
+	if false {
+		if encA.nBits == 0 {
+			panic("nbitsA 0")
+		}
+		if encB.nBits == 0 {
+			panic("nbitsB 0")
+		}
+	}
 	b.nBits += encA.nBits + encB.nBits
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index 0843cb014f..f9ed5f8306 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -77,8 +77,11 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
 		// Each symbol present maximum once or too well distributed.
 		return nil, false, ErrIncompressible
-	if s.Reuse == ReusePolicyPrefer && canReuse {
+	if s.Reuse == ReusePolicyMust && !canReuse {
+		// We must reuse, but we can't.
+		return nil, false, ErrIncompressible
+	}
+	if (s.Reuse == ReusePolicyPrefer || s.Reuse == ReusePolicyMust) && canReuse {
 		keepTable := s.cTable
 		keepTL := s.actualTableLog
 		s.cTable = s.prevTable
@@ -90,6 +93,9 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
 			s.OutData = s.Out
 			return s.Out, true, nil
+		if s.Reuse == ReusePolicyMust {
+			return nil, false, ErrIncompressible
+		}
 		// Do not attempt to re-use later.
 		s.prevTable = s.prevTable[:0]
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 97ae66a4ac..41703bba4d 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -25,11 +25,14 @@ type dEntryDouble struct {
 	len   uint8
+// use8BitTables enables specialized decoders for all tables that are <= 8 bits.
+const use8BitTables = true
 // ReadTable will read a table from the input.
 // The size of the input may be larger than the table definition.
 // Any content remaining after the table definition will be returned.
 // If no Scratch is provided a new one is allocated.
-// The returned Scratch can be used for decoding input using this table.
+// The returned Scratch can be used for encoding or decoding input using this table.
 func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 	s, err = s.prepare(in)
 	if err != nil {
@@ -55,8 +58,8 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 		s.symbolLen = uint16(oSize)
 		in = in[iSize:]
 	} else {
-		if len(in) <= int(iSize) {
-			return s, nil, errors.New("input too small for table")
+		if len(in) < int(iSize) {
+			return s, nil, fmt.Errorf("input too small for table, want %d bytes, have %d", iSize, len(in))
 		// FSE compressed weights
 		s.fse.DecompressLimit = 255
@@ -83,6 +86,7 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 		v2 := v & 15
+		// (1 << (v2-1)) is slower since the compiler cannot prove that v2 isn't 0.
 		weightTotal += (1 << v2) >> 1
 	if weightTotal == 0 {
@@ -134,20 +138,40 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 	if len(s.dt.single) != tSize {
 		s.dt.single = make([]dEntrySingle, tSize)
+	cTable := s.prevTable
+	if cap(cTable) < maxSymbolValue+1 {
+		cTable = make([]cTableEntry, 0, maxSymbolValue+1)
+	}
+	cTable = cTable[:maxSymbolValue+1]
+	s.prevTable = cTable[:s.symbolLen]
+	s.prevTableLog = s.actualTableLog
 	for n, w := range s.huffWeight[:s.symbolLen] {
 		if w == 0 {
+			cTable[n] = cTableEntry{
+				val:   0,
+				nBits: 0,
+			}
 		length := (uint32(1) << w) >> 1
 		d := dEntrySingle{
 			entry: uint16(s.actualTableLog+1-w) | (uint16(n) << 8),
-		single := s.dt.single[rankStats[w] : rankStats[w]+length]
+		rank := &rankStats[w]
+		cTable[n] = cTableEntry{
+			val:   uint16(*rank >> (w - 1)),
+			nBits: uint8(d.entry),
+		}
+		single := s.dt.single[*rank : *rank+length]
 		for i := range single {
 			single[i] = d
-		rankStats[w] += length
+		*rank += length
 	return s, in, nil
@@ -155,69 +179,14 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 // The length of the supplied input must match the end of a block exactly.
 // Before this is called, the table must be initialized with ReadTable unless
 // the encoder re-used the table.
+// Deprecated: Use the stateless Decoder() to get a concurrent version.
 func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
-	if len(s.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
+	if cap(s.Out) < s.MaxDecodedSize {
+		s.Out = make([]byte, s.MaxDecodedSize)
-	var br bitReader
-	err = br.init(in)
-	if err != nil {
-		return nil, err
-	}
-	s.Out = s.Out[:0]
-	decode := func() byte {
-		val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
-		v := s.dt.single[val]
-		br.bitsRead += uint8(v.entry)
-		return uint8(v.entry >> 8)
-	}
-	hasDec := func(v dEntrySingle) byte {
-		br.bitsRead += uint8(v.entry)
-		return uint8(v.entry >> 8)
-	}
-	// Avoid bounds check by always having full sized table.
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	dt := s.dt.single[:tlSize]
-	// Use temp table to avoid bound checks/append penalty.
-	var tmp = s.huffWeight[:256]
-	var off uint8
-	for br.off >= 8 {
-		br.fillFast()
-		tmp[off+0] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
-		tmp[off+1] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
-		br.fillFast()
-		tmp[off+2] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
-		tmp[off+3] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
-		off += 4
-		if off == 0 {
-			if len(s.Out)+256 > s.MaxDecodedSize {
-				br.close()
-				return nil, ErrMaxDecodedSizeExceeded
-			}
-			s.Out = append(s.Out, tmp...)
-		}
-	}
-	if len(s.Out)+int(off) > s.MaxDecodedSize {
-		br.close()
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	s.Out = append(s.Out, tmp[:off]...)
-	for !br.finished() {
-		br.fill()
-		if len(s.Out) >= s.MaxDecodedSize {
-			br.close()
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		s.Out = append(s.Out, decode())
-	}
-	return s.Out, br.close()
+	s.Out = s.Out[:0:s.MaxDecodedSize]
+	s.Out, err = s.Decoder().Decompress1X(s.Out, in)
+	return s.Out, err
 // Decompress4X will decompress a 4X encoded stream.
@@ -225,123 +194,402 @@ func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
 // the encoder re-used the table.
 // The length of the supplied input must match the end of a block exactly.
 // The destination size of the uncompressed data must be known and provided.
+// Deprecated: Use the stateless Decoder() to get a concurrent version.
 func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
-	if len(s.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(in) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
 	if dstSize > s.MaxDecodedSize {
 		return nil, ErrMaxDecodedSizeExceeded
-	// TODO: We do not detect when we overrun a buffer, except if the last one does.
+	if cap(s.Out) < dstSize {
+		s.Out = make([]byte, s.MaxDecodedSize)
+	}
+	s.Out = s.Out[:0:dstSize]
+	s.Out, err = s.Decoder().Decompress4X(s.Out, in)
+	return s.Out, err
-	var br [4]bitReader
+// Decoder will return a stateless decoder that can be used by multiple
+// decompressors concurrently.
+// Before this is called, the table must be initialized with ReadTable.
+// The Decoder is still linked to the scratch buffer, so the scratch cannot be reused while the Decoder is in use.
+// However, it is safe to discard the scratch.
+func (s *Scratch) Decoder() *Decoder {
+	return &Decoder{
+		dt:             s.dt,
+		actualTableLog: s.actualTableLog,
+	}
+// Decoder provides stateless decoding.
+type Decoder struct {
+	dt             dTable
+	actualTableLog uint8
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress1X8Bit(dst, src)
+	}
+	var br bitReaderShifted
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
+	// Avoid bounds check by always having full sized table.
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	dt := d.dt.single[:tlSize]
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+	for br.off >= 8 {
+		br.fillFast()
+		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+		// Refill
+		br.fillFast()
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+		off += 4
+		if off == 0 {
+			if len(dst)+256 > maxDecodedSize {
+				br.close()
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			dst = append(dst, buf[:]...)
+		}
+	}
+	if len(dst)+int(off) > maxDecodedSize {
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	dst = append(dst, buf[:off]...)
+	// br.off < 8, so uint8 is fine
+	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+	for bitsLeft > 0 {
+		br.fill()
+		if false && br.bitsRead >= 32 {
+			if br.off >= 4 {
+				v := br.in[br.off-4:]
+				v = v[:4]
+				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+				br.value = (br.value << 32) | uint64(low)
+				br.bitsRead -= 32
+				br.off -= 4
+			} else {
+				for br.off > 0 {
+					br.value = (br.value << 8) | uint64(br.in[br.off-1])
+					br.bitsRead -= 8
+					br.off--
+				}
+			}
+		}
+		if len(dst) >= maxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= nBits
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	return dst, br.close()
+// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
+	if d.actualTableLog == 8 {
+		return d.decompress1X8BitExactly(dst, src)
+	}
+	var br bitReaderBytes
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
+	// Avoid bounds check by always having full sized table.
+	dt := d.dt.single[:256]
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+	shift := (8 - d.actualTableLog) & 7
+	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
+	for br.off >= 4 {
+		br.fillFast()
+		v := dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+		off += 4
+		if off == 0 {
+			if len(dst)+256 > maxDecodedSize {
+				br.close()
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			dst = append(dst, buf[:]...)
+		}
+	}
+	if len(dst)+int(off) > maxDecodedSize {
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	dst = append(dst, buf[:off]...)
+	// br.off < 4, so uint8 is fine
+	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+	for bitsLeft > 0 {
+		if br.bitsRead >= 64-8 {
+			for br.off > 0 {
+				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+				br.bitsRead -= 8
+				br.off--
+			}
+		}
+		if len(dst) >= maxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := dt[br.peekByteFast()>>shift]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= int8(nBits)
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	return dst, br.close()
+// decompress1X8BitExactly will decompress a 1X encoded stream with tablelog == 8.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
+	var br bitReaderBytes
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
+	// Avoid bounds check by always having full sized table.
+	dt := d.dt.single[:256]
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+	const shift = 0
+	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
+	for br.off >= 4 {
+		br.fillFast()
+		v := dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+		v = dt[br.peekByteFast()>>shift]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+		off += 4
+		if off == 0 {
+			if len(dst)+256 > maxDecodedSize {
+				br.close()
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			dst = append(dst, buf[:]...)
+		}
+	}
+	if len(dst)+int(off) > maxDecodedSize {
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	dst = append(dst, buf[:off]...)
+	// br.off < 4, so uint8 is fine
+	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+	for bitsLeft > 0 {
+		if br.bitsRead >= 64-8 {
+			for br.off > 0 {
+				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+				br.bitsRead -= 8
+				br.off--
+			}
+		}
+		if len(dst) >= maxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := dt[br.peekByteFast()>>shift]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= int8(nBits)
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	return dst, br.close()
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
+	var br [4]bitReaderShifted
 	start := 6
 	for i := 0; i < 3; i++ {
-		length := int(in[i*2]) | (int(in[i*2+1]) << 8)
-		if start+length >= len(in) {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
 			return nil, errors.New("truncated input (or invalid offset)")
-		err = br[i].init(in[start : start+length])
+		err := br[i].init(src[start : start+length])
 		if err != nil {
 			return nil, err
 		start += length
-	err = br[3].init(in[start:])
+	err := br[3].init(src[start:])
 	if err != nil {
 		return nil, err
-	// Prepare output
-	if cap(s.Out) < dstSize {
-		s.Out = make([]byte, 0, dstSize)
-	}
-	s.Out = s.Out[:dstSize]
 	// destination, offset to match first output
-	dstOut := s.Out
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
 	dstEvery := (dstSize + 3) / 4
 	const tlSize = 1 << tableLogMax
 	const tlMask = tlSize - 1
-	single := s.dt.single[:tlSize]
-	decode := func(br *bitReader) byte {
-		val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
-		v := single[val&tlMask]
-		br.bitsRead += uint8(v.entry)
-		return uint8(v.entry >> 8)
-	}
+	single := d.dt.single[:tlSize]
 	// Use temp table to avoid bound checks/append penalty.
-	var tmp = s.huffWeight[:256]
+	var buf [256]byte
 	var off uint8
 	var decoded int
 	// Decode 2 values from each decoder/loop.
 	const bufoff = 256 / 4
 	for {
-		for i := range br {
-			br := &br[i]
-			if br.off < 4 {
-				break bigloop
-			}
-			br.fillFast()
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
 			const stream = 0
-			val := br[stream].peekBitsFast(s.actualTableLog)
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+			val := br[stream].peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
 			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
-		}
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-		{
-			const stream = 1
-			val := br[stream].peekBitsFast(s.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
-			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v2 = single[val2&tlMask]
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
 			const stream = 2
-			val := br[stream].peekBitsFast(s.actualTableLog)
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+			val := br[stream].peekBitsFast(d.actualTableLog)
 			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
 			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
-		}
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-		{
-			const stream = 3
-			val := br[stream].peekBitsFast(s.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].bitsRead += uint8(v.entry)
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			br[stream].advance(uint8(v.entry))
+			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-			val2 := br[stream].peekBitsFast(s.actualTableLog)
-			v2 := single[val2&tlMask]
-			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
-			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
-			br[stream].bitsRead += uint8(v2.entry)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v2 = single[val2&tlMask]
+			br[stream2].advance(uint8(v2.entry))
+			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
 		off += 2
@@ -350,42 +598,67 @@ bigloop:
 			if bufoff > dstEvery {
 				return nil, errors.New("corruption detected: stream overrun 1")
-			copy(dstOut, tmp[:bufoff])
-			copy(dstOut[dstEvery:], tmp[bufoff:bufoff*2])
-			copy(dstOut[dstEvery*2:], tmp[bufoff*2:bufoff*3])
-			copy(dstOut[dstEvery*3:], tmp[bufoff*3:bufoff*4])
+			copy(out, buf[:bufoff])
+			copy(out[dstEvery:], buf[bufoff:bufoff*2])
+			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
 			off = 0
-			dstOut = dstOut[bufoff:]
+			out = out[bufoff:]
 			decoded += 256
 			// There must at least be 3 buffers left.
-			if len(dstOut) < dstEvery*3 {
+			if len(out) < dstEvery*3 {
 				return nil, errors.New("corruption detected: stream overrun 2")
 	if off > 0 {
 		ioff := int(off)
-		if len(dstOut) < dstEvery*3+ioff {
+		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
-		copy(dstOut, tmp[:off])
-		copy(dstOut[dstEvery:dstEvery+ioff], tmp[bufoff:bufoff*2])
-		copy(dstOut[dstEvery*2:dstEvery*2+ioff], tmp[bufoff*2:bufoff*3])
-		copy(dstOut[dstEvery*3:dstEvery*3+ioff], tmp[bufoff*3:bufoff*4])
+		copy(out, buf[:off])
+		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
 		decoded += int(off) * 4
-		dstOut = dstOut[off:]
+		out = out[off:]
 	// Decode remaining.
 	for i := range br {
 		offset := dstEvery * i
 		br := &br[i]
-		for !br.finished() {
+		bitsLeft := br.off*8 + uint(64-br.bitsRead)
+		for bitsLeft > 0 {
-			if offset >= len(dstOut) {
+			if false && br.bitsRead >= 32 {
+				if br.off >= 4 {
+					v := br.in[br.off-4:]
+					v = v[:4]
+					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+					br.value = (br.value << 32) | uint64(low)
+					br.bitsRead -= 32
+					br.off -= 4
+				} else {
+					for br.off > 0 {
+						br.value = (br.value << 8) | uint64(br.in[br.off-1])
+						br.bitsRead -= 8
+						br.off--
+					}
+				}
+			}
+			// end inline...
+			if offset >= len(out) {
 				return nil, errors.New("corruption detected: stream overrun 4")
-			dstOut[offset] = decode(br)
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
 		decoded += offset - dstEvery*i
@@ -397,7 +670,426 @@ bigloop:
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
-	return s.Out, nil
+	return dst, nil
+// decompress4X8bit will decompress a 4X encoded stream using single-byte table lookups.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
+	if d.actualTableLog == 8 {
+		return d.decompress4X8bitExactly(dst, src)
+	}
+	var br [4]bitReaderBytes
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+	shift := (8 - d.actualTableLog) & 7
+	const tlSize = 1 << 8
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+	var decoded int
+	// Decode 4 values from each decoder/loop.
+	const bufoff = 256 / 4
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+		{
+			// Interleave 2 decodes.
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+		}
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+		}
+		off += 4
+		if off == bufoff {
+			if bufoff > dstEvery {
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[:bufoff])
+			copy(out[dstEvery:], buf[bufoff:bufoff*2])
+			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
+			off = 0
+			out = out[bufoff:]
+			decoded += 256
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[:off])
+		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+	// Decode remaining.
+	for i := range br {
+		offset := dstEvery * i
+		br := &br[i]
+		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		for bitsLeft > 0 {
+			if br.finished() {
+				return nil, io.ErrUnexpectedEOF
+			}
+			if br.bitsRead >= 56 {
+				if br.off >= 4 {
+					v := br.in[br.off-4:]
+					v = v[:4]
+					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+					br.value |= uint64(low) << (br.bitsRead - 32)
+					br.bitsRead -= 32
+					br.off -= 4
+				} else {
+					for br.off > 0 {
+						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+						br.bitsRead -= 8
+						br.off--
+					}
+				}
+			}
+			// end inline...
+			if offset >= len(out) {
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+			// Read value and increment offset.
+			v := single[br.peekByteFast()>>shift].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= int(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+// decompress4X8bitExactly will decompress a 4X encoded stream whose table log is exactly 8.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
+	var br [4]bitReaderBytes
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+	const shift = 0
+	const tlSize = 1 << 8
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+	// Use temp table to avoid bound checks/append penalty.
+	var buf [256]byte
+	var off uint8
+	var decoded int
+	// Decode 4 values from each decoder/loop.
+	const bufoff = 256 / 4
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+		{
+			// Interleave 2 decodes.
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+		}
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+			v := single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 := single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+1] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+2] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+			v = single[br[stream].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream+3] = uint8(v >> 8)
+			br[stream].advance(uint8(v))
+			v2 = single[br[stream2].peekByteFast()>>shift].entry
+			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
+			br[stream2].advance(uint8(v2))
+		}
+		off += 4
+		if off == bufoff {
+			if bufoff > dstEvery {
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[:bufoff])
+			copy(out[dstEvery:], buf[bufoff:bufoff*2])
+			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
+			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
+			off = 0
+			out = out[bufoff:]
+			decoded += 256
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[:off])
+		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
+		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
+		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+	// Decode remaining.
+	for i := range br {
+		offset := dstEvery * i
+		br := &br[i]
+		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		for bitsLeft > 0 {
+			if br.finished() {
+				return nil, io.ErrUnexpectedEOF
+			}
+			if br.bitsRead >= 56 {
+				if br.off >= 4 {
+					v := br.in[br.off-4:]
+					v = v[:4]
+					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+					br.value |= uint64(low) << (br.bitsRead - 32)
+					br.bitsRead -= 32
+					br.off -= 4
+				} else {
+					for br.off > 0 {
+						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
+						br.bitsRead -= 8
+						br.off--
+					}
+				}
+			}
+			// end inline...
+			if offset >= len(out) {
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+			// Read value and increment offset.
+			v := single[br.peekByteFast()>>shift].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= int(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
 // matches will compare a decoding table to a coding table.
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
index 53249df056..5dd66854b0 100644
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -55,6 +55,9 @@ const (
 	// ReusePolicyNone will disable re-use of tables.
 	// This is slightly faster than ReusePolicyAllow but may produce larger output.
+	// ReusePolicyMust must allow reuse and produce smaller output.
+	ReusePolicyMust
 type Scratch struct {
@@ -79,6 +82,13 @@ type Scratch struct {
 	// Slice of the returned data.
 	OutData []byte
+	// MaxDecodedSize will set the maximum allowed output size.
+	// This value will automatically be set to BlockSizeMax if not set.
+	// Decoders will return ErrMaxDecodedSizeExceeded if this limit is exceeded.
+	MaxDecodedSize int
+	br byteReader
 	// MaxSymbolValue will override the maximum symbol value of the next block.
 	MaxSymbolValue uint8
@@ -95,12 +105,6 @@ type Scratch struct {
 	// If WantLogLess == 0 any improvement will do.
 	WantLogLess uint8
-	// MaxDecodedSize will set the maximum allowed output size.
-	// This value will automatically be set to BlockSizeMax if not set.
-	// Decoders will return ErrMaxDecodedSizeExceeded is this limit is exceeded.
-	MaxDecodedSize int
-	br             byteReader
 	symbolLen      uint16 // Length of active part of the symbol table.
 	maxCount       int    // count of the most probable symbol
 	clearCount     bool   // clear count
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index bc977a3023..ac3640dc90 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -5,11 +5,9 @@ It offers a very wide range of compression / speed trade-off, while being backed
 A high performance compression algorithm is implemented. For now focused on speed. 
 This package provides [compression](#Compressor) to and [decompression](#Decompressor) of Zstandard content. 
-Note that custom dictionaries are not supported yet, so if your code relies on that, 
-you cannot use the package as-is.
+Note that custom dictionaries are only supported for decompression.
 This package is pure Go and without use of "unsafe". 
-If a significant speedup can be achieved using "unsafe", it may be added as an option later.
 The `zstd` package is provided as open source software using a Go standard license.
@@ -142,80 +140,96 @@ Using the Encoder for both a stream and individual blocks concurrently is safe.
 I have collected some speed examples to compare speed and compression against other compressors.
 * `file` is the input file.
-* `out` is the compressor used. `zskp` is this package. `gzstd` is gzip standard library. `zstd` is the Datadog cgo library.
+* `out` is the compressor used. `zskp` is this package. `zstd` is the Datadog cgo library. `gzstd` and `gzkp` are gzip from the standard library and from this library, respectively.
 * `level` is the compression level used. For `zskp` level 1 is "fastest", level 2 is "default".
 * `insize`/`outsize` is the input/output size.
 * `millis` is the number of milliseconds used for compression.
 * `mb/s` is megabytes (2^20 bytes) per second.
-The test data for the Large Text Compression Benchmark is the first
-10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
+Silesia Corpus:
-file    out     level   insize  outsize     millis  mb/s
-enwik9  zskp    1   1000000000  343833033   5840    163.30
-enwik9  zskp    2   1000000000  317822183   8449    112.87
-enwik9  gzstd   1   1000000000  382578136   13627   69.98
-enwik9  gzstd   3   1000000000  349139651   22344   42.68
-enwik9  zstd    1   1000000000  357416379   4838    197.12
-enwik9  zstd    3   1000000000  313734522   7556    126.21
+This package:
+file    out     level   insize      outsize     millis  mb/s
+silesia.tar zskp    1   211947520   73101992    643     313.87
+silesia.tar zskp    2   211947520   67504318    969     208.38
+silesia.tar zskp    3   211947520   65177448    1899    106.44
+cgo zstd:
+silesia.tar zstd    1   211947520   73605392    543     371.56
+silesia.tar zstd    3   211947520   66793289    864     233.68
+silesia.tar zstd    6   211947520   62916450    1913    105.66
+gzip, stdlib/this package:
+silesia.tar gzstd   1   211947520   80007735    1654    122.21
+silesia.tar gzkp    1   211947520   80369488    1168    173.06
 GOB stream of binary data. Highly compressible.
-file        out level   insize      outsize     millis  mb/s
-gob-stream  zskp    1   1911399616  234981983   5100    357.42
-gob-stream  zskp    2   1911399616  208674003   6698    272.15
-gob-stream  gzstd   1   1911399616  357382641   14727   123.78
-gob-stream  gzstd   3   1911399616  327835097   17005   107.19
-gob-stream  zstd    1   1911399616  250787165   4075    447.22
-gob-stream  zstd    3   1911399616  208191888   5511    330.77
+file        out     level   insize  outsize     millis  mb/s
+gob-stream  zskp    1   1911399616  235022249   3088    590.30
+gob-stream  zskp    2   1911399616  205669791   3786    481.34
+gob-stream  zskp    3   1911399616  185792019   9324    195.48
+gob-stream  zstd    1   1911399616  249810424   2637    691.26
+gob-stream  zstd    3   1911399616  208192146   3490    522.31
+gob-stream  zstd    6   1911399616  193632038   6687    272.56
+gob-stream  gzstd   1   1911399616  357382641   10251   177.82
+gob-stream  gzkp    1   1911399616  362156523   5695    320.08
-Highly compressible JSON file. Similar to logs in a lot of ways.
+The test data for the Large Text Compression Benchmark is the first
+10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
-file            out level   insize      outsize     millis  mb/s
-adresser.001    zskp    1   1073741824  18510122    1477    692.83
-adresser.001    zskp    2   1073741824  19831697    1705    600.59
-adresser.001    gzstd   1   1073741824  47755503    3079    332.47
-adresser.001    gzstd   3   1073741824  40052381    3051    335.63
-adresser.001    zstd    1   1073741824  16135896    994     1030.18
-adresser.001    zstd    3   1073741824  17794465    905     1131.49
+file    out level   insize      outsize     millis  mb/s
+enwik9  zskp    1   1000000000  343848582   3609    264.18
+enwik9  zskp    2   1000000000  317276632   5746    165.97
+enwik9  zskp    3   1000000000  294540704   11725   81.34
+enwik9  zstd    1   1000000000  358072021   3110    306.65
+enwik9  zstd    3   1000000000  313734672   4784    199.35
+enwik9  zstd    6   1000000000  295138875   10290   92.68
+enwik9  gzstd   1   1000000000  382578136   9604    99.30
+enwik9  gzkp    1   1000000000  383825945   6544    145.73
+Highly compressible JSON file.
+file                        out level   insize      outsize     millis  mb/s
+github-june-2days-2019.json zskp    1   6273951764  699045015   10620   563.40
+github-june-2days-2019.json zskp    2   6273951764  617881763   11687   511.96
+github-june-2days-2019.json zskp    3   6273951764  537511906   29252   204.54
+github-june-2days-2019.json zstd    1   6273951764  766284037   8450    708.00
+github-june-2days-2019.json zstd    3   6273951764  661889476   10927   547.57
+github-june-2days-2019.json zstd    6   6273951764  642756859   22996   260.18
+github-june-2days-2019.json gzstd   1   6273951764  1164400847  29948   199.79
+github-june-2days-2019.json gzkp    1   6273951764  1128755542  19236   311.03
 VM Image, Linux mint with a few installed applications:
-file    out level   insize  outsize millis  mb/s
-rawstudio-mint14.tar    zskp    1   8558382592  3648168838  33398   244.38
-rawstudio-mint14.tar    zskp    2   8558382592  3376721436  50962   160.16
-rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  84712   96.35
-rawstudio-mint14.tar    gzstd   3   8558382592  3740711978  176344  46.28
-rawstudio-mint14.tar    zstd    1   8558382592  3607859742  27903   292.51
-rawstudio-mint14.tar    zstd    3   8558382592  3341710879  46700   174.77
+file                    out level   insize      outsize     millis  mb/s
+rawstudio-mint14.tar    zskp    1   8558382592  3667489370  20210   403.84
+rawstudio-mint14.tar    zskp    2   8558382592  3364592300  31873   256.07
+rawstudio-mint14.tar    zskp    3   8558382592  3224594213  71751   113.75
+rawstudio-mint14.tar    zstd    1   8558382592  3609250104  17136   476.27
+rawstudio-mint14.tar    zstd    3   8558382592  3341679997  29262   278.92
+rawstudio-mint14.tar    zstd    6   8558382592  3235846406  77904   104.77
+rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  57722   141.40
+rawstudio-mint14.tar    gzkp    1   8558382592  3970463184  41749   195.49
+CSV data:
-The test data is designed to test archivers in realistic backup scenarios.
-file    out level   insize  outsize millis  mb/s
-10gb.tar    zskp    1   10065157632 4883149814  45715   209.97
-10gb.tar    zskp    2   10065157632 4638110010  60970   157.44
-10gb.tar    gzstd   1   10065157632 5198296126  97769   98.18
-10gb.tar    gzstd   3   10065157632 4932665487  313427  30.63
-10gb.tar    zstd    1   10065157632 4940796535  40391   237.65
-10gb.tar    zstd    3   10065157632 4638618579  52911   181.42
-Silesia Corpus:
-file    out level   insize  outsize millis  mb/s
-silesia.tar zskp    1   211947520   73025800    1108    182.26
-silesia.tar zskp    2   211947520   67674684    1599    126.41
-silesia.tar gzstd   1   211947520   80007735    2515    80.37
-silesia.tar gzstd   3   211947520   73133380    4259    47.45
-silesia.tar zstd    1   211947520   73513991    933     216.64
-silesia.tar zstd    3   211947520   66793301    1377    146.79
+file                    out level   insize      outsize     millis  mb/s
+nyc-taxi-data-10M.csv   zskp    1   3325605752  641339945   8925    355.35
+nyc-taxi-data-10M.csv   zskp    2   3325605752  591748091   11268   281.44
+nyc-taxi-data-10M.csv   zskp    3   3325605752  538490114   19880   159.53
+nyc-taxi-data-10M.csv   zstd    1   3325605752  687399637   8233    385.18
+nyc-taxi-data-10M.csv   zstd    3   3325605752  598514411   10065   315.07
+nyc-taxi-data-10M.csv   zstd    6   3325605752  570522953   20038   158.27
+nyc-taxi-data-10M.csv   gzstd   1   3325605752  928656485   23876   132.83
+nyc-taxi-data-10M.csv   gzkp    1   3325605752  924718719   16388   193.53
 ### Converters
@@ -309,6 +323,20 @@ The decoder can be used for *concurrent* decompression of multiple buffers.
 It will only allow a certain number of concurrent operations to run. 
 To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.   
+### Dictionaries
+Data compressed with [dictionaries](https://github.com/facebook/zstd#the-case-for-small-data-compression) can be decompressed.
+Dictionaries are added individually to Decoders.
+Dictionaries are generated by the `zstd --train` command and contain an initial state for the decoder.
+To add a dictionary use the `WithDecoderDicts(dicts ...[]byte)` option with the dictionary data.
+Several dictionaries can be added at once.
+The dictionary will be used automatically for the data that specifies them.
+A re-used Decoder will still contain the dictionaries registered.
+When registering multiple dictionaries with the same ID, the last one will be used.
 ### Allocation-less operation
 The decoder has been designed to operate without allocations after a warmup. 
@@ -350,36 +378,42 @@ These are some examples of performance compared to [datadog cgo library](https:/
 The first two are streaming decodes and the last are smaller inputs. 
-BenchmarkDecoderSilesia-8             20       642550210 ns/op   329.85 MB/s      3101 B/op        8 allocs/op
-BenchmarkDecoderSilesiaCgo-8         100       384930000 ns/op   550.61 MB/s    451878 B/op     9713 allocs/op
+BenchmarkDecoderSilesia-8                          3     385000067 ns/op     550.51 MB/s        5498 B/op          8 allocs/op
+BenchmarkDecoderSilesiaCgo-8                       6     197666567 ns/op    1072.25 MB/s      270672 B/op          8 allocs/op
-BenchmarkDecoderEnwik9-2              10        3146000080 ns/op         317.86 MB/s        2649 B/op          9 allocs/op
-BenchmarkDecoderEnwik9Cgo-2           20        1905900000 ns/op         524.69 MB/s     1125120 B/op      45785 allocs/op
+BenchmarkDecoderEnwik9-8                           1    2027001600 ns/op     493.34 MB/s       10496 B/op         18 allocs/op
+BenchmarkDecoderEnwik9Cgo-8                        2     979499200 ns/op    1020.93 MB/s      270672 B/op          8 allocs/op
-BenchmarkDecoder_DecodeAll/z000000.zst-8               200     7049994 ns/op   138.26 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000001.zst-8            100000       19560 ns/op    97.49 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000002.zst-8              5000      297599 ns/op   236.99 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000003.zst-8              2000      725502 ns/op   141.17 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000004.zst-8            200000        9314 ns/op    54.54 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000005.zst-8             10000      137500 ns/op   104.72 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000006.zst-8               500     2316009 ns/op   206.06 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000007.zst-8             20000       64499 ns/op   344.90 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000008.zst-8             50000       24900 ns/op   219.56 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAll/z000009.zst-8              1000     2348999 ns/op   154.01 MB/s        40 B/op        2 allocs/op
+Concurrent performance:
-BenchmarkDecoder_DecodeAllCgo/z000000.zst-8            500     4268005 ns/op   228.38 MB/s   1228849 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000001.zst-8         100000       15250 ns/op   125.05 MB/s      2096 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000002.zst-8          10000      147399 ns/op   478.49 MB/s     73776 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000003.zst-8           5000      320798 ns/op   319.27 MB/s    139312 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000004.zst-8         200000       10004 ns/op    50.77 MB/s       560 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000005.zst-8          20000       73599 ns/op   195.64 MB/s     19120 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000006.zst-8           1000     1119003 ns/op   426.48 MB/s    557104 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000007.zst-8          20000      103450 ns/op   215.04 MB/s     71296 B/op        9 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000008.zst-8         100000       20130 ns/op   271.58 MB/s      6192 B/op        3 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000009.zst-8           2000     1123500 ns/op   322.00 MB/s    368688 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16                28915         42469 ns/op    4340.07 MB/s         114 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16           116505          9965 ns/op    11900.16 MB/s         16 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16              8952        134272 ns/op    3588.70 MB/s         915 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16               11820        102538 ns/op    4161.90 MB/s         594 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16             34782         34184 ns/op    3661.88 MB/s          60 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16              27712         43447 ns/op    3500.58 MB/s          99 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16                 62826         18750 ns/op    21845.10 MB/s        104 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16          631545          1794 ns/op    57078.74 MB/s          2 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16         1690140           712 ns/op    172938.13 MB/s         1 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16                 10432        113593 ns/op    6180.73 MB/s        1143 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html.zst-16                    113206         10671 ns/op    9596.27 MB/s          15 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16          1530615           779 ns/op    5229.49 MB/s           0 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16             65217         16192 ns/op    11383.34 MB/s         46 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16        292671          4039 ns/op    29363.19 MB/s          6 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16          26314         46021 ns/op    10470.43 MB/s        293 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16            33897         34900 ns/op    12227.96 MB/s        205 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16         104348         11433 ns/op    10949.01 MB/s         20 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16           75949         15510 ns/op    9805.60 MB/s          32 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16             173910          6756 ns/op    60624.29 MB/s         37 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16       923076          1339 ns/op    76474.87 MB/s          1 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16       922920          1351 ns/op    91102.57 MB/s          2 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16              27649         43618 ns/op    16096.19 MB/s        407 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16                 279073          4160 ns/op    24614.18 MB/s          6 B/op          0 allocs/op
+BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16        749938          1579 ns/op    2581.71 MB/s           0 B/op          0 allocs/op
-This reflects the performance around May 2019, but this may be out of date.
+This reflects the performance around May 2020, but this may be out of date.
 # Contributions
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
index 15d79d439f..8544585371 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -5,6 +5,7 @@
 package zstd
 import (
+	"encoding/binary"
@@ -34,8 +35,12 @@ func (b *bitReader) init(in []byte) error {
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
-	b.fill()
+	if len(in) >= 8 {
+		b.fillFastStart()
+	} else {
+		b.fill()
+		b.fill()
+	}
 	b.bitsRead += 8 - uint8(highBits(uint32(v)))
 	return nil
@@ -63,21 +68,31 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
-	// Do single re-slice to avoid bounds checks.
-	v := b.in[b.off-4 : b.off]
+	// 2 bounds checks.
+	v := b.in[b.off-4:]
+	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
 	b.off -= 4
+// fillFastStart() assumes the bitreader is empty and there are at least 8 bytes to read.
+func (b *bitReader) fillFastStart() {
+	// Do single re-slice to avoid bounds checks.
+	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.bitsRead = 0
+	b.off -= 8
 // fill() will make sure at least 32 bits are available.
 func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 	if b.off >= 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
+		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index ed670bcc7a..c8ec6e3312 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -75,21 +75,29 @@ type blockDec struct {
 	// Window size of the block.
 	WindowSize uint64
-	Type       blockType
-	RLESize    uint32
+	history     chan *history
+	input       chan struct{}
+	result      chan decodeOutput
+	sequenceBuf []seq
+	err         error
+	decWG       sync.WaitGroup
+	// Frame to use for singlethreaded decoding.
+	// Should not be used by the decoder itself since parent may be another frame.
+	localFrame *frameDec
+	// Block is RLE, this is the size.
+	RLESize uint32
+	tmp     [4]byte
+	Type blockType
 	// Is this the last block of a frame?
 	Last bool
 	// Use less memory
-	lowMem      bool
-	history     chan *history
-	input       chan struct{}
-	result      chan decodeOutput
-	sequenceBuf []seq
-	tmp         [4]byte
-	err         error
-	decWG       sync.WaitGroup
+	lowMem bool
 func (b *blockDec) String() string {
@@ -127,25 +135,37 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	b.Type = blockType((bh >> 1) & 3)
 	// find size.
 	cSize := int(bh >> 3)
+	maxSize := maxBlockSize
 	switch b.Type {
 	case blockTypeReserved:
 		return ErrReservedBlockType
 	case blockTypeRLE:
 		b.RLESize = uint32(cSize)
+		if b.lowMem {
+			maxSize = cSize
+		}
 		cSize = 1
 	case blockTypeCompressed:
 		if debug {
 			println("Data size on stream:", cSize)
 		b.RLESize = 0
+		maxSize = maxCompressedBlockSize
+		if windowSize < maxCompressedBlockSize && b.lowMem {
+			maxSize = int(windowSize)
+		}
 		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
 			if debug {
 				printf("compressed block too big: csize:%d block: %+v\n", uint64(cSize), b)
 			return ErrCompressedSizeTooBig
-	default:
+	case blockTypeRaw:
 		b.RLESize = 0
+		// We do not need a destination for raw blocks.
+		maxSize = -1
+	default:
+		panic("Invalid block type")
 	// Read block data.
@@ -156,8 +176,8 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 			b.dataStorage = make([]byte, 0, maxBlockSize)
-	if cap(b.dst) <= maxBlockSize {
-		b.dst = make([]byte, 0, maxBlockSize+1)
+	if cap(b.dst) <= maxSize {
+		b.dst = make([]byte, 0, maxSize+1)
 	var err error
 	b.data, err = br.readBig(cSize, b.dataStorage)
@@ -445,26 +465,22 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		if huff == nil {
 			huff = &huff0.Scratch{}
-		huff.Out = b.literalBuf[:0]
 		huff, literals, err = huff0.ReadTable(literals, huff)
 		if err != nil {
 			println("reading huffman table:", err)
 			return err
 		// Use our out buffer.
-		huff.Out = b.literalBuf[:0]
-		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
-			literals, err = huff.Decompress4X(literals, litRegenSize)
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
 		} else {
-			literals, err = huff.Decompress1X(literals)
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
 		if err != nil {
 			println("decoding compressed literals:", err)
 			return err
 		// Make sure we don't leak our literals buffer
-		huff.Out = nil
 		if len(literals) != litRegenSize {
 			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
@@ -615,15 +631,12 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		var err error
 		// Use our out buffer.
 		huff = hist.huffTree
-		huff.Out = b.literalBuf[:0]
-		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
-			literals, err = huff.Decompress4X(literals, litRegenSize)
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
 		} else {
-			literals, err = huff.Decompress1X(literals)
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
 		// Make sure we don't leak our literals buffer
-		huff.Out = nil
 		if err != nil {
 			println("decompressing literals:", err)
 			return err
@@ -633,12 +646,13 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	} else {
 		if hist.huffTree != nil && huff != nil {
-			huffDecoderPool.Put(hist.huffTree)
+			if hist.dict == nil || hist.dict.litDec != hist.huffTree {
+				huffDecoderPool.Put(hist.huffTree)
+			}
 			hist.huffTree = nil
 	if huff != nil {
-		huff.Out = nil
 		hist.huffTree = huff
 	if debug {
@@ -671,12 +685,21 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	//   If only recent offsets were not transferred, this would be an obvious win.
 	// 	 Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded.
+	hbytes := hist.b
+	if len(hbytes) > hist.windowSize {
+		hbytes = hbytes[len(hbytes)-hist.windowSize:]
+		// We do not need history any more.
+		if hist.dict != nil {
+			hist.dict.content = nil
+		}
+	}
 	if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
 		println("initializing sequences:", err)
 		return err
-	err = seqs.decode(nSeqs, br, hist.b)
+	err = seqs.decode(nSeqs, br, hbytes)
 	if err != nil {
 		return err
diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go
index 4f0eba22f0..be718afd43 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go
@@ -295,7 +295,7 @@ func (b *blockEnc) encodeRaw(a []byte) {
 	b.output = bh.appendTo(b.output[:0])
 	b.output = append(b.output, a...)
 	if debug {
-		println("Adding RAW block, length", len(a))
+		println("Adding RAW block, length", len(a), "last:", b.last)
@@ -308,7 +308,7 @@ func (b *blockEnc) encodeRawTo(dst, src []byte) []byte {
 	dst = bh.appendTo(dst)
 	dst = append(dst, src...)
 	if debug {
-		println("Adding RAW block, length", len(src))
+		println("Adding RAW block, length", len(src), "last:", b.last)
 	return dst
@@ -322,7 +322,7 @@ func (b *blockEnc) encodeLits(raw bool) error {
 	// Don't compress extremely small blocks
 	if len(b.literals) < 32 || raw {
 		if debug {
-			println("Adding RAW block, length", len(b.literals))
+			println("Adding RAW block, length", len(b.literals), "last:", b.last)
 		b.output = bh.appendTo(b.output)
@@ -349,7 +349,7 @@ func (b *blockEnc) encodeLits(raw bool) error {
 	switch err {
 	case huff0.ErrIncompressible:
 		if debug {
-			println("Adding RAW block, length", len(b.literals))
+			println("Adding RAW block, length", len(b.literals), "last:", b.last)
 		b.output = bh.appendTo(b.output)
@@ -444,9 +444,9 @@ func fuzzFseEncoder(data []byte) int {
 // encode will encode the block and append the output in b.output.
-func (b *blockEnc) encode(raw bool) error {
+func (b *blockEnc) encode(raw, rawAllLits bool) error {
 	if len(b.sequences) == 0 {
-		return b.encodeLits(raw)
+		return b.encodeLits(rawAllLits)
 	// We want some difference
 	if len(b.literals) > (b.size - (b.size >> 5)) {
diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go
index dc4378b640..2c4fca17fa 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytereader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go
@@ -31,7 +31,8 @@ func (b *byteReader) overread() bool {
 // Int32 returns a little endian int32 starting at current offset.
 func (b byteReader) Int32() int32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
 	v3 := int32(b2[3])
 	v2 := int32(b2[2])
 	v1 := int32(b2[1])
@@ -55,7 +56,20 @@ func (b byteReader) Uint32() uint32 {
 		return v
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
+	v3 := uint32(b2[3])
+	v2 := uint32(b2[2])
+	v1 := uint32(b2[1])
+	v0 := uint32(b2[0])
+	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
+// Uint32NC returns a little endian uint32 starting at current offset.
+// The caller must ensure that there are at least 4 bytes left.
+func (b byteReader) Uint32NC() uint32 {
+	b2 := b.b[b.off:]
+	b2 = b2[:4]
 	v3 := uint32(b2[3])
 	v2 := uint32(b2[2])
 	v1 := uint32(b2[1])
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index 73ac3c630e..66b51bf2d3 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -23,17 +23,15 @@ type Decoder struct {
 	// Unreferenced decoders, ready for use.
 	decoders chan *blockDec
-	// Unreferenced decoders, ready for use.
-	frames chan *frameDec
 	// Streams ready to be decoded.
 	stream chan decodeStream
 	// Current read position used for Reader functionality.
 	current decoderState
-	// Custom dictionaries
-	dicts map[uint32]struct{}
+	// Custom dictionaries.
+	// Always uses copies.
+	dicts map[uint32]dict
 	// streamWg is the waitgroup for all streams
 	streamWg sync.WaitGroup
@@ -66,7 +64,7 @@ var (
 // A Decoder can be used in two modes:
 // 1) As a stream, or
-// 2) For stateless decoding using DecodeAll or DecodeBuffer.
+// 2) For stateless decoding using DecodeAll.
 // Only a single stream can be decoded concurrently, but the same decoder
 // can run multiple concurrent stateless decodes. It is even possible to
@@ -87,12 +85,19 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 	d.current.output = make(chan decodeOutput, d.o.concurrent)
 	d.current.flushed = true
+	// Transfer option dicts.
+	d.dicts = make(map[uint32]dict, len(d.o.dicts))
+	for _, dc := range d.o.dicts {
+		d.dicts[dc.id] = dc
+	}
+	d.o.dicts = nil
 	// Create decoders
 	d.decoders = make(chan *blockDec, d.o.concurrent)
-	d.frames = make(chan *frameDec, d.o.concurrent)
 	for i := 0; i < d.o.concurrent; i++ {
-		d.frames <- newFrameDec(d.o)
-		d.decoders <- newBlockDec(d.o.lowMem)
+		dec := newBlockDec(d.o.lowMem)
+		dec.localFrame = newFrameDec(d.o)
+		d.decoders <- dec
 	if r == nil {
@@ -169,7 +174,12 @@ func (d *Decoder) Reset(r io.Reader) error {
 			println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
 		b := bb.Bytes()
-		dst, err := d.DecodeAll(b, nil)
+		var dst []byte
+		if cap(d.current.b) > 0 {
+			dst = d.current.b
+		}
+		dst, err := d.DecodeAll(b, dst[:0])
 		if err == nil {
 			err = io.EOF
@@ -277,23 +287,31 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 	// Grab a block decoder and frame decoder.
-	block, frame := <-d.decoders, <-d.frames
+	block := <-d.decoders
+	frame := block.localFrame
 	defer func() {
 		if debug {
 			printf("re-adding decoder: %p", block)
-		d.decoders <- block
 		frame.rawInput = nil
 		frame.bBuf = nil
-		d.frames <- frame
+		d.decoders <- block
 	frame.bBuf = input
 	for {
+		frame.history.reset()
 		err := frame.reset(&frame.bBuf)
 		if err == io.EOF {
 			return dst, nil
+		if frame.DictionaryID != nil {
+			dict, ok := d.dicts[*frame.DictionaryID]
+			if !ok {
+				return nil, ErrUnknownDictionary
+			}
+			frame.history.setDict(&dict)
+		}
 		if err != nil {
 			return dst, err
@@ -456,10 +474,19 @@ func (d *Decoder) startStreamDecoder(inStream chan decodeStream) {
 		br := readerWrapper{r: stream.r}
 		for {
+			frame.history.reset()
 			err := frame.reset(&br)
 			if debug && err != nil {
 				println("Frame decoder returned", err)
+			if err == nil && frame.DictionaryID != nil {
+				dict, ok := d.dicts[*frame.DictionaryID]
+				if !ok {
+					err = ErrUnknownDictionary
+				} else {
+					frame.history.setDict(&dict)
+				}
+			}
 			if err != nil {
 				stream.output <- decodeOutput{
 					err: err,
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index 2ac9cd2dd3..284d384492 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -18,6 +18,7 @@ type decoderOptions struct {
 	lowMem         bool
 	concurrent     int
 	maxDecodedSize uint64
+	dicts          []dict
 func (o *decoderOptions) setDefault() {
@@ -66,3 +67,18 @@ func WithDecoderMaxMemory(n uint64) DOption {
 		return nil
+// WithDecoderDicts registers one or more dictionaries for the decoder.
+// If several dictionaries with the same ID are provided, the last one will be used.
+func WithDecoderDicts(dicts ...[]byte) DOption {
+	return func(o *decoderOptions) error {
+		for _, b := range dicts {
+			d, err := loadDict(b)
+			if err != nil {
+				return err
+			}
+			o.dicts = append(o.dicts, *d)
+		}
+		return nil
+	}
diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go
new file mode 100644
index 0000000000..8eb6f6ba33
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/dict.go
@@ -0,0 +1,104 @@
+package zstd
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"github.com/klauspost/compress/huff0"
+type dict struct {
+	id uint32
+	litDec              *huff0.Scratch
+	llDec, ofDec, mlDec sequenceDec
+	offsets             [3]int
+	content             []byte
+var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
+// loadDict loads a dictionary as described in
+// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+func loadDict(b []byte) (*dict, error) {
+	// Check static field size.
+	if len(b) <= 8+(3*4) {
+		return nil, io.ErrUnexpectedEOF
+	}
+	d := dict{
+		llDec: sequenceDec{fse: &fseDecoder{}},
+		ofDec: sequenceDec{fse: &fseDecoder{}},
+		mlDec: sequenceDec{fse: &fseDecoder{}},
+	}
+	if !bytes.Equal(b[:4], dictMagic[:]) {
+		return nil, ErrMagicMismatch
+	}
+	d.id = binary.LittleEndian.Uint32(b[4:8])
+	if d.id == 0 {
+		return nil, errors.New("dictionaries cannot have ID 0")
+	}
+	// Read literal table
+	var err error
+	d.litDec, b, err = huff0.ReadTable(b[8:], nil)
+	if err != nil {
+		return nil, err
+	}
+	br := byteReader{
+		b:   b,
+		off: 0,
+	}
+	readDec := func(i tableIndex, dec *fseDecoder) error {
+		if err := dec.readNCount(&br, uint16(maxTableSymbol[i])); err != nil {
+			return err
+		}
+		if br.overread() {
+			return io.ErrUnexpectedEOF
+		}
+		err = dec.transform(symbolTableX[i])
+		if err != nil {
+			println("Transform table error:", err)
+			return err
+		}
+		if debug {
+			println("Read table ok", "symbolLen:", dec.symbolLen)
+		}
+		// Set decoders as predefined so they aren't reused.
+		dec.preDefined = true
+		return nil
+	}
+	if err := readDec(tableOffsets, d.ofDec.fse); err != nil {
+		return nil, err
+	}
+	if err := readDec(tableMatchLengths, d.mlDec.fse); err != nil {
+		return nil, err
+	}
+	if err := readDec(tableLiteralLengths, d.llDec.fse); err != nil {
+		return nil, err
+	}
+	if br.remain() < 12 {
+		return nil, io.ErrUnexpectedEOF
+	}
+	d.offsets[0] = int(br.Uint32())
+	br.advance(4)
+	d.offsets[1] = int(br.Uint32())
+	br.advance(4)
+	d.offsets[2] = int(br.Uint32())
+	br.advance(4)
+	if d.offsets[0] <= 0 || d.offsets[1] <= 0 || d.offsets[2] <= 0 {
+		return nil, errors.New("invalid offset in dictionary")
+	}
+	d.content = make([]byte, br.remain())
+	copy(d.content, br.unread())
+	if d.offsets[0] > len(d.content) || d.offsets[1] > len(d.content) || d.offsets[2] > len(d.content) {
+		return nil, fmt.Errorf("initial offset bigger than dictionary content size %d, offsets: %v", len(d.content), d.offsets)
+	}
+	return &d, nil
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
new file mode 100644
index 0000000000..c120d90548
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@@ -0,0 +1,518 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+package zstd
+import "fmt"
+const (
+	betterLongTableBits = 19                       // Bits used in the long match table
+	betterLongTableSize = 1 << betterLongTableBits // Size of the table
+	// Note: Increasing the short table bits or making the hash shorter
+	// can actually lead to compression degradation since it will 'steal' more from the
+	// long match table and match offsets are quite big.
+	// This greatly depends on the type of input.
+	betterShortTableBits = 13                        // Bits used in the short match table
+	betterShortTableSize = 1 << betterShortTableBits // Size of the table
+type prevEntry struct {
+	offset int32
+	prev   int32
+// betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
+// The long match table contains the previous entry with the same hash,
+// effectively making it a "chain" of length 2.
+// When we find a long match we choose between the two values and select the longest.
+// When we find a short match, after checking the long, we check if we can find a long at n+1
+// and that it is longer (lazy matching).
+type betterFastEncoder struct {
+	fastBase
+	table     [betterShortTableSize]tableEntry
+	longTable [betterLongTableSize]prevEntry
+// Encode encodes src into literals and sequences in blk, using both the short and the long match table.
+func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = prevEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			v2 := e.longTable[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.longTable[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+	const kSearchStrength = 9
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+		var matched int32
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			off := s + e.cur
+			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
+			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+					// Index match start+1 (long) -> s - 1
+					index0 := s + repOff
+					s += lenght + repOff
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+						}
+						break encodeLoop
+					}
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					continue
+				}
+				const repOff2 = 1
+				// We deviate from the reference encoder and also check offset 2.
+				// Still slower and not much better, so disabled.
+				// repIndex = s - offset2 + repOff2
+				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
+					// Consider history as well.
+					var seq seq
+					lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff2
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+					// rep 2
+					seq.offset = 2
+					if debugSequences {
+						println("repeat sequence 2", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+					index0 := s + repOff2
+					s += lenght + repOff2
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+						}
+						break encodeLoop
+					}
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					// Swap offsets
+					offset1, offset2 = offset2, offset1
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := candidateL.offset - e.cur
+			coffsetLP := candidateL.prev - e.cur
+			// Check if we have a long match.
+			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetL+8, src) + 8
+				t = coffsetL
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+					// Found a long match, at least 8 bytes.
+					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
+					if prevMatch > matched {
+						matched = prevMatch
+						t = coffsetLP
+					}
+					if debugAsserts && s <= t {
+						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+					}
+					if debugAsserts && s-t > e.maxMatchOff {
+						panic("s - t >e.maxMatchOff")
+					}
+					if debugMatches {
+						println("long match")
+					}
+				}
+				break
+			}
+			// Check if we have a long match on prev.
+			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
+				t = coffsetLP
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+			coffsetS := candidateS.offset - e.cur
+			// Check if we have a short match.
+			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				matched = e.matchlen(s+4, coffsetS+4, src) + 4
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, betterLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = candidateL.offset - e.cur
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("long match (after short)")
+						}
+						break
+					}
+				}
+				// Check prev long...
+				coffsetL = candidateL.prev - e.cur
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match (after short)")
+						}
+						break
+					}
+				}
+				t = coffsetS
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+		// Extend the n-byte match as long as possible.
+		l := matched
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+		// Index match start+1 (long) -> s - 1
+		index0 := s - l + 1
+		for index0 < s-1 {
+			cv0 := load6432(src, index0)
+			cv1 := cv0 >> 8
+			h0 := hash8(cv0, betterLongTableBits)
+			off := index0 + e.cur
+			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			index0 += 2
+		}
+		cv = load6432(src, s)
+		if !canRepeat {
+			continue
+		}
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+			// Store this, since we have it.
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+// EncodeNoHist will encode a block with no history and no following blocks.
+// This implementation simply forwards to Encode, so it behaves exactly like Encode;
+// no history-free shortcuts (such as skipping the max match length check) are taken.
+func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	e.Encode(blk, src)
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index 0ffea76554..50276bcde7 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -80,10 +80,7 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
 	sLimit := int32(len(src)) - inputMargin
 	// stepSize is the number of bytes to skip on every main loop iteration.
 	// It should be >= 1.
-	stepSize := int32(e.o.targetLength)
-	if stepSize == 0 {
-		stepSize++
-	}
+	const stepSize = 1
 	const kSearchStrength = 8
@@ -172,55 +169,6 @@ encodeLoop:
 					cv = load6432(src, s)
-				const repOff2 = 1
-				// We deviate from the reference encoder and also check offset 2.
-				// Slower and not consistently better, so disabled.
-				// repIndex = s - offset2 + repOff2
-				if false && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff2*8)) {
-					// Consider history as well.
-					var seq seq
-					lenght := 4 + e.matchlen(s+4+repOff2, repIndex+4, src)
-					seq.matchLen = uint32(lenght - zstdMinMatch)
-					// We might be able to match backwards.
-					// Extend as long as we can.
-					start := s + repOff2
-					// We end the search early, so we don't risk 0 literals
-					// and have to do special offset treatment.
-					startLimit := nextEmit + 1
-					tMin := s - e.maxMatchOff
-					if tMin < 0 {
-						tMin = 0
-					}
-					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
-						repIndex--
-						start--
-						seq.matchLen++
-					}
-					addLiterals(&seq, start)
-					// rep 2
-					seq.offset = 2
-					if debugSequences {
-						println("repeat sequence 2", seq, "next s:", s)
-					}
-					blk.sequences = append(blk.sequences, seq)
-					s += lenght + repOff2
-					nextEmit = s
-					if s >= sLimit {
-						if debug {
-							println("repeat ended", s, lenght)
-						}
-						break encodeLoop
-					}
-					cv = load6432(src, s)
-					// Swap offsets
-					offset1, offset2 = offset2, offset1
-					continue
-				}
 			// Find the offsets of our two matches.
 			coffsetL := s - (candidateL.offset - e.cur)
@@ -372,7 +320,7 @@ encodeLoop:
 			// Store this, since we have it.
-			nextHashS := hash5(cv1>>8, dFastShortTableBits)
+			nextHashS := hash5(cv, dFastShortTableBits)
 			nextHashL := hash8(cv, dFastLongTableBits)
 			// We have at least 4 byte match.
@@ -450,10 +398,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 	sLimit := int32(len(src)) - inputMargin
 	// stepSize is the number of bytes to skip on every main loop iteration.
 	// It should be >= 1.
-	stepSize := int32(e.o.targetLength)
-	if stepSize == 0 {
-		stepSize++
-	}
+	const stepSize = 1
 	const kSearchStrength = 8
@@ -726,4 +671,8 @@ encodeLoop:
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	// We do not store history, so we must offset e.cur to avoid false matches for next user.
+	if e.cur < bufferReset {
+		e.cur += int32(len(src))
+	}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index 28134b1589..4104b456ce 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -6,6 +6,7 @@ package zstd
 import (
+	"math"
@@ -23,26 +24,29 @@ type tableEntry struct {
 	offset int32
-type fastEncoder struct {
-	o encParams
+type fastBase struct {
 	// cur is the offset at the start of hist
 	cur int32
 	// maximum offset. Should be at least 2x block size.
 	maxMatchOff int32
 	hist        []byte
 	crc         *xxhash.Digest
-	table       [tableSize]tableEntry
 	tmp         [8]byte
 	blk         *blockEnc
+type fastEncoder struct {
+	fastBase
+	table [tableSize]tableEntry
 // CRC returns the underlying CRC writer.
-func (e *fastEncoder) CRC() *xxhash.Digest {
+func (e *fastBase) CRC() *xxhash.Digest {
 	return e.crc
 // AppendCRC will append the CRC to the destination slice and return it.
-func (e *fastEncoder) AppendCRC(dst []byte) []byte {
+func (e *fastBase) AppendCRC(dst []byte) []byte {
 	crc := e.crc.Sum(e.tmp[:0])
 	dst = append(dst, crc[7], crc[6], crc[5], crc[4])
 	return dst
@@ -50,7 +54,7 @@ func (e *fastEncoder) AppendCRC(dst []byte) []byte {
 // WindowSize returns the window size of the encoder,
 // or a window size small enough to contain the input size, if > 0.
-func (e *fastEncoder) WindowSize(size int) int32 {
+func (e *fastBase) WindowSize(size int) int32 {
 	if size > 0 && size < int(e.maxMatchOff) {
 		b := int32(1) << uint(bits.Len(uint(size)))
 		// Keep minimum window.
@@ -63,7 +67,7 @@ func (e *fastEncoder) WindowSize(size int) int32 {
 // Block returns the current block.
-func (e *fastEncoder) Block() *blockEnc {
+func (e *fastBase) Block() *blockEnc {
 	return e.blk
@@ -112,11 +116,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
 	sLimit := int32(len(src)) - inputMargin
 	// stepSize is the number of bytes to skip on every main loop iteration.
 	// It should be >= 2.
-	stepSize := int32(e.o.targetLength)
-	if stepSize == 0 {
-		stepSize++
-	}
-	stepSize++
+	const stepSize = 2
 	const hashLog = tableBits
@@ -169,9 +169,22 @@ encodeLoop:
 			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
 				// Consider history as well.
 				var seq seq
-				lenght := 4 + e.matchlen(s+6, repIndex+4, src)
+				var length int32
+				// length = 4 + e.matchlen(s+6, repIndex+4, src)
+				{
+					a := src[s+6:]
+					b := src[repIndex+4:]
+					endI := len(a) & (math.MaxInt32 - 7)
+					length = int32(endI) + 4
+					for i := 0; i < endI; i += 8 {
+						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+							break
+						}
+					}
+				}
-				seq.matchLen = uint32(lenght - zstdMinMatch)
+				seq.matchLen = uint32(length - zstdMinMatch)
 				// We might be able to match backwards.
 				// Extend as long as we can.
@@ -197,11 +210,11 @@ encodeLoop:
 					println("repeat sequence", seq, "next s:", s)
 				blk.sequences = append(blk.sequences, seq)
-				s += lenght + 2
+				s += length + 2
 				nextEmit = s
 				if s >= sLimit {
 					if debug {
-						println("repeat ended", s, lenght)
+						println("repeat ended", s, length)
 					break encodeLoop
@@ -257,7 +270,20 @@ encodeLoop:
 		// Extend the 4-byte match as long as possible.
-		l := e.matchlen(s+4, t+4, src) + 4
+		//l := e.matchlen(s+4, t+4, src) + 4
+		var l int32
+		{
+			a := src[s+4:]
+			b := src[t+4:]
+			endI := len(a) & (math.MaxInt32 - 7)
+			l = int32(endI) + 4
+			for i := 0; i < endI; i += 8 {
+				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+					break
+				}
+			}
+		}
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -294,7 +320,20 @@ encodeLoop:
 		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
+			//l := 4 + e.matchlen(s+4, o2+4, src)
+			var l int32
+			{
+				a := src[s+4:]
+				b := src[o2+4:]
+				endI := len(a) & (math.MaxInt32 - 7)
+				l = int32(endI) + 4
+				for i := 0; i < endI; i += 8 {
+					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+						break
+					}
+				}
+			}
 			// Store this, since we have it.
 			nextHash := hash6(cv, hashLog)
@@ -344,6 +383,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 			panic("src too big")
 	// Protect against e.cur wraparound.
 	if e.cur >= bufferReset {
 		for i := range e.table[:] {
@@ -412,10 +452,23 @@ encodeLoop:
 			if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) {
 				// Consider history as well.
 				var seq seq
-				// lenght := 4 + e.matchlen(s+6, repIndex+4, src)
-				lenght := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
+				// length := 4 + e.matchlen(s+6, repIndex+4, src)
+				// length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
+				var length int32
+				{
+					a := src[s+6:]
+					b := src[repIndex+4:]
+					endI := len(a) & (math.MaxInt32 - 7)
+					length = int32(endI) + 4
+					for i := 0; i < endI; i += 8 {
+						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+							break
+						}
+					}
+				}
-				seq.matchLen = uint32(lenght - zstdMinMatch)
+				seq.matchLen = uint32(length - zstdMinMatch)
 				// We might be able to match backwards.
 				// Extend as long as we can.
@@ -441,11 +494,11 @@ encodeLoop:
 					println("repeat sequence", seq, "next s:", s)
 				blk.sequences = append(blk.sequences, seq)
-				s += lenght + 2
+				s += length + 2
 				nextEmit = s
 				if s >= sLimit {
 					if debug {
-						println("repeat ended", s, lenght)
+						println("repeat ended", s, length)
 					break encodeLoop
@@ -464,6 +517,9 @@ encodeLoop:
 				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
+				if debugAsserts && t < 0 {
+					panic(fmt.Sprintf("t (%d) < 0, candidate.offset: %d, e.cur: %d, coffset0: %d, e.maxMatchOff: %d", t, candidate.offset, e.cur, coffset0, e.maxMatchOff))
+				}
@@ -496,9 +552,25 @@ encodeLoop:
 			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		if debugAsserts && t < 0 {
+			panic(fmt.Sprintf("t (%d) < 0 ", t))
+		}
 		// Extend the 4-byte match as long as possible.
 		//l := e.matchlenNoHist(s+4, t+4, src) + 4
-		l := int32(matchLen(src[s+4:], src[t+4:])) + 4
+		// l := int32(matchLen(src[s+4:], src[t+4:])) + 4
+		var l int32
+		{
+			a := src[s+4:]
+			b := src[t+4:]
+			endI := len(a) & (math.MaxInt32 - 7)
+			l = int32(endI) + 4
+			for i := 0; i < endI; i += 8 {
+				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+					break
+				}
+			}
+		}
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -536,7 +608,20 @@ encodeLoop:
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
 			//l := 4 + e.matchlenNoHist(s+4, o2+4, src)
-			l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
+			// l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
+			var l int32
+			{
+				a := src[s+4:]
+				b := src[o2+4:]
+				endI := len(a) & (math.MaxInt32 - 7)
+				l = int32(endI) + 4
+				for i := 0; i < endI; i += 8 {
+					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+						break
+					}
+				}
+			}
 			// Store this, since we have it.
 			nextHash := hash6(cv, hashLog)
@@ -569,9 +654,13 @@ encodeLoop:
 	if debug {
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	// We do not store history, so we must offset e.cur to avoid false matches for next user.
+	if e.cur < bufferReset {
+		e.cur += int32(len(src))
+	}
-func (e *fastEncoder) addBlock(src []byte) int32 {
+func (e *fastBase) addBlock(src []byte) int32 {
 	if debugAsserts && e.cur > bufferReset {
 		panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset))
@@ -602,17 +691,17 @@ func (e *fastEncoder) addBlock(src []byte) int32 {
 // useBlock will replace the block with the provided one,
 // but transfer recent offsets from the previous.
-func (e *fastEncoder) UseBlock(enc *blockEnc) {
+func (e *fastBase) UseBlock(enc *blockEnc) {
 	e.blk = enc
-func (e *fastEncoder) matchlenNoHist(s, t int32, src []byte) int32 {
+func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 {
 	// Extend the match to be as long as possible.
 	return int32(matchLen(src[s:], src[t:]))
-func (e *fastEncoder) matchlen(s, t int32, src []byte) int32 {
+func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
 	if debugAsserts {
 		if s < 0 {
 			err := fmt.Sprintf("s (%d) < 0", s)
@@ -626,18 +715,17 @@ func (e *fastEncoder) matchlen(s, t int32, src []byte) int32 {
 			err := fmt.Sprintf("s (%d) - t (%d) > maxMatchOff (%d)", s, t, e.maxMatchOff)
-	}
-	s1 := int(s) + maxMatchLength - 4
-	if s1 > len(src) {
-		s1 = len(src)
+		if len(src)-int(s) > maxCompressedBlockSize {
+			panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
+		}
 	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:s1], src[t:]))
+	return int32(matchLen(src[s:], src[t:]))
 // Reset the encoding table.
-func (e *fastEncoder) Reset() {
+func (e *fastBase) Reset(singleBlock bool) {
 	if e.blk == nil {
 		e.blk = &blockEnc{}
@@ -650,7 +738,7 @@ func (e *fastEncoder) Reset() {
 	} else {
-	if cap(e.hist) < int(e.maxMatchOff*2) {
+	if !singleBlock && cap(e.hist) < int(e.maxMatchOff*2) {
 		l := e.maxMatchOff * 2
 		// Make it at least 1MB.
 		if l < 1<<20 {
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_params.go b/vendor/github.com/klauspost/compress/zstd/enc_params.go
index b6779ecb6d..d874116f71 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_params.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_params.go
@@ -4,6 +4,8 @@
 package zstd
+// encParams are not really used, just here for reference.
 type encParams struct {
 	// largest match distance : larger == more compression, more memory needed during decompression
 	windowLog uint8
@@ -152,3 +154,4 @@ var defEncParams = [4][]encParams{
 		{14, 15, 15, 10, 3, 999, strategyBtultra2}, // level 22.
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index 4032fb9fc8..95ebc3d84e 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -35,21 +35,22 @@ type encoder interface {
 	AppendCRC([]byte) []byte
 	WindowSize(size int) int32
-	Reset()
+	Reset(singleBlock bool)
 type encoderState struct {
-	w             io.Writer
-	filling       []byte
-	current       []byte
-	previous      []byte
-	encoder       encoder
-	writing       *blockEnc
-	err           error
-	writeErr      error
-	nWritten      int64
-	headerWritten bool
-	eofWritten    bool
+	w                io.Writer
+	filling          []byte
+	current          []byte
+	previous         []byte
+	encoder          encoder
+	writing          *blockEnc
+	err              error
+	writeErr         error
+	nWritten         int64
+	headerWritten    bool
+	eofWritten       bool
+	fullFrameWritten bool
 	// This waitgroup indicates an encode is running.
 	wg sync.WaitGroup
@@ -71,27 +72,26 @@ func NewWriter(w io.Writer, opts ...EOption) (*Encoder, error) {
 	if w != nil {
-	} else {
-		e.init.Do(func() {
-			e.initialize()
-		})
 	return &e, nil
 func (e *Encoder) initialize() {
+	if e.o.concurrent == 0 {
+		e.o.setDefault()
+	}
 	e.encoders = make(chan encoder, e.o.concurrent)
 	for i := 0; i < e.o.concurrent; i++ {
-		e.encoders <- e.o.encoder()
+		enc := e.o.encoder()
+		// If not single block, history will be allocated on first use.
+		enc.Reset(true)
+		e.encoders <- enc
 // Reset will re-initialize the writer and new writes will encode to the supplied writer
 // as a new, independent stream.
 func (e *Encoder) Reset(w io.Writer) {
-	e.init.Do(func() {
-		e.initialize()
-	})
 	s := &e.state
@@ -115,9 +115,10 @@ func (e *Encoder) Reset(w io.Writer) {
 	s.filling = s.filling[:0]
 	s.current = s.current[:0]
 	s.previous = s.previous[:0]
-	s.encoder.Reset()
+	s.encoder.Reset(false)
 	s.headerWritten = false
 	s.eofWritten = false
+	s.fullFrameWritten = false
 	s.w = w
 	s.err = nil
 	s.nWritten = 0
@@ -176,6 +177,23 @@ func (e *Encoder) nextBlock(final bool) error {
 		return fmt.Errorf("block > maxStoreBlockSize")
 	if !s.headerWritten {
+		// If we have a single block encode, do a sync compression.
+		if final && len(s.filling) > 0 {
+			s.current = e.EncodeAll(s.filling, s.current[:0])
+			var n2 int
+			n2, s.err = s.w.Write(s.current)
+			if s.err != nil {
+				return s.err
+			}
+			s.nWritten += int64(n2)
+			s.current = s.current[:0]
+			s.filling = s.filling[:0]
+			s.headerWritten = true
+			s.fullFrameWritten = true
+			s.eofWritten = true
+			return nil
+		}
 		var tmp [maxHeaderSize]byte
 		fh := frameHeader{
 			ContentSize:   0,
@@ -263,7 +281,7 @@ func (e *Encoder) nextBlock(final bool) error {
 			// If we got the exact same number of literals as input,
 			// assume the literals cannot be compressed.
 			if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
-				err = blk.encode(e.o.noEntropy)
+				err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
 			switch err {
 			case errIncompressible:
@@ -298,7 +316,9 @@ func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) {
 	src := e.state.filling
 	for {
 		n2, err := r.Read(src)
-		_, _ = e.state.encoder.CRC().Write(src[:n2])
+		if e.o.crc {
+			_, _ = e.state.encoder.CRC().Write(src[:n2])
+		}
 		// src is now the unfilled part...
 		src = src[n2:]
 		n += int64(n2)
@@ -363,6 +383,9 @@ func (e *Encoder) Close() error {
 	if err != nil {
 		return err
+	if e.state.fullFrameWritten {
+		return s.err
+	}
@@ -422,18 +445,14 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		return dst
-	e.init.Do(func() {
-		e.o.setDefault()
-		e.initialize()
-	})
+	e.init.Do(e.initialize)
 	enc := <-e.encoders
 	defer func() {
 		// Release encoder reference to last block.
-		enc.Reset()
+		// If a non-single block is needed the encoder will reset again.
+		enc.Reset(true)
 		e.encoders <- enc
-	enc.Reset()
-	blk := enc.Block()
 	// Use single segments when above minimum window and below 1MB.
 	single := len(src) < 1<<20 && len(src) > MinWindowSize
 	if e.o.single != nil {
@@ -456,12 +475,13 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
-	if len(src) <= e.o.blockSize && len(src) <= maxBlockSize {
+	// If we can do everything in one block, prefer that.
+	if len(src) <= maxCompressedBlockSize {
 		// Slightly faster with no history and everything in one block.
 		if e.o.crc {
 			_, _ = enc.CRC().Write(src)
-		blk.reset(nil)
+		blk := enc.Block()
 		blk.last = true
 		enc.EncodeNoHist(blk, src)
@@ -472,7 +492,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		if len(blk.literals) != len(src) || len(src) != e.o.blockSize {
 			// Output directly to dst
 			blk.output = dst
-			err = blk.encode(e.o.noEntropy)
+			err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
 		switch err {
@@ -488,6 +508,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		blk.output = oldout
 	} else {
+		enc.Reset(false)
+		blk := enc.Block()
 		for len(src) > 0 {
 			todo := src
 			if len(todo) > e.o.blockSize {
@@ -507,7 +529,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 			// If we got the exact same number of literals as input,
 			// assume the literals cannot be compressed.
 			if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize {
-				err = blk.encode(e.o.noEntropy)
+				err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
 			switch err {
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index 40eb457331..dfac14ddde 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -12,15 +12,18 @@ type EOption func(*encoderOptions) error
 // options retains accumulated state of multiple options.
 type encoderOptions struct {
-	concurrent int
-	crc        bool
-	single     *bool
-	pad        int
-	blockSize  int
-	windowSize int
-	level      EncoderLevel
-	fullZero   bool
-	noEntropy  bool
+	concurrent      int
+	level           EncoderLevel
+	single          *bool
+	pad             int
+	blockSize       int
+	windowSize      int
+	crc             bool
+	fullZero        bool
+	noEntropy       bool
+	allLitEntropy   bool
+	customWindow    bool
+	customALEntropy bool
 func (o *encoderOptions) setDefault() {
@@ -30,7 +33,7 @@ func (o *encoderOptions) setDefault() {
 		crc:        true,
 		single:     nil,
 		blockSize:  1 << 16,
-		windowSize: 1 << 22,
+		windowSize: 8 << 20,
 		level:      SpeedDefault,
@@ -39,9 +42,11 @@ func (o *encoderOptions) setDefault() {
 func (o encoderOptions) encoder() encoder {
 	switch o.level {
 	case SpeedDefault:
-		return &doubleFastEncoder{fastEncoder: fastEncoder{maxMatchOff: int32(o.windowSize)}}
+		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}}
+	case SpeedBetterCompression:
+		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
 	case SpeedFastest:
-		return &fastEncoder{maxMatchOff: int32(o.windowSize)}
+		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
 	panic("unknown compression level")
@@ -67,7 +72,7 @@ func WithEncoderConcurrency(n int) EOption {
 // WithWindowSize will set the maximum allowed back-reference distance.
-// The value must be a power of two between WindowSizeMin and WindowSizeMax.
+// The value must be a power of two between MinWindowSize and MaxWindowSize.
 // A larger value will enable better compression but allocate more memory and,
 // for above-default values, take considerably longer.
 // The default value is determined by the compression level.
@@ -83,6 +88,7 @@ func WithWindowSize(n int) EOption {
 		o.windowSize = n
+		o.customWindow = true
 		if o.blockSize > o.windowSize {
 			o.blockSize = o.windowSize
@@ -130,18 +136,18 @@ const (
 	// This is roughly equivalent to the default Zstandard mode (level 3).
+	// SpeedBetterCompression will yield better compression than the default.
+	// Currently it is about zstd level 7-8 with ~ 2x-3x the default CPU usage.
+	// By using this, notice that CPU usage may go up in the future.
+	SpeedBetterCompression
 	// speedLast should be kept as the last actual compression option.
 	// The is not for external usage, but is used to keep track of the valid options.
-	// SpeedBetterCompression will (in the future) yield better compression than the default,
-	// but at approximately 4x the CPU usage of the default.
-	// For now this is not implemented.
-	SpeedBetterCompression = SpeedDefault
 	// SpeedBestCompression will choose the best available compression option.
 	// For now this is not implemented.
-	SpeedBestCompression = SpeedDefault
+	SpeedBestCompression = SpeedBetterCompression
 // EncoderLevelFromString will convert a string representation of an encoding level back
@@ -163,8 +169,10 @@ func EncoderLevelFromZstd(level int) EncoderLevel {
 	switch {
 	case level < 3:
 		return SpeedFastest
-	case level >= 3:
+	case level >= 3 && level < 6:
 		return SpeedDefault
+	case level > 5:
+		return SpeedBetterCompression
 	return SpeedDefault
@@ -176,6 +184,8 @@ func (e EncoderLevel) String() string {
 		return "fastest"
 	case SpeedDefault:
 		return "default"
+	case SpeedBetterCompression:
+		return "better"
 		return "invalid"
@@ -189,6 +199,20 @@ func WithEncoderLevel(l EncoderLevel) EOption {
 			return fmt.Errorf("unknown encoder level")
 		o.level = l
+		if !o.customWindow {
+			switch o.level {
+			case SpeedFastest:
+				o.windowSize = 4 << 20
+			case SpeedDefault:
+				o.windowSize = 8 << 20
+			case SpeedBetterCompression:
+				o.windowSize = 16 << 20
+			}
+		}
+		if !o.customALEntropy {
+			o.allLitEntropy = l > SpeedFastest
+		}
 		return nil
@@ -203,6 +227,18 @@ func WithZeroFrames(b bool) EOption {
+// WithAllLitEntropyCompression will apply entropy compression if no matches are found.
+// Disabling this will skip incompressible data faster, but in cases with no matches but
+// skewed character distribution compression is lost.
+// Default value depends on the compression level selected.
+func WithAllLitEntropyCompression(b bool) EOption {
+	return func(o *encoderOptions) error {
+		o.customALEntropy = true
+		o.allLitEntropy = b
+		return nil
+	}
 // WithNoEntropyCompression will always skip entropy compression of literals.
 // This can be useful if content has matches, but unlikely to benefit from entropy
 // compression. Usually the slight speed improvement is not worth enabling this.
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index cda590b5f7..fc4a566d39 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -16,16 +16,11 @@ import (
 type frameDec struct {
-	o         decoderOptions
-	crc       hash.Hash64
-	frameDone sync.WaitGroup
-	offset    int64
+	o      decoderOptions
+	crc    hash.Hash64
+	offset int64
-	WindowSize       uint64
-	DictionaryID     uint32
-	FrameContentSize uint64
-	HasCheckSum      bool
-	SingleSegment    bool
+	WindowSize uint64
 	// maxWindowSize is the maximum windows size to support.
 	// should never be bigger than max-int.
@@ -42,9 +37,16 @@ type frameDec struct {
 	// Byte buffer that can be reused for small input blocks.
 	bBuf byteBuf
+	FrameContentSize uint64
+	frameDone        sync.WaitGroup
+	DictionaryID  *uint32
+	HasCheckSum   bool
+	SingleSegment bool
 	// asyncRunning indicates whether the async routine processes input on 'decoding'.
-	asyncRunning   bool
 	asyncRunningMu sync.Mutex
+	asyncRunning   bool
 const (
@@ -140,7 +142,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 	// Read Dictionary_ID
 	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
-	d.DictionaryID = 0
+	d.DictionaryID = nil
 	if size := fhd & 3; size != 0 {
 		if size == 3 {
 			size = 4
@@ -152,19 +154,22 @@ func (d *frameDec) reset(br byteBuffer) error {
 			return io.ErrUnexpectedEOF
+		var id uint32
 		switch size {
 		case 1:
-			d.DictionaryID = uint32(b[0])
+			id = uint32(b[0])
 		case 2:
-			d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8)
+			id = uint32(b[0]) | (uint32(b[1]) << 8)
 		case 4:
-			d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+			id = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
 		if debug {
-			println("Dict size", size, "ID:", d.DictionaryID)
+			println("Dict size", size, "ID:", id)
-		if d.DictionaryID != 0 {
-			return ErrUnknownDictionary
+		if id > 0 {
+			// ID 0 means "sorry, no dictionary anyway".
+			// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
+			d.DictionaryID = &id
@@ -231,7 +236,11 @@ func (d *frameDec) reset(br byteBuffer) error {
 		return ErrWindowSizeTooSmall
 	d.history.windowSize = int(d.WindowSize)
-	d.history.maxSize = d.history.windowSize + maxBlockSize
+	if d.o.lowMem && d.history.windowSize < maxBlockSize {
+		d.history.maxSize = d.history.windowSize * 2
+	} else {
+		d.history.maxSize = d.history.windowSize + maxBlockSize
+	}
 	// history contains input - maybe we do something
 	d.rawInput = br
 	return nil
@@ -318,8 +327,8 @@ func (d *frameDec) checkCRC() error {
 func (d *frameDec) initAsync() {
 	if !d.o.lowMem && !d.SingleSegment {
-		// set max extra size history to 20MB.
-		d.history.maxSize = d.history.windowSize + maxBlockSize*10
+		// set max extra size history to 10MB.
+		d.history.maxSize = d.history.windowSize + maxBlockSize*5
 	// re-alloc if more than one extra block size.
 	if d.o.lowMem && cap(d.history.b) > d.history.maxSize+maxBlockSize {
@@ -345,8 +354,6 @@ func (d *frameDec) initAsync() {
 // When the frame has finished decoding the *bufio.Reader
 // containing the remaining input will be sent on frameDec.frameDone.
 func (d *frameDec) startDecoder(output chan decodeOutput) {
-	// TODO: Init to dictionary
-	d.history.reset()
 	written := int64(0)
 	defer func() {
@@ -439,8 +446,6 @@ func (d *frameDec) startDecoder(output chan decodeOutput) {
 // runDecoder will create a sync decoder that will decode a block of data.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
-	// TODO: Init to dictionary
-	d.history.reset()
 	saved := d.history.b
 	// We use the history for output to avoid copying it.
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index e002be98b9..e6d3d49b39 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -19,7 +19,7 @@ const (
 	 *  Increasing memory usage improves compression ratio
 	 *  Reduced memory usage can improve speed, due to cache effect
 	 *  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
-	maxMemoryUsage = 11
+	maxMemoryUsage = tablelogAbsoluteMax + 2
 	maxTableLog    = maxMemoryUsage - 2
 	maxTablesize   = 1 << maxTableLog
@@ -55,7 +55,7 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 	if b.remain() < 4 {
 		return errors.New("input too small")
-	bitStream := b.Uint32()
+	bitStream := b.Uint32NC()
 	nbBits := uint((bitStream & 0xF) + minTablelog) // extract tableLog
 	if nbBits > tablelogAbsoluteMax {
 		println("Invalid tablelog:", nbBits)
@@ -79,7 +79,8 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 				n0 += 24
 				if r := b.remain(); r > 5 {
-					bitStream = b.Uint32() >> bitCount
+					// The check above should make sure we can read 32 bits
+					bitStream = b.Uint32NC() >> bitCount
 				} else {
 					// end of bit stream
 					bitStream >>= 16
@@ -104,10 +105,11 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
-			if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
+			if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
 				b.advance(bitCount >> 3)
 				bitCount &= 7
-				bitStream = b.Uint32() >> bitCount
+				// The check above should make sure we can read 32 bits
+				bitStream = b.Uint32NC() >> bitCount
 			} else {
 				bitStream >>= 2
@@ -148,17 +150,16 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 			threshold >>= 1
-		//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "remain:", b.remain())
-		if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
+		if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
 			b.advance(bitCount >> 3)
 			bitCount &= 7
+			// The check above should make sure we can read 32 bits
+			bitStream = b.Uint32NC() >> (bitCount & 31)
 		} else {
 			bitCount -= (uint)(8 * (len(b.b) - 4 - b.off))
 			b.off = len(b.b) - 4
-			//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "iend", iend)
+			bitStream = b.Uint32() >> (bitCount & 31)
-		bitStream = b.Uint32() >> (bitCount & 31)
-		//printf("bitstream is now: 0b%b", bitStream)
 	s.symbolLen = charnum
 	if s.symbolLen <= 1 {
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
index e8c419bd53..f418f50fcd 100644
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -17,6 +17,7 @@ type history struct {
 	windowSize    int
 	maxSize       int
 	error         bool
+	dict          *dict
 // reset will reset the history to initial state of a frame.
@@ -36,12 +37,27 @@ func (h *history) reset() {
 	h.decoders = sequenceDecs{}
 	if h.huffTree != nil {
-		huffDecoderPool.Put(h.huffTree)
+		if h.dict == nil || h.dict.litDec != h.huffTree {
+			huffDecoderPool.Put(h.huffTree)
+		}
 	h.huffTree = nil
+	h.dict = nil
 	//printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
+func (h *history) setDict(dict *dict) {
+	if dict == nil {
+		return
+	}
+	h.dict = dict
+	h.decoders.litLengths = dict.llDec
+	h.decoders.offsets = dict.ofDec
+	h.decoders.matchLengths = dict.mlDec
+	h.recentOffsets = dict.offsets
+	h.huffTree = dict.litDec
 // append bytes to history.
 // This function will make sure there is space for it,
 // if the buffer has been allocated with enough extra space.
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index 15a45f7b50..7ff870400d 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -62,8 +62,10 @@ type sequenceDecs struct {
 	matchLengths sequenceDec
 	prevOffset   [3]int
 	hist         []byte
+	dict         []byte
 	literals     []byte
 	out          []byte
+	windowSize   int
 	maxBits      uint8
@@ -82,7 +84,12 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
 	s.hist = hist.b
 	s.prevOffset = hist.recentOffsets
 	s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
+	s.windowSize = hist.windowSize
 	s.out = out
+	s.dict = nil
+	if hist.dict != nil {
+		s.dict = hist.dict.content
+	}
 	return nil
@@ -98,23 +105,78 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 			printf("reading sequence %d, exceeded available data\n", seqs-i)
 			return io.ErrUnexpectedEOF
-		var litLen, matchOff, matchLen int
+		var ll, mo, ml int
 		if br.off > 4+((maxOffsetBits+16+16)>>3) {
-			litLen, matchOff, matchLen = s.nextFast(br, llState, mlState, ofState)
+			// inlined function:
+			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
+			// Final will not read from stream.
+			var llB, mlB, moB uint8
+			ll, llB = llState.final()
+			ml, mlB = mlState.final()
+			mo, moB = ofState.final()
+			// extra bits are stored in reverse order.
+			br.fillFast()
+			mo += br.getBits(moB)
+			if s.maxBits > 32 {
+				br.fillFast()
+			}
+			ml += br.getBits(mlB)
+			ll += br.getBits(llB)
+			if moB > 1 {
+				s.prevOffset[2] = s.prevOffset[1]
+				s.prevOffset[1] = s.prevOffset[0]
+				s.prevOffset[0] = mo
+			} else {
+				// mo = s.adjustOffset(mo, ll, moB)
+				// Inlined for rather big speedup
+				if ll == 0 {
+					// There is an exception though, when current sequence's literals_length = 0.
+					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+					mo++
+				}
+				if mo == 0 {
+					mo = s.prevOffset[0]
+				} else {
+					var temp int
+					if mo == 3 {
+						temp = s.prevOffset[0] - 1
+					} else {
+						temp = s.prevOffset[mo]
+					}
+					if temp == 0 {
+						// 0 is not valid; input is corrupted; force offset to 1
+						println("temp was 0")
+						temp = 1
+					}
+					if mo != 1 {
+						s.prevOffset[2] = s.prevOffset[1]
+					}
+					s.prevOffset[1] = s.prevOffset[0]
+					s.prevOffset[0] = temp
+					mo = temp
+				}
+			}
 		} else {
-			litLen, matchOff, matchLen = s.next(br, llState, mlState, ofState)
+			ll, mo, ml = s.next(br, llState, mlState, ofState)
 		if debugSequences {
-			println("Seq", seqs-i-1, "Litlen:", litLen, "matchOff:", matchOff, "(abs) matchLen:", matchLen)
+			println("Seq", seqs-i-1, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
-		if litLen > len(s.literals) {
-			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", litLen, len(s.literals))
+		if ll > len(s.literals) {
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals))
-		size := litLen + matchLen + len(s.out)
+		size := ll + ml + len(s.out)
 		if size-startSize > maxBlockSize {
 			return fmt.Errorf("output (%d) bigger than max block size", size)
@@ -125,49 +187,70 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 			s.out = append(s.out, make([]byte, maxBlockSize)...)
 			s.out = s.out[:len(s.out)-maxBlockSize]
-		if matchLen > maxMatchLen {
-			return fmt.Errorf("match len (%d) bigger than max allowed length", matchLen)
-		}
-		if matchOff > len(s.out)+len(hist)+litLen {
-			return fmt.Errorf("match offset (%d) bigger than current history (%d)", matchOff, len(s.out)+len(hist)+litLen)
-		}
-		if matchOff == 0 && matchLen > 0 {
-			return fmt.Errorf("zero matchoff and matchlen > 0")
+		if ml > maxMatchLen {
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
-		s.out = append(s.out, s.literals[:litLen]...)
-		s.literals = s.literals[litLen:]
+		// Add literals
+		s.out = append(s.out, s.literals[:ll]...)
+		s.literals = s.literals[ll:]
 		out := s.out
+		if mo > len(s.out)+len(hist) || mo > s.windowSize {
+			if len(s.dict) == 0 {
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+			}
+			// we may be in dictionary.
+			dictO := len(s.dict) - (mo - (len(s.out) + len(hist)))
+			if dictO < 0 || dictO >= len(s.dict) {
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+			}
+			end := dictO + ml
+			if end > len(s.dict) {
+				out = append(out, s.dict[dictO:]...)
+				mo -= len(s.dict) - dictO
+				ml -= len(s.dict) - dictO
+			} else {
+				out = append(out, s.dict[dictO:end]...)
+				mo = 0
+				ml = 0
+			}
+		}
+		if mo == 0 && ml > 0 {
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+		}
 		// Copy from history.
 		// TODO: Blocks without history could be made to ignore this completely.
-		if v := matchOff - len(s.out); v > 0 {
+		if v := mo - len(s.out); v > 0 {
 			// v is the start position in history from end.
 			start := len(s.hist) - v
-			if matchLen > v {
+			if ml > v {
 				// Some goes into current block.
 				// Copy remainder of history
 				out = append(out, s.hist[start:]...)
-				matchOff -= v
-				matchLen -= v
+				mo -= v
+				ml -= v
 			} else {
-				out = append(out, s.hist[start:start+matchLen]...)
-				matchLen = 0
+				out = append(out, s.hist[start:start+ml]...)
+				ml = 0
 		// We must be in current buffer now
-		if matchLen > 0 {
-			start := len(s.out) - matchOff
-			if matchLen <= len(s.out)-start {
+		if ml > 0 {
+			start := len(s.out) - mo
+			if ml <= len(s.out)-start {
 				// No overlap
-				out = append(out, s.out[start:start+matchLen]...)
+				out = append(out, s.out[start:start+ml]...)
 			} else {
 				// Overlapping copy
 				// Extend destination slice and copy one byte at the time.
-				out = out[:len(out)+matchLen]
-				src := out[start : start+matchLen]
+				out = out[:len(out)+ml]
+				src := out[start : start+ml]
 				// Destination is the space we just added.
-				dst := out[len(out)-matchLen:]
+				dst := out[len(out)-ml:]
 				dst = dst[:len(src)]
 				for i := range src {
 					dst[i] = src[i]
diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go
index 356956ba25..690428cd24 100644
--- a/vendor/github.com/klauspost/compress/zstd/snappy.go
+++ b/vendor/github.com/klauspost/compress/zstd/snappy.go
@@ -178,7 +178,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
 				r.err = ErrSnappyCorrupt
 				return written, r.err
-			err = r.block.encode(false)
+			err = r.block.encode(false, false)
 			switch err {
 			case errIncompressible:
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index 5e0b64cccf..0807719c8b 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -87,6 +87,17 @@ func printf(format string, a ...interface{}) {
+// matchLenFast does matching, but will not match the last up to 7 bytes.
+func matchLenFast(a, b []byte) int {
+	endI := len(a) & (math.MaxInt32 - 7)
+	for i := 0; i < endI; i += 8 {
+		if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+			return i + bits.TrailingZeros64(diff)>>3
+		}
+	}
+	return endI
 // matchLen returns the maximum length.
 // a must be the shortest of the two.
 // The function also returns whether all bytes matched.
@@ -97,33 +108,18 @@ func matchLen(a, b []byte) int {
 			return i + (bits.TrailingZeros64(diff) >> 3)
 	checked := (len(a) >> 3) << 3
 	a = a[checked:]
 	b = b[checked:]
-	// TODO: We could do a 4 check.
 	for i := range a {
 		if a[i] != b[i] {
-			return int(i) + checked
+			return i + checked
 	return len(a) + checked
-// matchLen returns a match length in src between index s and t
-func matchLenIn(src []byte, s, t int32) int32 {
-	s1 := len(src)
-	b := src[t:]
-	a := src[s:s1]
-	b = b[:len(a)]
-	// Extend the match to be as long as possible.
-	for i := range a {
-		if a[i] != b[i] {
-			return int32(i)
-		}
-	}
-	return int32(len(a))
 func load3232(b []byte, i int32) uint32 {
 	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
 	b = b[i:]
diff --git a/vendor/gopkg.in/ini.v1/file.go b/vendor/gopkg.in/ini.v1/file.go
index f95606f90f..7eecacaf2b 100644
--- a/vendor/gopkg.in/ini.v1/file.go
+++ b/vendor/gopkg.in/ini.v1/file.go
@@ -55,6 +55,9 @@ func newFile(dataSources []dataSource, opts LoadOptions) *File {
 	if len(opts.KeyValueDelimiterOnWrite) == 0 {
 		opts.KeyValueDelimiterOnWrite = "="
+	if len(opts.ChildSectionDelimiter) == 0 {
+		opts.ChildSectionDelimiter = "."
+	}
 	return &File{
 		BlockMode:   true,
@@ -82,7 +85,7 @@ func (f *File) NewSection(name string) (*Section, error) {
 		return nil, errors.New("empty section name")
-	if f.options.Insensitive && name != DefaultSection {
+	if (f.options.Insensitive || f.options.InsensitiveSections) && name != DefaultSection {
 		name = strings.ToLower(name)
@@ -144,7 +147,7 @@ func (f *File) SectionsByName(name string) ([]*Section, error) {
 	if len(name) == 0 {
 		name = DefaultSection
-	if f.options.Insensitive {
+	if f.options.Insensitive || f.options.InsensitiveSections {
 		name = strings.ToLower(name)
@@ -236,7 +239,7 @@ func (f *File) DeleteSectionWithIndex(name string, index int) error {
 	if len(name) == 0 {
 		name = DefaultSection
-	if f.options.Insensitive {
+	if f.options.Insensitive || f.options.InsensitiveSections {
 		name = strings.ToLower(name)
@@ -347,7 +350,7 @@ func (f *File) writeToBuffer(indent string) (*bytes.Buffer, error) {
-		if i > 0 || DefaultHeader {
+		if i > 0 || DefaultHeader || (i == 0 && strings.ToUpper(sec.name) != DefaultSection) {
 			if _, err := buf.WriteString("[" + sname + "]" + LineBreak); err != nil {
 				return nil, err
@@ -451,6 +454,8 @@ func (f *File) writeToBuffer(indent string) (*bytes.Buffer, error) {
 					val = `"""` + val + `"""`
 				} else if !f.options.IgnoreInlineComment && strings.ContainsAny(val, "#;") {
 					val = "`" + val + "`"
+				} else if len(strings.TrimSpace(val)) != len(val) {
+					val = `"` + val + `"`
 				if _, err := buf.WriteString(equalSign + val + LineBreak); err != nil {
 					return nil, err
diff --git a/vendor/gopkg.in/ini.v1/ini.go b/vendor/gopkg.in/ini.v1/ini.go
index 2961543f91..80ebf3ad69 100644
--- a/vendor/gopkg.in/ini.v1/ini.go
+++ b/vendor/gopkg.in/ini.v1/ini.go
@@ -71,6 +71,10 @@ type LoadOptions struct {
 	Loose bool
 	// Insensitive indicates whether the parser forces all section and key names to lowercase.
 	Insensitive bool
+	// InsensitiveSections indicates whether the parser forces all section to lowercase.
+	InsensitiveSections bool
+	// InsensitiveKeys indicates whether the parser forces all key names to lowercase.
+	InsensitiveKeys bool
 	// IgnoreContinuation indicates whether to ignore continuation lines while parsing.
 	IgnoreContinuation bool
 	// IgnoreInlineComment indicates whether to ignore comments at the end of value and treat it as part of value.
@@ -109,6 +113,8 @@ type LoadOptions struct {
 	KeyValueDelimiters string
 	// KeyValueDelimiters is the delimiter that are used to separate key and value output. By default, it is "=".
 	KeyValueDelimiterOnWrite string
+	// ChildSectionDelimiter is the delimiter that is used to separate child sections. By default, it is ".".
+	ChildSectionDelimiter string
 	// PreserveSurroundedQuote indicates whether to preserve surrounded quote (single and double quotes).
 	PreserveSurroundedQuote bool
 	// DebugFunc is called to collect debug information (currently only useful to debug parsing Python-style multiline values).
diff --git a/vendor/gopkg.in/ini.v1/parser.go b/vendor/gopkg.in/ini.v1/parser.go
index ea6c08b029..65147166f9 100644
--- a/vendor/gopkg.in/ini.v1/parser.go
+++ b/vendor/gopkg.in/ini.v1/parser.go
@@ -377,7 +377,7 @@ func (f *File) parse(reader io.Reader) (err error) {
 	// Ignore error because default section name is never empty string.
 	name := DefaultSection
-	if f.options.Insensitive {
+	if f.options.Insensitive || f.options.InsensitiveSections {
 		name = strings.ToLower(DefaultSection)
 	section, _ := f.NewSection(name)
@@ -469,7 +469,7 @@ func (f *File) parse(reader io.Reader) (err error) {
 			inUnparseableSection = false
 			for i := range f.options.UnparseableSections {
 				if f.options.UnparseableSections[i] == name ||
-					(f.options.Insensitive && strings.EqualFold(f.options.UnparseableSections[i], name)) {
+					((f.options.Insensitive || f.options.InsensitiveSections) && strings.EqualFold(f.options.UnparseableSections[i], name)) {
 					inUnparseableSection = true
diff --git a/vendor/gopkg.in/ini.v1/section.go b/vendor/gopkg.in/ini.v1/section.go
index 6ba5ac2905..afaa97c97e 100644
--- a/vendor/gopkg.in/ini.v1/section.go
+++ b/vendor/gopkg.in/ini.v1/section.go
@@ -66,7 +66,7 @@ func (s *Section) SetBody(body string) {
 func (s *Section) NewKey(name, val string) (*Key, error) {
 	if len(name) == 0 {
 		return nil, errors.New("error creating new key: empty key name")
-	} else if s.f.options.Insensitive {
+	} else if s.f.options.Insensitive || s.f.options.InsensitiveKeys {
 		name = strings.ToLower(name)
@@ -109,7 +109,7 @@ func (s *Section) GetKey(name string) (*Key, error) {
 	if s.f.BlockMode {
-	if s.f.options.Insensitive {
+	if s.f.options.Insensitive || s.f.options.InsensitiveKeys {
 		name = strings.ToLower(name)
 	key := s.keys[name]
@@ -121,7 +121,7 @@ func (s *Section) GetKey(name string) (*Key, error) {
 		// Check if it is a child-section.
 		sname := s.name
 		for {
-			if i := strings.LastIndex(sname, "."); i > -1 {
+			if i := strings.LastIndex(sname, s.f.options.ChildSectionDelimiter); i > -1 {
 				sname = sname[:i]
 				sec, err := s.f.GetSection(sname)
 				if err != nil {
@@ -188,7 +188,7 @@ func (s *Section) ParentKeys() []*Key {
 	var parentKeys []*Key
 	sname := s.name
 	for {
-		if i := strings.LastIndex(sname, "."); i > -1 {
+		if i := strings.LastIndex(sname, s.f.options.ChildSectionDelimiter); i > -1 {
 			sname = sname[:i]
 			sec, err := s.f.GetSection(sname)
 			if err != nil {
@@ -245,7 +245,7 @@ func (s *Section) DeleteKey(name string) {
 // For example, "[parent.child1]" and "[parent.child12]" are child sections
 // of section "[parent]".
 func (s *Section) ChildSections() []*Section {
-	prefix := s.name + "."
+	prefix := s.name + s.f.options.ChildSectionDelimiter
 	children := make([]*Section, 0, 3)
 	for _, name := range s.f.sectionList {
 		if strings.HasPrefix(name, prefix) {
diff --git a/vendor/gopkg.in/ini.v1/struct.go b/vendor/gopkg.in/ini.v1/struct.go
index 9be40a9200..ad90300f60 100644
--- a/vendor/gopkg.in/ini.v1/struct.go
+++ b/vendor/gopkg.in/ini.v1/struct.go
@@ -479,7 +479,7 @@ func reflectSliceWithProperType(key *Key, field reflect.Value, delim string, all
 				_ = keyWithShadows.AddShadow(val)
-		key = keyWithShadows
+		*key = *keyWithShadows
 		return nil
@@ -595,7 +595,7 @@ func (s *Section) reflectFrom(val reflect.Value) error {
-		if (tpField.Type.Kind() == reflect.Ptr && tpField.Anonymous) ||
+		if (tpField.Type.Kind() == reflect.Ptr && tpField.Type.Elem().Kind() == reflect.Struct) ||
 			(tpField.Type.Kind() == reflect.Struct && tpField.Type.Name() != "Time") {
 			// Note: The only error here is section doesn't exist.
 			sec, err := s.f.GetSection(fieldName)
diff --git a/vendor/modules.txt b/vendor/modules.txt
index c2038c5817..8360fc5aeb 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -25,7 +25,7 @@ gitea.com/macaron/cors
 # gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439
 ## explicit
-# gitea.com/macaron/gzip v0.0.0-20191118041502-506895b47aae
+# gitea.com/macaron/gzip v0.0.0-20200827120000-efa5e8477cf5
 ## explicit
 # gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223
@@ -34,7 +34,7 @@ gitea.com/macaron/i18n
 # gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a
 ## explicit
-# gitea.com/macaron/macaron v1.4.0
+# gitea.com/macaron/macaron v1.5.0
 ## explicit
 # gitea.com/macaron/session v0.0.0-20191207215012-613cebf0674d
@@ -479,7 +479,7 @@ github.com/keybase/go-crypto/openpgp/errors
-# github.com/klauspost/compress v1.10.2
+# github.com/klauspost/compress v1.10.11
 ## explicit
@@ -768,7 +768,7 @@ go.mongodb.org/mongo-driver/bson/bsonrw
-# golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de
+# golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a
 ## explicit
@@ -933,7 +933,7 @@ gopkg.in/asn1-ber.v1
 # gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df
 ## explicit
-# gopkg.in/ini.v1 v1.57.0
+# gopkg.in/ini.v1 v1.60.2
 ## explicit
 # gopkg.in/ldap.v3 v3.0.2