diff --git a/caddy/caddymain/run.go b/caddy/caddymain/run.go index f1415b82..6ffcd5c6 100644 --- a/caddy/caddymain/run.go +++ b/caddy/caddymain/run.go @@ -27,6 +27,7 @@ import ( "strings" "github.com/google/uuid" + "github.com/klauspost/cpuid" "github.com/mholt/caddy" "github.com/mholt/caddy/caddytls" "github.com/mholt/caddy/diagnostics" @@ -51,6 +52,7 @@ func init() { flag.StringVar(&caddytls.DefaultEmail, "email", "", "Default ACME CA account email address") flag.DurationVar(&acme.HTTPClient.Timeout, "catimeout", acme.HTTPClient.Timeout, "Default ACME CA HTTP timeout") flag.StringVar(&logfile, "log", "", "Process log file") + flag.BoolVar(&noDiag, "no-diagnostics", false, "Disable diagnostic reporting") flag.StringVar(&caddy.PidFile, "pidfile", "", "Path to write pid file") flag.BoolVar(&caddy.Quiet, "quiet", false, "Quiet mode (no initialization output)") flag.StringVar(&revoke, "revoke", "", "Hostname for which to revoke the certificate") @@ -88,7 +90,9 @@ func Run() { } // initialize diagnostics client - initDiagnostics() + if !noDiag { + initDiagnostics() + } // Check for one-time actions if revoke != "" { @@ -146,6 +150,23 @@ func Run() { // Execute instantiation events caddy.EmitEvent(caddy.InstanceStartupEvent, instance) + // Begin diagnostics (these are no-ops if diagnostics disabled) + diagnostics.Set("caddy_version", appVersion) + // TODO: plugins + diagnostics.Set("num_listeners", len(instance.Servers())) + diagnostics.Set("os", runtime.GOOS) + diagnostics.Set("arch", runtime.GOARCH) + diagnostics.Set("cpu", struct { + NumLogical int `json:"num_logical"` + AESNI bool `json:"aes_ni"` + BrandName string `json:"brand_name"` + }{ + NumLogical: runtime.NumCPU(), + AESNI: cpuid.CPU.AesNi(), + BrandName: cpuid.CPU.BrandName, + }) + diagnostics.StartEmitting() + // Twiddle your thumbs instance.Wait() } @@ -321,6 +342,7 @@ var ( version bool plugins bool validate bool + noDiag bool ) // Build information obtained with the help of -ldflags diff --git a/caddyhttp/httpserver/plugin.go b/caddyhttp/httpserver/plugin.go index 643eea7f..58a63619 100644 --- a/caddyhttp/httpserver/plugin.go +++ b/caddyhttp/httpserver/plugin.go @@ -29,6 +29,7 @@ import ( "github.com/mholt/caddy/caddyfile" "github.com/mholt/caddy/caddyhttp/staticfiles" "github.com/mholt/caddy/caddytls" + "github.com/mholt/caddy/diagnostics" ) const serverType = "http" @@ -205,6 +206,8 @@ func (h *httpContext) MakeServers() ([]caddy.Server, error) { } } + diagnostics.Set("num_sites", len(h.siteConfigs)) + // we must map (group) each config to a bind address groups, err := groupSiteConfigsByListenAddr(h.siteConfigs) if err != nil { diff --git a/caddyhttp/httpserver/server.go b/caddyhttp/httpserver/server.go index 92f2b6fd..5033bb21 100644 --- a/caddyhttp/httpserver/server.go +++ b/caddyhttp/httpserver/server.go @@ -36,6 +36,7 @@ import ( "github.com/mholt/caddy" "github.com/mholt/caddy/caddyhttp/staticfiles" "github.com/mholt/caddy/caddytls" + "github.com/mholt/caddy/diagnostics" ) // Server is the HTTP server implementation. @@ -345,6 +346,8 @@ func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) { } }() + go diagnostics.AppendUniqueString("user_agent", r.Header.Get("User-Agent")) + // copy the original, unchanged URL into the context // so it can be referenced by middlewares urlCopy := *r.URL diff --git a/caddytls/client.go b/caddytls/client.go index 26ef6a3c..44f39480 100644 --- a/caddytls/client.go +++ b/caddytls/client.go @@ -26,6 +26,7 @@ import ( "time" "github.com/mholt/caddy" + "github.com/mholt/caddy/diagnostics" "github.com/xenolf/lego/acme" ) @@ -276,6 +277,8 @@ Attempts: break } + go diagnostics.Increment("acme_certificates_obtained") + return nil } @@ -350,8 +353,9 @@ func (c *ACMEClient) Renew(name string) error { return errors.New("too many renewal attempts; last error: " + err.Error()) } - // Executes Cert renew events caddy.EmitEvent(caddy.CertRenewEvent, name) + go diagnostics.Increment("acme_certificates_obtained") + go diagnostics.Increment("acme_certificates_renewed") return saveCertResource(storage, newCertMeta) } diff --git a/diagnostics/collection.go b/diagnostics/collection.go new file mode 100644 index 00000000..40e42e5b --- /dev/null +++ b/diagnostics/collection.go @@ -0,0 +1,250 @@ +// Copyright 2015 Light Code Labs, LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package diagnostics + +import ( + "log" + + "github.com/google/uuid" +) + +// Init initializes this package so that it may +// be used. Do not call this function more than +// once. Init panics if it is called more than +// once or if the UUID value is empty. Once this +// function is called, the rest of the package +// may safely be used. If this function is not +// called, the collector functions may still be +// invoked, but they will be no-ops. +func Init(instanceID uuid.UUID) { + if enabled { + panic("already initialized") + } + if instanceID.String() == "" { + panic("empty UUID") + } + instanceUUID = instanceID + enabled = true +} + +// StartEmitting sends the current payload and begins the +// transmission cycle for updates. This is the first +// update sent, and future ones will be sent until +// StopEmitting is called. +// +// This function is non-blocking (it spawns a new goroutine). +// +// This function panics if it was called more than once. +// It is a no-op if this package was not initialized. +func StartEmitting() { + if !enabled { + return + } + updateTimerMu.Lock() + if updateTimer != nil { + updateTimerMu.Unlock() + panic("updates already started") + } + updateTimerMu.Unlock() + updateMu.Lock() + if updating { + updateMu.Unlock() + panic("update already in progress") + } + updateMu.Unlock() + go logEmit(false) +} + +// StopEmitting sends the current payload and terminates +// the update cycle. No more updates will be sent. +// +// It is a no-op if the package was never initialized +// or if emitting was never started. +func StopEmitting() { + if !enabled { + return + } + updateTimerMu.Lock() + if updateTimer == nil { + updateTimerMu.Unlock() + return + } + updateTimerMu.Unlock() + logEmit(true) +} + +// Set puts a value in the buffer to be included +// in the next emission. It overwrites any +// previous value. +// +// This function is safe for multiple goroutines, +// and it is recommended to call this using the +// go keyword after the call to SendHello so it +// doesn't block crucial code. +func Set(key string, val interface{}) { + if !enabled { + return + } + bufferMu.Lock() + if bufferItemCount >= maxBufferItems { + bufferMu.Unlock() + return + } + if _, ok := buffer[key]; !ok { + bufferItemCount++ + } + buffer[key] = val + bufferMu.Unlock() +} + +// Append appends value to a list named key. +// If key is new, a new list will be created. +// If key maps to a type that is not a list, +// an error is logged, and this is a no-op. +// +// TODO: is this function needed/useful? +func Append(key string, value interface{}) { + if !enabled { + return + } + bufferMu.Lock() + if bufferItemCount >= maxBufferItems { + bufferMu.Unlock() + return + } + // TODO: Test this... + bufVal, inBuffer := buffer[key] + sliceVal, sliceOk := bufVal.([]interface{}) + if inBuffer && !sliceOk { + bufferMu.Unlock() + log.Printf("[PANIC] Diagnostics: key %s already used for non-slice value", key) + return + } + if sliceVal == nil { + buffer[key] = []interface{}{value} + } else if sliceOk { + buffer[key] = append(sliceVal, value) + } + bufferItemCount++ + bufferMu.Unlock() +} + +// AppendUniqueString adds value to a set named key. +// Set items are unordered. Values in the set +// are unique, but repeat values are counted. +// +// If key is new, a new set will be created. +// If key maps to a type that is not a string +// set, an error is logged, and this is a no-op. +func AppendUniqueString(key, value string) { + if !enabled { + return + } + bufferMu.Lock() + if bufferItemCount >= maxBufferItems { + bufferMu.Unlock() + return + } + bufVal, inBuffer := buffer[key] + mapVal, mapOk := bufVal.(map[string]int) + if inBuffer && !mapOk { + bufferMu.Unlock() + log.Printf("[PANIC] Diagnostics: key %s already used for non-map value", key) + return + } + if mapVal == nil { + buffer[key] = map[string]int{value: 1} + bufferItemCount++ + } else if mapOk { + mapVal[value]++ + } + bufferMu.Unlock() +} + +// AppendUniqueInt adds value to a set named key. +// Set items are unordered. Values in the set +// are unique, but repeat values are counted. +// +// If key is new, a new set will be created. +// If key maps to a type that is not an integer +// set, an error is logged, and this is a no-op. +func AppendUniqueInt(key string, value int) { + if !enabled { + return + } + bufferMu.Lock() + if bufferItemCount >= maxBufferItems { + bufferMu.Unlock() + return + } + bufVal, inBuffer := buffer[key] + mapVal, mapOk := bufVal.(map[int]int) + if inBuffer && !mapOk { + bufferMu.Unlock() + log.Printf("[PANIC] Diagnostics: key %s already used for non-map value", key) + return + } + if mapVal == nil { + buffer[key] = map[int]int{value: 1} + bufferItemCount++ + } else if mapOk { + mapVal[value]++ + } + bufferMu.Unlock() +} + +// Increment adds 1 to a value named key. +// If it does not exist, it is created with +// a value of 1. If key maps to a type that +// is not an integer, an error is logged, +// and this is a no-op. +func Increment(key string) { + incrementOrDecrement(key, true) +} + +// Decrement is the same as increment except +// it subtracts 1. +func Decrement(key string) { + incrementOrDecrement(key, false) +} + +// inc == true: increment +// inc == false: decrement +func incrementOrDecrement(key string, inc bool) { + if !enabled { + return + } + bufferMu.Lock() + bufVal, inBuffer := buffer[key] + intVal, intOk := bufVal.(int) + if inBuffer && !intOk { + bufferMu.Unlock() + log.Printf("[PANIC] Diagnostics: key %s already used for non-integer value", key) + return + } + if !inBuffer { + if bufferItemCount >= maxBufferItems { + bufferMu.Unlock() + return + } + bufferItemCount++ + } + if inc { + buffer[key] = intVal + 1 + } else { + buffer[key] = intVal - 1 + } + bufferMu.Unlock() +} diff --git a/diagnostics/diagnostics.go b/diagnostics/diagnostics.go index a1d050d4..dd4cac87 100644 --- a/diagnostics/diagnostics.go +++ b/diagnostics/diagnostics.go @@ -12,14 +12,252 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package diagnostics implements the client for server-side diagnostics +// of the network. Functions in this package are synchronous and blocking +// unless otherwise specified. For convenience, most functions here do +// not return errors, but errors are logged to the standard logger. +// +// To use this package, first call Init(). You can then call any of the +// collection/aggregation functions. Call StartEmitting() when you are +// ready to begin sending diagnostic updates. +// +// When collecting metrics (functions like Set, Append*, or Increment), +// it may be desirable and even recommended to run invoke them in a new +// goroutine (use the go keyword) in case there is lock contention; +// they are thread-safe (unless noted), and you may not want them to +// block the main thread of execution. However, sometimes blocking +// may be necessary too; for example, adding startup metrics to the +// buffer before the call to StartEmitting(). package diagnostics import ( + "bytes" + "encoding/json" + "fmt" + "log" + "net/http" + "strings" + "sync" + "time" + "github.com/google/uuid" ) -func Init(uuid uuid.UUID) { - instanceUUID = uuid +// logEmit calls emit and then logs the error, if any. +func logEmit(final bool) { + err := emit(final) + if err != nil { + log.Printf("[ERROR] Sending diganostics: %v", err) + } } +// emit sends an update to the diagnostics server. +// If final is true, no future updates will be scheduled. +// Otherwise, the next update will be scheduled. +func emit(final bool) error { + if !enabled { + return fmt.Errorf("diagnostics not enabled") + } + + // ensure only one update happens at a time; + // skip update if previous one still in progress + updateMu.Lock() + if updating { + updateMu.Unlock() + log.Println("[NOTICE] Skipping this diagnostics update because previous one is still working") + return nil + } + updating = true + updateMu.Unlock() + defer func() { + updateMu.Lock() + updating = false + updateMu.Unlock() + }() + + // terminate any pending update if this is the last one + if final { + updateTimerMu.Lock() + updateTimer.Stop() + updateTimer = nil + updateTimerMu.Unlock() + } + + payloadBytes, err := makePayloadAndResetBuffer() + if err != nil { + return err + } + + // this will hold the server's reply + var reply Response + + // transmit the payload - use a loop to retry in case of failure + for i := 0; i < 4; i++ { + if i > 0 && err != nil { + // don't hammer the server; first failure might have been + // a fluke, but back off more after that + log.Printf("[WARNING] Sending diagnostics (attempt %d): %v - waiting and retrying", i, err) + time.Sleep(time.Duration(i*i*i) * time.Second) + } + + // send it + var resp *http.Response + resp, err = httpClient.Post(endpoint+instanceUUID.String(), "application/json", bytes.NewReader(payloadBytes)) + if err != nil { + continue + } + + // ensure we can read the response + if ct := resp.Header.Get("Content-Type"); (resp.StatusCode < 300 || resp.StatusCode >= 400) && + !strings.Contains(ct, "json") { + err = fmt.Errorf("diagnostics server replied with unknown content-type: %s", ct) + resp.Body.Close() + continue + } + + // read the response body + err = json.NewDecoder(resp.Body).Decode(&reply) + resp.Body.Close() // close response body as soon as we're done with it + if err != nil { + continue + } + + // ensure we won't slam the diagnostics server + if reply.NextUpdate < 1*time.Second { + reply.NextUpdate = defaultUpdateInterval + } + + // make sure we didn't send the update too soon; if so, + // just wait and try again -- this is a special case of + // error that we handle differently, as you can see + if resp.StatusCode == http.StatusTooManyRequests { + log.Printf("[NOTICE] Sending diagnostics: we were too early; waiting %s before trying again", reply.NextUpdate) + time.Sleep(reply.NextUpdate) + continue + } else if resp.StatusCode >= 400 { + err = fmt.Errorf("diagnostics server returned status code %d", resp.StatusCode) + continue + } + + break + } + if err == nil { + // (remember, if there was an error, we return it + // below, so it will get logged if it's supposed to) + log.Println("[INFO] Sending diagnostics: success") + } + + // even if there was an error after retrying, we should + // schedule the next update using our default update + // interval because the server might be healthy later + + // schedule the next update (if this wasn't the last one and + // if the remote server didn't tell us to stop sending) + if !final && !reply.Stop { + updateTimerMu.Lock() + updateTimer = time.AfterFunc(reply.NextUpdate, func() { + logEmit(false) + }) + updateTimerMu.Unlock() + } + + return err +} + +// makePayloadAndResetBuffer prepares a payload +// by emptying the collection buffer. It returns +// the bytes of the payload to send to the server. +// Since the buffer is reset by this, if the +// resulting byte slice is lost, the payload is +// gone with it. +func makePayloadAndResetBuffer() ([]byte, error) { + // make a local pointer to the buffer, then reset + // the buffer to an empty map to clear it out + bufferMu.Lock() + bufCopy := buffer + buffer = make(map[string]interface{}) + bufferItemCount = 0 + bufferMu.Unlock() + + // encode payload in preparation for transmission + payload := Payload{ + InstanceID: instanceUUID.String(), + Timestamp: time.Now().UTC(), + Data: bufCopy, + } + return json.Marshal(payload) +} + +// Response contains the body of a response from the +// diagnostics server. +type Response struct { + // NextUpdate is how long to wait before the next update. + NextUpdate time.Duration `json:"next_update"` + + // Stop instructs the diagnostics server to stop sending + // diagnostics. This would only be done under extenuating + // circumstances, but we are prepared for it nonetheless. + Stop bool `json:"stop,omitempty"` + + // Error will be populated with an error message, if any. + // This field should be empty if the status code is < 400. + Error string `json:"error,omitempty"` +} + +// Payload is the data that gets sent to the diagnostics server. +type Payload struct { + // The universally unique ID of the instance + InstanceID string `json:"instance_id"` + + // The UTC timestamp of the transmission + Timestamp time.Time `json:"timestamp"` + + // The metrics + Data map[string]interface{} `json:"data,omitempty"` +} + +// httpClient should be used for HTTP requests. It +// is configured with a timeout for reliability. +var httpClient = http.Client{Timeout: 1 * time.Minute} + +// buffer holds the data that we are building up to send. +var buffer = make(map[string]interface{}) +var bufferItemCount = 0 +var bufferMu sync.RWMutex // protects both the buffer and its count + +// updating is used to ensure only one +// update happens at a time. +var updating bool +var updateMu sync.Mutex + +// updateTimer fires off the next update. +// If no update is scheduled, this is nil. +var updateTimer *time.Timer +var updateTimerMu sync.Mutex + +// instanceUUID is the ID of the current instance. +// This MUST be set to emit diagnostics. var instanceUUID uuid.UUID + +// enabled indicates whether the package has +// been initialized and can be actively used. +var enabled bool + +const ( + // endpoint is the base URL to remote diagnostics server; + // the instance ID will be appended to it. + endpoint = "http://localhost:8081/update/" + + // defaultUpdateInterval is how long to wait before emitting + // more diagnostic data. This value is only used if the + // client receives a nonsensical value, or doesn't send one + // at all, indicating a likely problem with the server. Thus, + // this value should be a long duration to help alleviate + // extra load on the server. + defaultUpdateInterval = 1 * time.Hour + + // maxBufferItems is the maximum number of items we'll allow + // in the buffer before we start dropping new ones, in a + // rough (simple) attempt to keep memory use under control. + maxBufferItems = 100000 +) diff --git a/vendor/github.com/klauspost/cpuid/LICENSE b/vendor/github.com/klauspost/cpuid/LICENSE new file mode 100644 index 00000000..5cec7ee9 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Klaus Post + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/vendor/github.com/klauspost/cpuid/cpuid.go b/vendor/github.com/klauspost/cpuid/cpuid.go new file mode 100644 index 00000000..44e50946 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/cpuid.go @@ -0,0 +1,1030 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// Package cpuid provides information about the CPU running the current program. +// +// CPU features are detected on startup, and kept for fast access through the life of the application. +// Currently x86 / x64 (AMD64) is supported. +// +// You can access the CPU information by accessing the shared CPU variable of the cpuid library. +// +// Package home: https://github.com/klauspost/cpuid +package cpuid + +import "strings" + +// Vendor is a representation of a CPU vendor. +type Vendor int + +const ( + Other Vendor = iota + Intel + AMD + VIA + Transmeta + NSC + KVM // Kernel-based Virtual Machine + MSVM // Microsoft Hyper-V or Windows Virtual PC + VMware + XenHVM +) + +const ( + CMOV = 1 << iota // i686 CMOV + NX // NX (No-Execute) bit + AMD3DNOW // AMD 3DNOW + AMD3DNOWEXT // AMD 3DNowExt + MMX // standard MMX + MMXEXT // SSE integer functions or AMD MMX ext + SSE // SSE functions + SSE2 // P4 SSE functions + SSE3 // Prescott SSE3 functions + SSSE3 // Conroe SSSE3 functions + SSE4 // Penryn SSE4.1 functions + SSE4A // AMD Barcelona microarchitecture SSE4a instructions + SSE42 // Nehalem SSE4.2 functions + AVX // AVX functions + AVX2 // AVX2 functions + FMA3 // Intel FMA 3 + FMA4 // Bulldozer FMA4 functions + XOP // Bulldozer XOP functions + F16C // Half-precision floating-point conversion + BMI1 // Bit Manipulation Instruction Set 1 + BMI2 // Bit Manipulation Instruction Set 2 + TBM // AMD Trailing Bit Manipulation + LZCNT // LZCNT instruction + POPCNT // POPCNT instruction + AESNI // Advanced Encryption Standard New Instructions + CLMUL // Carry-less Multiplication + HTT // Hyperthreading (enabled) + HLE // Hardware Lock Elision + RTM // Restricted Transactional Memory + RDRAND // RDRAND instruction is available + RDSEED // RDSEED instruction is available + ADX // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) + SHA // Intel SHA Extensions + AVX512F // AVX-512 Foundation + AVX512DQ // AVX-512 Doubleword and Quadword Instructions + AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions + AVX512PF // AVX-512 Prefetch Instructions + AVX512ER // AVX-512 Exponential and Reciprocal Instructions + AVX512CD // AVX-512 Conflict Detection Instructions + AVX512BW // AVX-512 Byte and Word Instructions + AVX512VL // AVX-512 Vector Length Extensions + AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions + MPX // Intel MPX (Memory Protection Extensions) + ERMS // Enhanced REP MOVSB/STOSB + RDTSCP // RDTSCP Instruction + CX16 // CMPXCHG16B Instruction + SGX // Software Guard Extensions + + // Performance indicators + SSE2SLOW // SSE2 is supported, but usually not faster + SSE3SLOW // SSE3 is supported, but usually not faster + ATOM // Atom processor, some SSSE3 instructions are slower +) + +var flagNames = map[Flags]string{ + CMOV: "CMOV", // i686 CMOV + NX: "NX", // NX (No-Execute) bit + AMD3DNOW: "AMD3DNOW", // AMD 3DNOW + AMD3DNOWEXT: "AMD3DNOWEXT", // AMD 3DNowExt + MMX: "MMX", // Standard MMX + MMXEXT: "MMXEXT", // SSE integer functions or AMD MMX ext + SSE: "SSE", // SSE functions + SSE2: "SSE2", // P4 SSE2 functions + SSE3: "SSE3", // Prescott SSE3 functions + SSSE3: "SSSE3", // Conroe SSSE3 functions + SSE4: "SSE4.1", // Penryn SSE4.1 functions + SSE4A: "SSE4A", // AMD Barcelona microarchitecture SSE4a instructions + SSE42: "SSE4.2", // Nehalem SSE4.2 functions + AVX: "AVX", // AVX functions + AVX2: "AVX2", // AVX functions + FMA3: "FMA3", // Intel FMA 3 + FMA4: "FMA4", // Bulldozer FMA4 functions + XOP: "XOP", // Bulldozer XOP functions + F16C: "F16C", // Half-precision floating-point conversion + BMI1: "BMI1", // Bit Manipulation Instruction Set 1 + BMI2: "BMI2", // Bit Manipulation Instruction Set 2 + TBM: "TBM", // AMD Trailing Bit Manipulation + LZCNT: "LZCNT", // LZCNT instruction + POPCNT: "POPCNT", // POPCNT instruction + AESNI: "AESNI", // Advanced Encryption Standard New Instructions + CLMUL: "CLMUL", // Carry-less Multiplication + HTT: "HTT", // Hyperthreading (enabled) + HLE: "HLE", // Hardware Lock Elision + RTM: "RTM", // Restricted Transactional Memory + RDRAND: "RDRAND", // RDRAND instruction is available + RDSEED: "RDSEED", // RDSEED instruction is available + ADX: "ADX", // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) + SHA: "SHA", // Intel SHA Extensions + AVX512F: "AVX512F", // AVX-512 Foundation + AVX512DQ: "AVX512DQ", // AVX-512 Doubleword and Quadword Instructions + AVX512IFMA: "AVX512IFMA", // AVX-512 Integer Fused Multiply-Add Instructions + AVX512PF: "AVX512PF", // AVX-512 Prefetch Instructions + AVX512ER: "AVX512ER", // AVX-512 Exponential and Reciprocal Instructions + AVX512CD: "AVX512CD", // AVX-512 Conflict Detection Instructions + AVX512BW: "AVX512BW", // AVX-512 Byte and Word Instructions + AVX512VL: "AVX512VL", // AVX-512 Vector Length Extensions + AVX512VBMI: "AVX512VBMI", // AVX-512 Vector Bit Manipulation Instructions + MPX: "MPX", // Intel MPX (Memory Protection Extensions) + ERMS: "ERMS", // Enhanced REP MOVSB/STOSB + RDTSCP: "RDTSCP", // RDTSCP Instruction + CX16: "CX16", // CMPXCHG16B Instruction + SGX: "SGX", // Software Guard Extensions + + // Performance indicators + SSE2SLOW: "SSE2SLOW", // SSE2 supported, but usually not faster + SSE3SLOW: "SSE3SLOW", // SSE3 supported, but usually not faster + ATOM: "ATOM", // Atom processor, some SSSE3 instructions are slower + +} + +// CPUInfo contains information about the detected system CPU. +type CPUInfo struct { + BrandName string // Brand name reported by the CPU + VendorID Vendor // Comparable CPU vendor ID + Features Flags // Features of the CPU + PhysicalCores int // Number of physical processor cores in your CPU. Will be 0 if undetectable. + ThreadsPerCore int // Number of threads per physical core. Will be 1 if undetectable. + LogicalCores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable. + Family int // CPU family number + Model int // CPU model number + CacheLine int // Cache line size in bytes. Will be 0 if undetectable. + Cache struct { + L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected + L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected + L2 int // L2 Cache (per core or shared). Will be -1 if undetected + L3 int // L3 Instruction Cache (per core or shared). Will be -1 if undetected + } + SGX SGXSupport + maxFunc uint32 + maxExFunc uint32 +} + +var cpuid func(op uint32) (eax, ebx, ecx, edx uint32) +var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32) +var xgetbv func(index uint32) (eax, edx uint32) +var rdtscpAsm func() (eax, ebx, ecx, edx uint32) + +// CPU contains information about the CPU as detected on startup, +// or when Detect last was called. +// +// Use this as the primary entry point to you data, +// this way queries are +var CPU CPUInfo + +func init() { + initCPU() + Detect() +} + +// Detect will re-detect current CPU info. +// This will replace the content of the exported CPU variable. +// +// Unless you expect the CPU to change while you are running your program +// you should not need to call this function. +// If you call this, you must ensure that no other goroutine is accessing the +// exported CPU variable. +func Detect() { + CPU.maxFunc = maxFunctionID() + CPU.maxExFunc = maxExtendedFunction() + CPU.BrandName = brandName() + CPU.CacheLine = cacheLine() + CPU.Family, CPU.Model = familyModel() + CPU.Features = support() + CPU.SGX = hasSGX(CPU.Features&SGX != 0) + CPU.ThreadsPerCore = threadsPerCore() + CPU.LogicalCores = logicalCores() + CPU.PhysicalCores = physicalCores() + CPU.VendorID = vendorID() + CPU.cacheSize() +} + +// Generated here: http://play.golang.org/p/BxFH2Gdc0G + +// Cmov indicates support of CMOV instructions +func (c CPUInfo) Cmov() bool { + return c.Features&CMOV != 0 +} + +// Amd3dnow indicates support of AMD 3DNOW! instructions +func (c CPUInfo) Amd3dnow() bool { + return c.Features&AMD3DNOW != 0 +} + +// Amd3dnowExt indicates support of AMD 3DNOW! Extended instructions +func (c CPUInfo) Amd3dnowExt() bool { + return c.Features&AMD3DNOWEXT != 0 +} + +// MMX indicates support of MMX instructions +func (c CPUInfo) MMX() bool { + return c.Features&MMX != 0 +} + +// MMXExt indicates support of MMXEXT instructions +// (SSE integer functions or AMD MMX ext) +func (c CPUInfo) MMXExt() bool { + return c.Features&MMXEXT != 0 +} + +// SSE indicates support of SSE instructions +func (c CPUInfo) SSE() bool { + return c.Features&SSE != 0 +} + +// SSE2 indicates support of SSE 2 instructions +func (c CPUInfo) SSE2() bool { + return c.Features&SSE2 != 0 +} + +// SSE3 indicates support of SSE 3 instructions +func (c CPUInfo) SSE3() bool { + return c.Features&SSE3 != 0 +} + +// SSSE3 indicates support of SSSE 3 instructions +func (c CPUInfo) SSSE3() bool { + return c.Features&SSSE3 != 0 +} + +// SSE4 indicates support of SSE 4 (also called SSE 4.1) instructions +func (c CPUInfo) SSE4() bool { + return c.Features&SSE4 != 0 +} + +// SSE42 indicates support of SSE4.2 instructions +func (c CPUInfo) SSE42() bool { + return c.Features&SSE42 != 0 +} + +// AVX indicates support of AVX instructions +// and operating system support of AVX instructions +func (c CPUInfo) AVX() bool { + return c.Features&AVX != 0 +} + +// AVX2 indicates support of AVX2 instructions +func (c CPUInfo) AVX2() bool { + return c.Features&AVX2 != 0 +} + +// FMA3 indicates support of FMA3 instructions +func (c CPUInfo) FMA3() bool { + return c.Features&FMA3 != 0 +} + +// FMA4 indicates support of FMA4 instructions +func (c CPUInfo) FMA4() bool { + return c.Features&FMA4 != 0 +} + +// XOP indicates support of XOP instructions +func (c CPUInfo) XOP() bool { + return c.Features&XOP != 0 +} + +// F16C indicates support of F16C instructions +func (c CPUInfo) F16C() bool { + return c.Features&F16C != 0 +} + +// BMI1 indicates support of BMI1 instructions +func (c CPUInfo) BMI1() bool { + return c.Features&BMI1 != 0 +} + +// BMI2 indicates support of BMI2 instructions +func (c CPUInfo) BMI2() bool { + return c.Features&BMI2 != 0 +} + +// TBM indicates support of TBM instructions +// (AMD Trailing Bit Manipulation) +func (c CPUInfo) TBM() bool { + return c.Features&TBM != 0 +} + +// Lzcnt indicates support of LZCNT instruction +func (c CPUInfo) Lzcnt() bool { + return c.Features&LZCNT != 0 +} + +// Popcnt indicates support of POPCNT instruction +func (c CPUInfo) Popcnt() bool { + return c.Features&POPCNT != 0 +} + +// HTT indicates the processor has Hyperthreading enabled +func (c CPUInfo) HTT() bool { + return c.Features&HTT != 0 +} + +// SSE2Slow indicates that SSE2 may be slow on this processor +func (c CPUInfo) SSE2Slow() bool { + return c.Features&SSE2SLOW != 0 +} + +// SSE3Slow indicates that SSE3 may be slow on this processor +func (c CPUInfo) SSE3Slow() bool { + return c.Features&SSE3SLOW != 0 +} + +// AesNi indicates support of AES-NI instructions +// (Advanced Encryption Standard New Instructions) +func (c CPUInfo) AesNi() bool { + return c.Features&AESNI != 0 +} + +// Clmul indicates support of CLMUL instructions +// (Carry-less Multiplication) +func (c CPUInfo) Clmul() bool { + return c.Features&CLMUL != 0 +} + +// NX indicates support of NX (No-Execute) bit +func (c CPUInfo) NX() bool { + return c.Features&NX != 0 +} + +// SSE4A indicates support of AMD Barcelona microarchitecture SSE4a instructions +func (c CPUInfo) SSE4A() bool { + return c.Features&SSE4A != 0 +} + +// HLE indicates support of Hardware Lock Elision +func (c CPUInfo) HLE() bool { + return c.Features&HLE != 0 +} + +// RTM indicates support of Restricted Transactional Memory +func (c CPUInfo) RTM() bool { + return c.Features&RTM != 0 +} + +// Rdrand indicates support of RDRAND instruction is available +func (c CPUInfo) Rdrand() bool { + return c.Features&RDRAND != 0 +} + +// Rdseed indicates support of RDSEED instruction is available +func (c CPUInfo) Rdseed() bool { + return c.Features&RDSEED != 0 +} + +// ADX indicates support of Intel ADX (Multi-Precision Add-Carry Instruction Extensions) +func (c CPUInfo) ADX() bool { + return c.Features&ADX != 0 +} + +// SHA indicates support of Intel SHA Extensions +func (c CPUInfo) SHA() bool { + return c.Features&SHA != 0 +} + +// AVX512F indicates support of AVX-512 Foundation +func (c CPUInfo) AVX512F() bool { + return c.Features&AVX512F != 0 +} + +// AVX512DQ indicates support of AVX-512 Doubleword and Quadword Instructions +func (c CPUInfo) AVX512DQ() bool { + return c.Features&AVX512DQ != 0 +} + +// AVX512IFMA indicates support of AVX-512 Integer Fused Multiply-Add Instructions +func (c CPUInfo) AVX512IFMA() bool { + return c.Features&AVX512IFMA != 0 +} + +// AVX512PF indicates support of AVX-512 Prefetch Instructions +func (c CPUInfo) AVX512PF() bool { + return c.Features&AVX512PF != 0 +} + +// AVX512ER indicates support of AVX-512 Exponential and Reciprocal Instructions +func (c CPUInfo) AVX512ER() bool { + return c.Features&AVX512ER != 0 +} + +// AVX512CD indicates support of AVX-512 Conflict Detection Instructions +func (c CPUInfo) AVX512CD() bool { + return c.Features&AVX512CD != 0 +} + +// AVX512BW indicates support of AVX-512 Byte and Word Instructions +func (c CPUInfo) AVX512BW() bool { + return c.Features&AVX512BW != 0 +} + +// AVX512VL indicates support of AVX-512 Vector Length Extensions +func (c CPUInfo) AVX512VL() bool { + return c.Features&AVX512VL != 0 +} + +// AVX512VBMI indicates support of AVX-512 Vector Bit Manipulation Instructions +func (c CPUInfo) AVX512VBMI() bool { + return c.Features&AVX512VBMI != 0 +} + +// MPX indicates support of Intel MPX (Memory Protection Extensions) +func (c CPUInfo) MPX() bool { + return c.Features&MPX != 0 +} + +// ERMS indicates support of Enhanced REP MOVSB/STOSB +func (c CPUInfo) ERMS() bool { + return c.Features&ERMS != 0 +} + +// RDTSCP Instruction is available. +func (c CPUInfo) RDTSCP() bool { + return c.Features&RDTSCP != 0 +} + +// CX16 indicates if CMPXCHG16B instruction is available. +func (c CPUInfo) CX16() bool { + return c.Features&CX16 != 0 +} + +// TSX is split into HLE (Hardware Lock Elision) and RTM (Restricted Transactional Memory) detection. +// So TSX simply checks that. +func (c CPUInfo) TSX() bool { + return c.Features&(MPX|RTM) == MPX|RTM +} + +// Atom indicates an Atom processor +func (c CPUInfo) Atom() bool { + return c.Features&ATOM != 0 +} + +// Intel returns true if vendor is recognized as Intel +func (c CPUInfo) Intel() bool { + return c.VendorID == Intel +} + +// AMD returns true if vendor is recognized as AMD +func (c CPUInfo) AMD() bool { + return c.VendorID == AMD +} + +// Transmeta returns true if vendor is recognized as Transmeta +func (c CPUInfo) Transmeta() bool { + return c.VendorID == Transmeta +} + +// NSC returns true if vendor is recognized as National Semiconductor +func (c CPUInfo) NSC() bool { + return c.VendorID == NSC +} + +// VIA returns true if vendor is recognized as VIA +func (c CPUInfo) VIA() bool { + return c.VendorID == VIA +} + +// RTCounter returns the 64-bit time-stamp counter +// Uses the RDTSCP instruction. The value 0 is returned +// if the CPU does not support the instruction. +func (c CPUInfo) RTCounter() uint64 { + if !c.RDTSCP() { + return 0 + } + a, _, _, d := rdtscpAsm() + return uint64(a) | (uint64(d) << 32) +} + +// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP. +// This variable is OS dependent, but on Linux contains information +// about the current cpu/core the code is running on. +// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned. +func (c CPUInfo) Ia32TscAux() uint32 { + if !c.RDTSCP() { + return 0 + } + _, _, ecx, _ := rdtscpAsm() + return ecx +} + +// LogicalCPU will return the Logical CPU the code is currently executing on. +// This is likely to change when the OS re-schedules the running thread +// to another CPU. +// If the current core cannot be detected, -1 will be returned. +func (c CPUInfo) LogicalCPU() int { + if c.maxFunc < 1 { + return -1 + } + _, ebx, _, _ := cpuid(1) + return int(ebx >> 24) +} + +// VM Will return true if the cpu id indicates we are in +// a virtual machine. This is only a hint, and will very likely +// have many false negatives. +func (c CPUInfo) VM() bool { + switch c.VendorID { + case MSVM, KVM, VMware, XenHVM: + return true + } + return false +} + +// Flags contains detected cpu features and caracteristics +type Flags uint64 + +// String returns a string representation of the detected +// CPU features. +func (f Flags) String() string { + return strings.Join(f.Strings(), ",") +} + +// Strings returns and array of the detected features. +func (f Flags) Strings() []string { + s := support() + r := make([]string, 0, 20) + for i := uint(0); i < 64; i++ { + key := Flags(1 << i) + val := flagNames[key] + if s&key != 0 { + r = append(r, val) + } + } + return r +} + +func maxExtendedFunction() uint32 { + eax, _, _, _ := cpuid(0x80000000) + return eax +} + +func maxFunctionID() uint32 { + a, _, _, _ := cpuid(0) + return a +} + +func brandName() string { + if maxExtendedFunction() >= 0x80000004 { + v := make([]uint32, 0, 48) + for i := uint32(0); i < 3; i++ { + a, b, c, d := cpuid(0x80000002 + i) + v = append(v, a, b, c, d) + } + return strings.Trim(string(valAsString(v...)), " ") + } + return "unknown" +} + +func threadsPerCore() int { + mfi := maxFunctionID() + if mfi < 0x4 || vendorID() != Intel { + return 1 + } + + if mfi < 0xb { + _, b, _, d := cpuid(1) + if (d & (1 << 28)) != 0 { + // v will contain logical core count + v := (b >> 16) & 255 + if v > 1 { + a4, _, _, _ := cpuid(4) + // physical cores + v2 := (a4 >> 26) + 1 + if v2 > 0 { + return int(v) / int(v2) + } + } + } + return 1 + } + _, b, _, _ := cpuidex(0xb, 0) + if b&0xffff == 0 { + return 1 + } + return int(b & 0xffff) +} + +func logicalCores() int { + mfi := maxFunctionID() + switch vendorID() { + case Intel: + // Use this on old Intel processors + if mfi < 0xb { + if mfi < 1 { + return 0 + } + // CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID) + // that can be assigned to logical processors in a physical package. + // The value may not be the same as the number of logical processors that are present in the hardware of a physical package. + _, ebx, _, _ := cpuid(1) + logical := (ebx >> 16) & 0xff + return int(logical) + } + _, b, _, _ := cpuidex(0xb, 1) + return int(b & 0xffff) + case AMD: + _, b, _, _ := cpuid(1) + return int((b >> 16) & 0xff) + default: + return 0 + } +} + +func familyModel() (int, int) { + if maxFunctionID() < 0x1 { + return 0, 0 + } + eax, _, _, _ := cpuid(1) + family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff) + model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0) + return int(family), int(model) +} + +func physicalCores() int { + switch vendorID() { + case Intel: + return logicalCores() / threadsPerCore() + case AMD: + if maxExtendedFunction() >= 0x80000008 { + _, _, c, _ := cpuid(0x80000008) + return int(c&0xff) + 1 + } + } + return 0 +} + +// Except from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID +var vendorMapping = map[string]Vendor{ + "AMDisbetter!": AMD, + "AuthenticAMD": AMD, + "CentaurHauls": VIA, + "GenuineIntel": Intel, + "TransmetaCPU": Transmeta, + "GenuineTMx86": Transmeta, + "Geode by NSC": NSC, + "VIA VIA VIA ": VIA, + "KVMKVMKVMKVM": KVM, + "Microsoft Hv": MSVM, + "VMwareVMware": VMware, + "XenVMMXenVMM": XenHVM, +} + +func vendorID() Vendor { + _, b, c, d := cpuid(0) + v := valAsString(b, d, c) + vend, ok := vendorMapping[string(v)] + if !ok { + return Other + } + return vend +} + +func cacheLine() int { + if maxFunctionID() < 0x1 { + return 0 + } + + _, ebx, _, _ := cpuid(1) + cache := (ebx & 0xff00) >> 5 // cflush size + if cache == 0 && maxExtendedFunction() >= 0x80000006 { + _, _, ecx, _ := cpuid(0x80000006) + cache = ecx & 0xff // cacheline size + } + // TODO: Read from Cache and TLB Information + return int(cache) +} + +func (c *CPUInfo) cacheSize() { + c.Cache.L1D = -1 + c.Cache.L1I = -1 + c.Cache.L2 = -1 + c.Cache.L3 = -1 + vendor := vendorID() + switch vendor { + case Intel: + if maxFunctionID() < 4 { + return + } + for i := uint32(0); ; i++ { + eax, ebx, ecx, _ := cpuidex(4, i) + cacheType := eax & 15 + if cacheType == 0 { + break + } + cacheLevel := (eax >> 5) & 7 + coherency := int(ebx&0xfff) + 1 + partitions := int((ebx>>12)&0x3ff) + 1 + associativity := int((ebx>>22)&0x3ff) + 1 + sets := int(ecx) + 1 + size := associativity * partitions * coherency * sets + switch cacheLevel { + case 1: + if cacheType == 1 { + // 1 = Data Cache + c.Cache.L1D = size + } else if cacheType == 2 { + // 2 = Instruction Cache + c.Cache.L1I = size + } else { + if c.Cache.L1D < 0 { + c.Cache.L1I = size + } + if c.Cache.L1I < 0 { + c.Cache.L1I = size + } + } + case 2: + c.Cache.L2 = size + case 3: + c.Cache.L3 = size + } + } + case AMD: + // Untested. + if maxExtendedFunction() < 0x80000005 { + return + } + _, _, ecx, edx := cpuid(0x80000005) + c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024) + c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024) + + if maxExtendedFunction() < 0x80000006 { + return + } + _, _, ecx, _ = cpuid(0x80000006) + c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024) + } + + return +} + +type SGXSupport struct { + Available bool + SGX1Supported bool + SGX2Supported bool + MaxEnclaveSizeNot64 int64 + MaxEnclaveSize64 int64 +} + +func hasSGX(available bool) (rval SGXSupport) { + rval.Available = available + + if !available { + return + } + + a, _, _, d := cpuidex(0x12, 0) + rval.SGX1Supported = a&0x01 != 0 + rval.SGX2Supported = a&0x02 != 0 + rval.MaxEnclaveSizeNot64 = 1 << (d & 0xFF) // pow 2 + rval.MaxEnclaveSize64 = 1 << ((d >> 8) & 0xFF) // pow 2 + + return +} + +func support() Flags { + mfi := maxFunctionID() + vend := vendorID() + if mfi < 0x1 { + return 0 + } + rval := uint64(0) + _, _, c, d := cpuid(1) + if (d & (1 << 15)) != 0 { + rval |= CMOV + } + if (d & (1 << 23)) != 0 { + rval |= MMX + } + if (d & (1 << 25)) != 0 { + rval |= MMXEXT + } + if (d & (1 << 25)) != 0 { + rval |= SSE + } + if (d & (1 << 26)) != 0 { + rval |= SSE2 + } + if (c & 1) != 0 { + rval |= SSE3 + } + if (c & 0x00000200) != 0 { + rval |= SSSE3 + } + if (c & 0x00080000) != 0 { + rval |= SSE4 + } + if (c & 0x00100000) != 0 { + rval |= SSE42 + } + if (c & (1 << 25)) != 0 { + rval |= AESNI + } + if (c & (1 << 1)) != 0 { + rval |= CLMUL + } + if c&(1<<23) != 0 { + rval |= POPCNT + } + if c&(1<<30) != 0 { + rval |= RDRAND + } + if c&(1<<29) != 0 { + rval |= F16C + } + if c&(1<<13) != 0 { + rval |= CX16 + } + if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 { + if threadsPerCore() > 1 { + rval |= HTT + } + } + + // Check XGETBV, OXSAVE and AVX bits + if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 { + // Check for OS support + eax, _ := xgetbv(0) + if (eax & 0x6) == 0x6 { + rval |= AVX + if (c & 0x00001000) != 0 { + rval |= FMA3 + } + } + } + + // Check AVX2, AVX2 requires OS support, but BMI1/2 don't. + if mfi >= 7 { + _, ebx, ecx, _ := cpuidex(7, 0) + if (rval&AVX) != 0 && (ebx&0x00000020) != 0 { + rval |= AVX2 + } + if (ebx & 0x00000008) != 0 { + rval |= BMI1 + if (ebx & 0x00000100) != 0 { + rval |= BMI2 + } + } + if ebx&(1<<2) != 0 { + rval |= SGX + } + if ebx&(1<<4) != 0 { + rval |= HLE + } + if ebx&(1<<9) != 0 { + rval |= ERMS + } + if ebx&(1<<11) != 0 { + rval |= RTM + } + if ebx&(1<<14) != 0 { + rval |= MPX + } + if ebx&(1<<18) != 0 { + rval |= RDSEED + } + if ebx&(1<<19) != 0 { + rval |= ADX + } + if ebx&(1<<29) != 0 { + rval |= SHA + } + + // Only detect AVX-512 features if XGETBV is supported + if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { + // Check for OS support + eax, _ := xgetbv(0) + + // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and + // ZMM16-ZMM31 state are enabled by OS) + /// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS). + if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 { + if ebx&(1<<16) != 0 { + rval |= AVX512F + } + if ebx&(1<<17) != 0 { + rval |= AVX512DQ + } + if ebx&(1<<21) != 0 { + rval |= AVX512IFMA + } + if ebx&(1<<26) != 0 { + rval |= AVX512PF + } + if ebx&(1<<27) != 0 { + rval |= AVX512ER + } + if ebx&(1<<28) != 0 { + rval |= AVX512CD + } + if ebx&(1<<30) != 0 { + rval |= AVX512BW + } + if ebx&(1<<31) != 0 { + rval |= AVX512VL + } + // ecx + if ecx&(1<<1) != 0 { + rval |= AVX512VBMI + } + } + } + } + + if maxExtendedFunction() >= 0x80000001 { + _, _, c, d := cpuid(0x80000001) + if (c & (1 << 5)) != 0 { + rval |= LZCNT + rval |= POPCNT + } + if (d & (1 << 31)) != 0 { + rval |= AMD3DNOW + } + if (d & (1 << 30)) != 0 { + rval |= AMD3DNOWEXT + } + if (d & (1 << 23)) != 0 { + rval |= MMX + } + if (d & (1 << 22)) != 0 { + rval |= MMXEXT + } + if (c & (1 << 6)) != 0 { + rval |= SSE4A + } + if d&(1<<20) != 0 { + rval |= NX + } + if d&(1<<27) != 0 { + rval |= RDTSCP + } + + /* Allow for selectively disabling SSE2 functions on AMD processors + with SSE2 support but not SSE4a. This includes Athlon64, some + Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster + than SSE2 often enough to utilize this special-case flag. + AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case + so that SSE2 is used unless explicitly disabled by checking + AV_CPU_FLAG_SSE2SLOW. */ + if vendorID() != Intel && + rval&SSE2 != 0 && (c&0x00000040) == 0 { + rval |= SSE2SLOW + } + + /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be + * used unless the OS has AVX support. */ + if (rval & AVX) != 0 { + if (c & 0x00000800) != 0 { + rval |= XOP + } + if (c & 0x00010000) != 0 { + rval |= FMA4 + } + } + + if vendorID() == Intel { + family, model := familyModel() + if family == 6 && (model == 9 || model == 13 || model == 14) { + /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and + * 6/14 (core1 "yonah") theoretically support sse2, but it's + * usually slower than mmx. */ + if (rval & SSE2) != 0 { + rval |= SSE2SLOW + } + if (rval & SSE3) != 0 { + rval |= SSE3SLOW + } + } + /* The Atom processor has SSSE3 support, which is useful in many cases, + * but sometimes the SSSE3 version is slower than the SSE2 equivalent + * on the Atom, but is generally faster on other processors supporting + * SSSE3. This flag allows for selectively disabling certain SSSE3 + * functions on the Atom. */ + if family == 6 && model == 28 { + rval |= ATOM + } + } + } + return Flags(rval) +} + +func valAsString(values ...uint32) []byte { + r := make([]byte, 4*len(values)) + for i, v := range values { + dst := r[i*4:] + dst[0] = byte(v & 0xff) + dst[1] = byte((v >> 8) & 0xff) + dst[2] = byte((v >> 16) & 0xff) + dst[3] = byte((v >> 24) & 0xff) + switch { + case dst[0] == 0: + return r[:i*4] + case dst[1] == 0: + return r[:i*4+1] + case dst[2] == 0: + return r[:i*4+2] + case dst[3] == 0: + return r[:i*4+3] + } + } + return r +} diff --git a/vendor/github.com/klauspost/cpuid/cpuid_386.s b/vendor/github.com/klauspost/cpuid/cpuid_386.s new file mode 100644 index 00000000..4d731711 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/cpuid_386.s @@ -0,0 +1,42 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// +build 386,!gccgo + +// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuid(SB), 7, $0 + XORL CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+4(FP) + MOVL BX, ebx+8(FP) + MOVL CX, ecx+12(FP) + MOVL DX, edx+16(FP) + RET + +// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func xgetbv(index uint32) (eax, edx uint32) +TEXT ·asmXgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+4(FP) + MOVL DX, edx+8(FP) + RET + +// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) +TEXT ·asmRdtscpAsm(SB), 7, $0 + BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP + MOVL AX, eax+0(FP) + MOVL BX, ebx+4(FP) + MOVL CX, ecx+8(FP) + MOVL DX, edx+12(FP) + RET diff --git a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s b/vendor/github.com/klauspost/cpuid/cpuid_amd64.s new file mode 100644 index 00000000..3c1d60e4 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/cpuid_amd64.s @@ -0,0 +1,42 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +//+build amd64,!gccgo + +// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuid(SB), 7, $0 + XORQ CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func asmXgetbv(index uint32) (eax, edx uint32) +TEXT ·asmXgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+8(FP) + MOVL DX, edx+12(FP) + RET + +// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) +TEXT ·asmRdtscpAsm(SB), 7, $0 + BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP + MOVL AX, eax+0(FP) + MOVL BX, ebx+4(FP) + MOVL CX, ecx+8(FP) + MOVL DX, edx+12(FP) + RET diff --git a/vendor/github.com/klauspost/cpuid/detect_intel.go b/vendor/github.com/klauspost/cpuid/detect_intel.go new file mode 100644 index 00000000..a5f04dd6 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/detect_intel.go @@ -0,0 +1,17 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// +build 386,!gccgo amd64,!gccgo + +package cpuid + +func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +func asmXgetbv(index uint32) (eax, edx uint32) +func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) + +func initCPU() { + cpuid = asmCpuid + cpuidex = asmCpuidex + xgetbv = asmXgetbv + rdtscpAsm = asmRdtscpAsm +} diff --git a/vendor/github.com/klauspost/cpuid/detect_ref.go b/vendor/github.com/klauspost/cpuid/detect_ref.go new file mode 100644 index 00000000..909c5d9a --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/detect_ref.go @@ -0,0 +1,23 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// +build !amd64,!386 gccgo + +package cpuid + +func initCPU() { + cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 + } + + cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 + } + + xgetbv = func(index uint32) (eax, edx uint32) { + return 0, 0 + } + + rdtscpAsm = func() (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 + } +} diff --git a/vendor/github.com/klauspost/cpuid/generate.go b/vendor/github.com/klauspost/cpuid/generate.go new file mode 100644 index 00000000..90e7a98d --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/generate.go @@ -0,0 +1,4 @@ +package cpuid + +//go:generate go run private-gen.go +//go:generate gofmt -w ./private diff --git a/vendor/github.com/klauspost/cpuid/private-gen.go b/vendor/github.com/klauspost/cpuid/private-gen.go new file mode 100644 index 00000000..437333d2 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/private-gen.go @@ -0,0 +1,476 @@ +// +build ignore + +package main + +import ( + "bytes" + "fmt" + "go/ast" + "go/parser" + "go/printer" + "go/token" + "io" + "io/ioutil" + "log" + "os" + "reflect" + "strings" + "unicode" + "unicode/utf8" +) + +var inFiles = []string{"cpuid.go", "cpuid_test.go"} +var copyFiles = []string{"cpuid_amd64.s", "cpuid_386.s", "detect_ref.go", "detect_intel.go"} +var fileSet = token.NewFileSet() +var reWrites = []rewrite{ + initRewrite("CPUInfo -> cpuInfo"), + initRewrite("Vendor -> vendor"), + initRewrite("Flags -> flags"), + initRewrite("Detect -> detect"), + initRewrite("CPU -> cpu"), +} +var excludeNames = map[string]bool{"string": true, "join": true, "trim": true, + // cpuid_test.go + "t": true, "println": true, "logf": true, "log": true, "fatalf": true, "fatal": true, +} + +var excludePrefixes = []string{"test", "benchmark"} + +func main() { + Package := "private" + parserMode := parser.ParseComments + exported := make(map[string]rewrite) + for _, file := range inFiles { + in, err := os.Open(file) + if err != nil { + log.Fatalf("opening input", err) + } + + src, err := ioutil.ReadAll(in) + if err != nil { + log.Fatalf("reading input", err) + } + + astfile, err := parser.ParseFile(fileSet, file, src, parserMode) + if err != nil { + log.Fatalf("parsing input", err) + } + + for _, rw := range reWrites { + astfile = rw(astfile) + } + + // Inspect the AST and print all identifiers and literals. + var startDecl token.Pos + var endDecl token.Pos + ast.Inspect(astfile, func(n ast.Node) bool { + var s string + switch x := n.(type) { + case *ast.Ident: + if x.IsExported() { + t := strings.ToLower(x.Name) + for _, pre := range excludePrefixes { + if strings.HasPrefix(t, pre) { + return true + } + } + if excludeNames[t] != true { + //if x.Pos() > startDecl && x.Pos() < endDecl { + exported[x.Name] = initRewrite(x.Name + " -> " + t) + } + } + + case *ast.GenDecl: + if x.Tok == token.CONST && x.Lparen > 0 { + startDecl = x.Lparen + endDecl = x.Rparen + // fmt.Printf("Decl:%s -> %s\n", fileSet.Position(startDecl), fileSet.Position(endDecl)) + } + } + if s != "" { + fmt.Printf("%s:\t%s\n", fileSet.Position(n.Pos()), s) + } + return true + }) + + for _, rw := range exported { + astfile = rw(astfile) + } + + var buf bytes.Buffer + + printer.Fprint(&buf, fileSet, astfile) + + // Remove package documentation and insert information + s := buf.String() + ind := strings.Index(buf.String(), "\npackage cpuid") + s = s[ind:] + s = "// Generated, DO NOT EDIT,\n" + + "// but copy it to your own project and rename the package.\n" + + "// See more at http://github.com/klauspost/cpuid\n" + + s + + outputName := Package + string(os.PathSeparator) + file + + err = ioutil.WriteFile(outputName, []byte(s), 0644) + if err != nil { + log.Fatalf("writing output: %s", err) + } + log.Println("Generated", outputName) + } + + for _, file := range copyFiles { + dst := "" + if strings.HasPrefix(file, "cpuid") { + dst = Package + string(os.PathSeparator) + file + } else { + dst = Package + string(os.PathSeparator) + "cpuid_" + file + } + err := copyFile(file, dst) + if err != nil { + log.Fatalf("copying file: %s", err) + } + log.Println("Copied", dst) + } +} + +// CopyFile copies a file from src to dst. If src and dst files exist, and are +// the same, then return success. Copy the file contents from src to dst. +func copyFile(src, dst string) (err error) { + sfi, err := os.Stat(src) + if err != nil { + return + } + if !sfi.Mode().IsRegular() { + // cannot copy non-regular files (e.g., directories, + // symlinks, devices, etc.) + return fmt.Errorf("CopyFile: non-regular source file %s (%q)", sfi.Name(), sfi.Mode().String()) + } + dfi, err := os.Stat(dst) + if err != nil { + if !os.IsNotExist(err) { + return + } + } else { + if !(dfi.Mode().IsRegular()) { + return fmt.Errorf("CopyFile: non-regular destination file %s (%q)", dfi.Name(), dfi.Mode().String()) + } + if os.SameFile(sfi, dfi) { + return + } + } + err = copyFileContents(src, dst) + return +} + +// copyFileContents copies the contents of the file named src to the file named +// by dst. The file will be created if it does not already exist. If the +// destination file exists, all it's contents will be replaced by the contents +// of the source file. +func copyFileContents(src, dst string) (err error) { + in, err := os.Open(src) + if err != nil { + return + } + defer in.Close() + out, err := os.Create(dst) + if err != nil { + return + } + defer func() { + cerr := out.Close() + if err == nil { + err = cerr + } + }() + if _, err = io.Copy(out, in); err != nil { + return + } + err = out.Sync() + return +} + +type rewrite func(*ast.File) *ast.File + +// Mostly copied from gofmt +func initRewrite(rewriteRule string) rewrite { + f := strings.Split(rewriteRule, "->") + if len(f) != 2 { + fmt.Fprintf(os.Stderr, "rewrite rule must be of the form 'pattern -> replacement'\n") + os.Exit(2) + } + pattern := parseExpr(f[0], "pattern") + replace := parseExpr(f[1], "replacement") + return func(p *ast.File) *ast.File { return rewriteFile(pattern, replace, p) } +} + +// parseExpr parses s as an expression. +// It might make sense to expand this to allow statement patterns, +// but there are problems with preserving formatting and also +// with what a wildcard for a statement looks like. +func parseExpr(s, what string) ast.Expr { + x, err := parser.ParseExpr(s) + if err != nil { + fmt.Fprintf(os.Stderr, "parsing %s %s at %s\n", what, s, err) + os.Exit(2) + } + return x +} + +// Keep this function for debugging. +/* +func dump(msg string, val reflect.Value) { + fmt.Printf("%s:\n", msg) + ast.Print(fileSet, val.Interface()) + fmt.Println() +} +*/ + +// rewriteFile applies the rewrite rule 'pattern -> replace' to an entire file. +func rewriteFile(pattern, replace ast.Expr, p *ast.File) *ast.File { + cmap := ast.NewCommentMap(fileSet, p, p.Comments) + m := make(map[string]reflect.Value) + pat := reflect.ValueOf(pattern) + repl := reflect.ValueOf(replace) + + var rewriteVal func(val reflect.Value) reflect.Value + rewriteVal = func(val reflect.Value) reflect.Value { + // don't bother if val is invalid to start with + if !val.IsValid() { + return reflect.Value{} + } + for k := range m { + delete(m, k) + } + val = apply(rewriteVal, val) + if match(m, pat, val) { + val = subst(m, repl, reflect.ValueOf(val.Interface().(ast.Node).Pos())) + } + return val + } + + r := apply(rewriteVal, reflect.ValueOf(p)).Interface().(*ast.File) + r.Comments = cmap.Filter(r).Comments() // recreate comments list + return r +} + +// set is a wrapper for x.Set(y); it protects the caller from panics if x cannot be changed to y. +func set(x, y reflect.Value) { + // don't bother if x cannot be set or y is invalid + if !x.CanSet() || !y.IsValid() { + return + } + defer func() { + if x := recover(); x != nil { + if s, ok := x.(string); ok && + (strings.Contains(s, "type mismatch") || strings.Contains(s, "not assignable")) { + // x cannot be set to y - ignore this rewrite + return + } + panic(x) + } + }() + x.Set(y) +} + +// Values/types for special cases. +var ( + objectPtrNil = reflect.ValueOf((*ast.Object)(nil)) + scopePtrNil = reflect.ValueOf((*ast.Scope)(nil)) + + identType = reflect.TypeOf((*ast.Ident)(nil)) + objectPtrType = reflect.TypeOf((*ast.Object)(nil)) + positionType = reflect.TypeOf(token.NoPos) + callExprType = reflect.TypeOf((*ast.CallExpr)(nil)) + scopePtrType = reflect.TypeOf((*ast.Scope)(nil)) +) + +// apply replaces each AST field x in val with f(x), returning val. +// To avoid extra conversions, f operates on the reflect.Value form. +func apply(f func(reflect.Value) reflect.Value, val reflect.Value) reflect.Value { + if !val.IsValid() { + return reflect.Value{} + } + + // *ast.Objects introduce cycles and are likely incorrect after + // rewrite; don't follow them but replace with nil instead + if val.Type() == objectPtrType { + return objectPtrNil + } + + // similarly for scopes: they are likely incorrect after a rewrite; + // replace them with nil + if val.Type() == scopePtrType { + return scopePtrNil + } + + switch v := reflect.Indirect(val); v.Kind() { + case reflect.Slice: + for i := 0; i < v.Len(); i++ { + e := v.Index(i) + set(e, f(e)) + } + case reflect.Struct: + for i := 0; i < v.NumField(); i++ { + e := v.Field(i) + set(e, f(e)) + } + case reflect.Interface: + e := v.Elem() + set(v, f(e)) + } + return val +} + +func isWildcard(s string) bool { + rune, size := utf8.DecodeRuneInString(s) + return size == len(s) && unicode.IsLower(rune) +} + +// match returns true if pattern matches val, +// recording wildcard submatches in m. +// If m == nil, match checks whether pattern == val. +func match(m map[string]reflect.Value, pattern, val reflect.Value) bool { + // Wildcard matches any expression. If it appears multiple + // times in the pattern, it must match the same expression + // each time. + if m != nil && pattern.IsValid() && pattern.Type() == identType { + name := pattern.Interface().(*ast.Ident).Name + if isWildcard(name) && val.IsValid() { + // wildcards only match valid (non-nil) expressions. + if _, ok := val.Interface().(ast.Expr); ok && !val.IsNil() { + if old, ok := m[name]; ok { + return match(nil, old, val) + } + m[name] = val + return true + } + } + } + + // Otherwise, pattern and val must match recursively. + if !pattern.IsValid() || !val.IsValid() { + return !pattern.IsValid() && !val.IsValid() + } + if pattern.Type() != val.Type() { + return false + } + + // Special cases. + switch pattern.Type() { + case identType: + // For identifiers, only the names need to match + // (and none of the other *ast.Object information). + // This is a common case, handle it all here instead + // of recursing down any further via reflection. + p := pattern.Interface().(*ast.Ident) + v := val.Interface().(*ast.Ident) + return p == nil && v == nil || p != nil && v != nil && p.Name == v.Name + case objectPtrType, positionType: + // object pointers and token positions always match + return true + case callExprType: + // For calls, the Ellipsis fields (token.Position) must + // match since that is how f(x) and f(x...) are different. + // Check them here but fall through for the remaining fields. + p := pattern.Interface().(*ast.CallExpr) + v := val.Interface().(*ast.CallExpr) + if p.Ellipsis.IsValid() != v.Ellipsis.IsValid() { + return false + } + } + + p := reflect.Indirect(pattern) + v := reflect.Indirect(val) + if !p.IsValid() || !v.IsValid() { + return !p.IsValid() && !v.IsValid() + } + + switch p.Kind() { + case reflect.Slice: + if p.Len() != v.Len() { + return false + } + for i := 0; i < p.Len(); i++ { + if !match(m, p.Index(i), v.Index(i)) { + return false + } + } + return true + + case reflect.Struct: + for i := 0; i < p.NumField(); i++ { + if !match(m, p.Field(i), v.Field(i)) { + return false + } + } + return true + + case reflect.Interface: + return match(m, p.Elem(), v.Elem()) + } + + // Handle token integers, etc. + return p.Interface() == v.Interface() +} + +// subst returns a copy of pattern with values from m substituted in place +// of wildcards and pos used as the position of tokens from the pattern. +// if m == nil, subst returns a copy of pattern and doesn't change the line +// number information. +func subst(m map[string]reflect.Value, pattern reflect.Value, pos reflect.Value) reflect.Value { + if !pattern.IsValid() { + return reflect.Value{} + } + + // Wildcard gets replaced with map value. + if m != nil && pattern.Type() == identType { + name := pattern.Interface().(*ast.Ident).Name + if isWildcard(name) { + if old, ok := m[name]; ok { + return subst(nil, old, reflect.Value{}) + } + } + } + + if pos.IsValid() && pattern.Type() == positionType { + // use new position only if old position was valid in the first place + if old := pattern.Interface().(token.Pos); !old.IsValid() { + return pattern + } + return pos + } + + // Otherwise copy. + switch p := pattern; p.Kind() { + case reflect.Slice: + v := reflect.MakeSlice(p.Type(), p.Len(), p.Len()) + for i := 0; i < p.Len(); i++ { + v.Index(i).Set(subst(m, p.Index(i), pos)) + } + return v + + case reflect.Struct: + v := reflect.New(p.Type()).Elem() + for i := 0; i < p.NumField(); i++ { + v.Field(i).Set(subst(m, p.Field(i), pos)) + } + return v + + case reflect.Ptr: + v := reflect.New(p.Type()).Elem() + if elem := p.Elem(); elem.IsValid() { + v.Set(subst(m, elem, pos).Addr()) + } + return v + + case reflect.Interface: + v := reflect.New(p.Type()).Elem() + if elem := p.Elem(); elem.IsValid() { + v.Set(subst(m, elem, pos)) + } + return v + } + + return pattern +} diff --git a/vendor/github.com/klauspost/cpuid/private/cpuid.go b/vendor/github.com/klauspost/cpuid/private/cpuid.go new file mode 100644 index 00000000..21712142 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/private/cpuid.go @@ -0,0 +1,1024 @@ +// Generated, DO NOT EDIT, +// but copy it to your own project and rename the package. +// See more at http://github.com/klauspost/cpuid + +package cpuid + +import "strings" + +// Vendor is a representation of a CPU vendor. +type vendor int + +const ( + other vendor = iota + intel + amd + via + transmeta + nsc + kvm // Kernel-based Virtual Machine + msvm // Microsoft Hyper-V or Windows Virtual PC + vmware + xenhvm +) + +const ( + cmov = 1 << iota // i686 CMOV + nx // NX (No-Execute) bit + amd3dnow // AMD 3DNOW + amd3dnowext // AMD 3DNowExt + mmx // standard MMX + mmxext // SSE integer functions or AMD MMX ext + sse // SSE functions + sse2 // P4 SSE functions + sse3 // Prescott SSE3 functions + ssse3 // Conroe SSSE3 functions + sse4 // Penryn SSE4.1 functions + sse4a // AMD Barcelona microarchitecture SSE4a instructions + sse42 // Nehalem SSE4.2 functions + avx // AVX functions + avx2 // AVX2 functions + fma3 // Intel FMA 3 + fma4 // Bulldozer FMA4 functions + xop // Bulldozer XOP functions + f16c // Half-precision floating-point conversion + bmi1 // Bit Manipulation Instruction Set 1 + bmi2 // Bit Manipulation Instruction Set 2 + tbm // AMD Trailing Bit Manipulation + lzcnt // LZCNT instruction + popcnt // POPCNT instruction + aesni // Advanced Encryption Standard New Instructions + clmul // Carry-less Multiplication + htt // Hyperthreading (enabled) + hle // Hardware Lock Elision + rtm // Restricted Transactional Memory + rdrand // RDRAND instruction is available + rdseed // RDSEED instruction is available + adx // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) + sha // Intel SHA Extensions + avx512f // AVX-512 Foundation + avx512dq // AVX-512 Doubleword and Quadword Instructions + avx512ifma // AVX-512 Integer Fused Multiply-Add Instructions + avx512pf // AVX-512 Prefetch Instructions + avx512er // AVX-512 Exponential and Reciprocal Instructions + avx512cd // AVX-512 Conflict Detection Instructions + avx512bw // AVX-512 Byte and Word Instructions + avx512vl // AVX-512 Vector Length Extensions + avx512vbmi // AVX-512 Vector Bit Manipulation Instructions + mpx // Intel MPX (Memory Protection Extensions) + erms // Enhanced REP MOVSB/STOSB + rdtscp // RDTSCP Instruction + cx16 // CMPXCHG16B Instruction + sgx // Software Guard Extensions + + // Performance indicators + sse2slow // SSE2 is supported, but usually not faster + sse3slow // SSE3 is supported, but usually not faster + atom // Atom processor, some SSSE3 instructions are slower +) + +var flagNames = map[flags]string{ + cmov: "CMOV", // i686 CMOV + nx: "NX", // NX (No-Execute) bit + amd3dnow: "AMD3DNOW", // AMD 3DNOW + amd3dnowext: "AMD3DNOWEXT", // AMD 3DNowExt + mmx: "MMX", // Standard MMX + mmxext: "MMXEXT", // SSE integer functions or AMD MMX ext + sse: "SSE", // SSE functions + sse2: "SSE2", // P4 SSE2 functions + sse3: "SSE3", // Prescott SSE3 functions + ssse3: "SSSE3", // Conroe SSSE3 functions + sse4: "SSE4.1", // Penryn SSE4.1 functions + sse4a: "SSE4A", // AMD Barcelona microarchitecture SSE4a instructions + sse42: "SSE4.2", // Nehalem SSE4.2 functions + avx: "AVX", // AVX functions + avx2: "AVX2", // AVX functions + fma3: "FMA3", // Intel FMA 3 + fma4: "FMA4", // Bulldozer FMA4 functions + xop: "XOP", // Bulldozer XOP functions + f16c: "F16C", // Half-precision floating-point conversion + bmi1: "BMI1", // Bit Manipulation Instruction Set 1 + bmi2: "BMI2", // Bit Manipulation Instruction Set 2 + tbm: "TBM", // AMD Trailing Bit Manipulation + lzcnt: "LZCNT", // LZCNT instruction + popcnt: "POPCNT", // POPCNT instruction + aesni: "AESNI", // Advanced Encryption Standard New Instructions + clmul: "CLMUL", // Carry-less Multiplication + htt: "HTT", // Hyperthreading (enabled) + hle: "HLE", // Hardware Lock Elision + rtm: "RTM", // Restricted Transactional Memory + rdrand: "RDRAND", // RDRAND instruction is available + rdseed: "RDSEED", // RDSEED instruction is available + adx: "ADX", // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) + sha: "SHA", // Intel SHA Extensions + avx512f: "AVX512F", // AVX-512 Foundation + avx512dq: "AVX512DQ", // AVX-512 Doubleword and Quadword Instructions + avx512ifma: "AVX512IFMA", // AVX-512 Integer Fused Multiply-Add Instructions + avx512pf: "AVX512PF", // AVX-512 Prefetch Instructions + avx512er: "AVX512ER", // AVX-512 Exponential and Reciprocal Instructions + avx512cd: "AVX512CD", // AVX-512 Conflict Detection Instructions + avx512bw: "AVX512BW", // AVX-512 Byte and Word Instructions + avx512vl: "AVX512VL", // AVX-512 Vector Length Extensions + avx512vbmi: "AVX512VBMI", // AVX-512 Vector Bit Manipulation Instructions + mpx: "MPX", // Intel MPX (Memory Protection Extensions) + erms: "ERMS", // Enhanced REP MOVSB/STOSB + rdtscp: "RDTSCP", // RDTSCP Instruction + cx16: "CX16", // CMPXCHG16B Instruction + sgx: "SGX", // Software Guard Extensions + + // Performance indicators + sse2slow: "SSE2SLOW", // SSE2 supported, but usually not faster + sse3slow: "SSE3SLOW", // SSE3 supported, but usually not faster + atom: "ATOM", // Atom processor, some SSSE3 instructions are slower + +} + +// CPUInfo contains information about the detected system CPU. +type cpuInfo struct { + brandname string // Brand name reported by the CPU + vendorid vendor // Comparable CPU vendor ID + features flags // Features of the CPU + physicalcores int // Number of physical processor cores in your CPU. Will be 0 if undetectable. + threadspercore int // Number of threads per physical core. Will be 1 if undetectable. + logicalcores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable. + family int // CPU family number + model int // CPU model number + cacheline int // Cache line size in bytes. Will be 0 if undetectable. + cache struct { + l1i int // L1 Instruction Cache (per core or shared). Will be -1 if undetected + l1d int // L1 Data Cache (per core or shared). Will be -1 if undetected + l2 int // L2 Cache (per core or shared). Will be -1 if undetected + l3 int // L3 Instruction Cache (per core or shared). Will be -1 if undetected + } + sgx sgxsupport + maxFunc uint32 + maxExFunc uint32 +} + +var cpuid func(op uint32) (eax, ebx, ecx, edx uint32) +var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32) +var xgetbv func(index uint32) (eax, edx uint32) +var rdtscpAsm func() (eax, ebx, ecx, edx uint32) + +// CPU contains information about the CPU as detected on startup, +// or when Detect last was called. +// +// Use this as the primary entry point to you data, +// this way queries are +var cpu cpuInfo + +func init() { + initCPU() + detect() +} + +// Detect will re-detect current CPU info. +// This will replace the content of the exported CPU variable. +// +// Unless you expect the CPU to change while you are running your program +// you should not need to call this function. +// If you call this, you must ensure that no other goroutine is accessing the +// exported CPU variable. +func detect() { + cpu.maxFunc = maxFunctionID() + cpu.maxExFunc = maxExtendedFunction() + cpu.brandname = brandName() + cpu.cacheline = cacheLine() + cpu.family, cpu.model = familyModel() + cpu.features = support() + cpu.sgx = hasSGX(cpu.features&sgx != 0) + cpu.threadspercore = threadsPerCore() + cpu.logicalcores = logicalCores() + cpu.physicalcores = physicalCores() + cpu.vendorid = vendorID() + cpu.cacheSize() +} + +// Generated here: http://play.golang.org/p/BxFH2Gdc0G + +// Cmov indicates support of CMOV instructions +func (c cpuInfo) cmov() bool { + return c.features&cmov != 0 +} + +// Amd3dnow indicates support of AMD 3DNOW! instructions +func (c cpuInfo) amd3dnow() bool { + return c.features&amd3dnow != 0 +} + +// Amd3dnowExt indicates support of AMD 3DNOW! Extended instructions +func (c cpuInfo) amd3dnowext() bool { + return c.features&amd3dnowext != 0 +} + +// MMX indicates support of MMX instructions +func (c cpuInfo) mmx() bool { + return c.features&mmx != 0 +} + +// MMXExt indicates support of MMXEXT instructions +// (SSE integer functions or AMD MMX ext) +func (c cpuInfo) mmxext() bool { + return c.features&mmxext != 0 +} + +// SSE indicates support of SSE instructions +func (c cpuInfo) sse() bool { + return c.features&sse != 0 +} + +// SSE2 indicates support of SSE 2 instructions +func (c cpuInfo) sse2() bool { + return c.features&sse2 != 0 +} + +// SSE3 indicates support of SSE 3 instructions +func (c cpuInfo) sse3() bool { + return c.features&sse3 != 0 +} + +// SSSE3 indicates support of SSSE 3 instructions +func (c cpuInfo) ssse3() bool { + return c.features&ssse3 != 0 +} + +// SSE4 indicates support of SSE 4 (also called SSE 4.1) instructions +func (c cpuInfo) sse4() bool { + return c.features&sse4 != 0 +} + +// SSE42 indicates support of SSE4.2 instructions +func (c cpuInfo) sse42() bool { + return c.features&sse42 != 0 +} + +// AVX indicates support of AVX instructions +// and operating system support of AVX instructions +func (c cpuInfo) avx() bool { + return c.features&avx != 0 +} + +// AVX2 indicates support of AVX2 instructions +func (c cpuInfo) avx2() bool { + return c.features&avx2 != 0 +} + +// FMA3 indicates support of FMA3 instructions +func (c cpuInfo) fma3() bool { + return c.features&fma3 != 0 +} + +// FMA4 indicates support of FMA4 instructions +func (c cpuInfo) fma4() bool { + return c.features&fma4 != 0 +} + +// XOP indicates support of XOP instructions +func (c cpuInfo) xop() bool { + return c.features&xop != 0 +} + +// F16C indicates support of F16C instructions +func (c cpuInfo) f16c() bool { + return c.features&f16c != 0 +} + +// BMI1 indicates support of BMI1 instructions +func (c cpuInfo) bmi1() bool { + return c.features&bmi1 != 0 +} + +// BMI2 indicates support of BMI2 instructions +func (c cpuInfo) bmi2() bool { + return c.features&bmi2 != 0 +} + +// TBM indicates support of TBM instructions +// (AMD Trailing Bit Manipulation) +func (c cpuInfo) tbm() bool { + return c.features&tbm != 0 +} + +// Lzcnt indicates support of LZCNT instruction +func (c cpuInfo) lzcnt() bool { + return c.features&lzcnt != 0 +} + +// Popcnt indicates support of POPCNT instruction +func (c cpuInfo) popcnt() bool { + return c.features&popcnt != 0 +} + +// HTT indicates the processor has Hyperthreading enabled +func (c cpuInfo) htt() bool { + return c.features&htt != 0 +} + +// SSE2Slow indicates that SSE2 may be slow on this processor +func (c cpuInfo) sse2slow() bool { + return c.features&sse2slow != 0 +} + +// SSE3Slow indicates that SSE3 may be slow on this processor +func (c cpuInfo) sse3slow() bool { + return c.features&sse3slow != 0 +} + +// AesNi indicates support of AES-NI instructions +// (Advanced Encryption Standard New Instructions) +func (c cpuInfo) aesni() bool { + return c.features&aesni != 0 +} + +// Clmul indicates support of CLMUL instructions +// (Carry-less Multiplication) +func (c cpuInfo) clmul() bool { + return c.features&clmul != 0 +} + +// NX indicates support of NX (No-Execute) bit +func (c cpuInfo) nx() bool { + return c.features&nx != 0 +} + +// SSE4A indicates support of AMD Barcelona microarchitecture SSE4a instructions +func (c cpuInfo) sse4a() bool { + return c.features&sse4a != 0 +} + +// HLE indicates support of Hardware Lock Elision +func (c cpuInfo) hle() bool { + return c.features&hle != 0 +} + +// RTM indicates support of Restricted Transactional Memory +func (c cpuInfo) rtm() bool { + return c.features&rtm != 0 +} + +// Rdrand indicates support of RDRAND instruction is available +func (c cpuInfo) rdrand() bool { + return c.features&rdrand != 0 +} + +// Rdseed indicates support of RDSEED instruction is available +func (c cpuInfo) rdseed() bool { + return c.features&rdseed != 0 +} + +// ADX indicates support of Intel ADX (Multi-Precision Add-Carry Instruction Extensions) +func (c cpuInfo) adx() bool { + return c.features&adx != 0 +} + +// SHA indicates support of Intel SHA Extensions +func (c cpuInfo) sha() bool { + return c.features&sha != 0 +} + +// AVX512F indicates support of AVX-512 Foundation +func (c cpuInfo) avx512f() bool { + return c.features&avx512f != 0 +} + +// AVX512DQ indicates support of AVX-512 Doubleword and Quadword Instructions +func (c cpuInfo) avx512dq() bool { + return c.features&avx512dq != 0 +} + +// AVX512IFMA indicates support of AVX-512 Integer Fused Multiply-Add Instructions +func (c cpuInfo) avx512ifma() bool { + return c.features&avx512ifma != 0 +} + +// AVX512PF indicates support of AVX-512 Prefetch Instructions +func (c cpuInfo) avx512pf() bool { + return c.features&avx512pf != 0 +} + +// AVX512ER indicates support of AVX-512 Exponential and Reciprocal Instructions +func (c cpuInfo) avx512er() bool { + return c.features&avx512er != 0 +} + +// AVX512CD indicates support of AVX-512 Conflict Detection Instructions +func (c cpuInfo) avx512cd() bool { + return c.features&avx512cd != 0 +} + +// AVX512BW indicates support of AVX-512 Byte and Word Instructions +func (c cpuInfo) avx512bw() bool { + return c.features&avx512bw != 0 +} + +// AVX512VL indicates support of AVX-512 Vector Length Extensions +func (c cpuInfo) avx512vl() bool { + return c.features&avx512vl != 0 +} + +// AVX512VBMI indicates support of AVX-512 Vector Bit Manipulation Instructions +func (c cpuInfo) avx512vbmi() bool { + return c.features&avx512vbmi != 0 +} + +// MPX indicates support of Intel MPX (Memory Protection Extensions) +func (c cpuInfo) mpx() bool { + return c.features&mpx != 0 +} + +// ERMS indicates support of Enhanced REP MOVSB/STOSB +func (c cpuInfo) erms() bool { + return c.features&erms != 0 +} + +// RDTSCP Instruction is available. +func (c cpuInfo) rdtscp() bool { + return c.features&rdtscp != 0 +} + +// CX16 indicates if CMPXCHG16B instruction is available. +func (c cpuInfo) cx16() bool { + return c.features&cx16 != 0 +} + +// TSX is split into HLE (Hardware Lock Elision) and RTM (Restricted Transactional Memory) detection. +// So TSX simply checks that. +func (c cpuInfo) tsx() bool { + return c.features&(mpx|rtm) == mpx|rtm +} + +// Atom indicates an Atom processor +func (c cpuInfo) atom() bool { + return c.features&atom != 0 +} + +// Intel returns true if vendor is recognized as Intel +func (c cpuInfo) intel() bool { + return c.vendorid == intel +} + +// AMD returns true if vendor is recognized as AMD +func (c cpuInfo) amd() bool { + return c.vendorid == amd +} + +// Transmeta returns true if vendor is recognized as Transmeta +func (c cpuInfo) transmeta() bool { + return c.vendorid == transmeta +} + +// NSC returns true if vendor is recognized as National Semiconductor +func (c cpuInfo) nsc() bool { + return c.vendorid == nsc +} + +// VIA returns true if vendor is recognized as VIA +func (c cpuInfo) via() bool { + return c.vendorid == via +} + +// RTCounter returns the 64-bit time-stamp counter +// Uses the RDTSCP instruction. The value 0 is returned +// if the CPU does not support the instruction. +func (c cpuInfo) rtcounter() uint64 { + if !c.rdtscp() { + return 0 + } + a, _, _, d := rdtscpAsm() + return uint64(a) | (uint64(d) << 32) +} + +// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP. +// This variable is OS dependent, but on Linux contains information +// about the current cpu/core the code is running on. +// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned. +func (c cpuInfo) ia32tscaux() uint32 { + if !c.rdtscp() { + return 0 + } + _, _, ecx, _ := rdtscpAsm() + return ecx +} + +// LogicalCPU will return the Logical CPU the code is currently executing on. +// This is likely to change when the OS re-schedules the running thread +// to another CPU. +// If the current core cannot be detected, -1 will be returned. +func (c cpuInfo) logicalcpu() int { + if c.maxFunc < 1 { + return -1 + } + _, ebx, _, _ := cpuid(1) + return int(ebx >> 24) +} + +// VM Will return true if the cpu id indicates we are in +// a virtual machine. This is only a hint, and will very likely +// have many false negatives. +func (c cpuInfo) vm() bool { + switch c.vendorid { + case msvm, kvm, vmware, xenhvm: + return true + } + return false +} + +// Flags contains detected cpu features and caracteristics +type flags uint64 + +// String returns a string representation of the detected +// CPU features. +func (f flags) String() string { + return strings.Join(f.strings(), ",") +} + +// Strings returns and array of the detected features. +func (f flags) strings() []string { + s := support() + r := make([]string, 0, 20) + for i := uint(0); i < 64; i++ { + key := flags(1 << i) + val := flagNames[key] + if s&key != 0 { + r = append(r, val) + } + } + return r +} + +func maxExtendedFunction() uint32 { + eax, _, _, _ := cpuid(0x80000000) + return eax +} + +func maxFunctionID() uint32 { + a, _, _, _ := cpuid(0) + return a +} + +func brandName() string { + if maxExtendedFunction() >= 0x80000004 { + v := make([]uint32, 0, 48) + for i := uint32(0); i < 3; i++ { + a, b, c, d := cpuid(0x80000002 + i) + v = append(v, a, b, c, d) + } + return strings.Trim(string(valAsString(v...)), " ") + } + return "unknown" +} + +func threadsPerCore() int { + mfi := maxFunctionID() + if mfi < 0x4 || vendorID() != intel { + return 1 + } + + if mfi < 0xb { + _, b, _, d := cpuid(1) + if (d & (1 << 28)) != 0 { + // v will contain logical core count + v := (b >> 16) & 255 + if v > 1 { + a4, _, _, _ := cpuid(4) + // physical cores + v2 := (a4 >> 26) + 1 + if v2 > 0 { + return int(v) / int(v2) + } + } + } + return 1 + } + _, b, _, _ := cpuidex(0xb, 0) + if b&0xffff == 0 { + return 1 + } + return int(b & 0xffff) +} + +func logicalCores() int { + mfi := maxFunctionID() + switch vendorID() { + case intel: + // Use this on old Intel processors + if mfi < 0xb { + if mfi < 1 { + return 0 + } + // CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID) + // that can be assigned to logical processors in a physical package. + // The value may not be the same as the number of logical processors that are present in the hardware of a physical package. + _, ebx, _, _ := cpuid(1) + logical := (ebx >> 16) & 0xff + return int(logical) + } + _, b, _, _ := cpuidex(0xb, 1) + return int(b & 0xffff) + case amd: + _, b, _, _ := cpuid(1) + return int((b >> 16) & 0xff) + default: + return 0 + } +} + +func familyModel() (int, int) { + if maxFunctionID() < 0x1 { + return 0, 0 + } + eax, _, _, _ := cpuid(1) + family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff) + model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0) + return int(family), int(model) +} + +func physicalCores() int { + switch vendorID() { + case intel: + return logicalCores() / threadsPerCore() + case amd: + if maxExtendedFunction() >= 0x80000008 { + _, _, c, _ := cpuid(0x80000008) + return int(c&0xff) + 1 + } + } + return 0 +} + +// Except from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID +var vendorMapping = map[string]vendor{ + "AMDisbetter!": amd, + "AuthenticAMD": amd, + "CentaurHauls": via, + "GenuineIntel": intel, + "TransmetaCPU": transmeta, + "GenuineTMx86": transmeta, + "Geode by NSC": nsc, + "VIA VIA VIA ": via, + "KVMKVMKVMKVM": kvm, + "Microsoft Hv": msvm, + "VMwareVMware": vmware, + "XenVMMXenVMM": xenhvm, +} + +func vendorID() vendor { + _, b, c, d := cpuid(0) + v := valAsString(b, d, c) + vend, ok := vendorMapping[string(v)] + if !ok { + return other + } + return vend +} + +func cacheLine() int { + if maxFunctionID() < 0x1 { + return 0 + } + + _, ebx, _, _ := cpuid(1) + cache := (ebx & 0xff00) >> 5 // cflush size + if cache == 0 && maxExtendedFunction() >= 0x80000006 { + _, _, ecx, _ := cpuid(0x80000006) + cache = ecx & 0xff // cacheline size + } + // TODO: Read from Cache and TLB Information + return int(cache) +} + +func (c *cpuInfo) cacheSize() { + c.cache.l1d = -1 + c.cache.l1i = -1 + c.cache.l2 = -1 + c.cache.l3 = -1 + vendor := vendorID() + switch vendor { + case intel: + if maxFunctionID() < 4 { + return + } + for i := uint32(0); ; i++ { + eax, ebx, ecx, _ := cpuidex(4, i) + cacheType := eax & 15 + if cacheType == 0 { + break + } + cacheLevel := (eax >> 5) & 7 + coherency := int(ebx&0xfff) + 1 + partitions := int((ebx>>12)&0x3ff) + 1 + associativity := int((ebx>>22)&0x3ff) + 1 + sets := int(ecx) + 1 + size := associativity * partitions * coherency * sets + switch cacheLevel { + case 1: + if cacheType == 1 { + // 1 = Data Cache + c.cache.l1d = size + } else if cacheType == 2 { + // 2 = Instruction Cache + c.cache.l1i = size + } else { + if c.cache.l1d < 0 { + c.cache.l1i = size + } + if c.cache.l1i < 0 { + c.cache.l1i = size + } + } + case 2: + c.cache.l2 = size + case 3: + c.cache.l3 = size + } + } + case amd: + // Untested. + if maxExtendedFunction() < 0x80000005 { + return + } + _, _, ecx, edx := cpuid(0x80000005) + c.cache.l1d = int(((ecx >> 24) & 0xFF) * 1024) + c.cache.l1i = int(((edx >> 24) & 0xFF) * 1024) + + if maxExtendedFunction() < 0x80000006 { + return + } + _, _, ecx, _ = cpuid(0x80000006) + c.cache.l2 = int(((ecx >> 16) & 0xFFFF) * 1024) + } + + return +} + +type sgxsupport struct { + available bool + sgx1supported bool + sgx2supported bool + maxenclavesizenot64 int64 + maxenclavesize64 int64 +} + +func hasSGX(available bool) (rval sgxsupport) { + rval.available = available + + if !available { + return + } + + a, _, _, d := cpuidex(0x12, 0) + rval.sgx1supported = a&0x01 != 0 + rval.sgx2supported = a&0x02 != 0 + rval.maxenclavesizenot64 = 1 << (d & 0xFF) // pow 2 + rval.maxenclavesize64 = 1 << ((d >> 8) & 0xFF) // pow 2 + + return +} + +func support() flags { + mfi := maxFunctionID() + vend := vendorID() + if mfi < 0x1 { + return 0 + } + rval := uint64(0) + _, _, c, d := cpuid(1) + if (d & (1 << 15)) != 0 { + rval |= cmov + } + if (d & (1 << 23)) != 0 { + rval |= mmx + } + if (d & (1 << 25)) != 0 { + rval |= mmxext + } + if (d & (1 << 25)) != 0 { + rval |= sse + } + if (d & (1 << 26)) != 0 { + rval |= sse2 + } + if (c & 1) != 0 { + rval |= sse3 + } + if (c & 0x00000200) != 0 { + rval |= ssse3 + } + if (c & 0x00080000) != 0 { + rval |= sse4 + } + if (c & 0x00100000) != 0 { + rval |= sse42 + } + if (c & (1 << 25)) != 0 { + rval |= aesni + } + if (c & (1 << 1)) != 0 { + rval |= clmul + } + if c&(1<<23) != 0 { + rval |= popcnt + } + if c&(1<<30) != 0 { + rval |= rdrand + } + if c&(1<<29) != 0 { + rval |= f16c + } + if c&(1<<13) != 0 { + rval |= cx16 + } + if vend == intel && (d&(1<<28)) != 0 && mfi >= 4 { + if threadsPerCore() > 1 { + rval |= htt + } + } + + // Check XGETBV, OXSAVE and AVX bits + if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 { + // Check for OS support + eax, _ := xgetbv(0) + if (eax & 0x6) == 0x6 { + rval |= avx + if (c & 0x00001000) != 0 { + rval |= fma3 + } + } + } + + // Check AVX2, AVX2 requires OS support, but BMI1/2 don't. + if mfi >= 7 { + _, ebx, ecx, _ := cpuidex(7, 0) + if (rval&avx) != 0 && (ebx&0x00000020) != 0 { + rval |= avx2 + } + if (ebx & 0x00000008) != 0 { + rval |= bmi1 + if (ebx & 0x00000100) != 0 { + rval |= bmi2 + } + } + if ebx&(1<<2) != 0 { + rval |= sgx + } + if ebx&(1<<4) != 0 { + rval |= hle + } + if ebx&(1<<9) != 0 { + rval |= erms + } + if ebx&(1<<11) != 0 { + rval |= rtm + } + if ebx&(1<<14) != 0 { + rval |= mpx + } + if ebx&(1<<18) != 0 { + rval |= rdseed + } + if ebx&(1<<19) != 0 { + rval |= adx + } + if ebx&(1<<29) != 0 { + rval |= sha + } + + // Only detect AVX-512 features if XGETBV is supported + if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { + // Check for OS support + eax, _ := xgetbv(0) + + // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and + // ZMM16-ZMM31 state are enabled by OS) + /// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS). + if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 { + if ebx&(1<<16) != 0 { + rval |= avx512f + } + if ebx&(1<<17) != 0 { + rval |= avx512dq + } + if ebx&(1<<21) != 0 { + rval |= avx512ifma + } + if ebx&(1<<26) != 0 { + rval |= avx512pf + } + if ebx&(1<<27) != 0 { + rval |= avx512er + } + if ebx&(1<<28) != 0 { + rval |= avx512cd + } + if ebx&(1<<30) != 0 { + rval |= avx512bw + } + if ebx&(1<<31) != 0 { + rval |= avx512vl + } + // ecx + if ecx&(1<<1) != 0 { + rval |= avx512vbmi + } + } + } + } + + if maxExtendedFunction() >= 0x80000001 { + _, _, c, d := cpuid(0x80000001) + if (c & (1 << 5)) != 0 { + rval |= lzcnt + rval |= popcnt + } + if (d & (1 << 31)) != 0 { + rval |= amd3dnow + } + if (d & (1 << 30)) != 0 { + rval |= amd3dnowext + } + if (d & (1 << 23)) != 0 { + rval |= mmx + } + if (d & (1 << 22)) != 0 { + rval |= mmxext + } + if (c & (1 << 6)) != 0 { + rval |= sse4a + } + if d&(1<<20) != 0 { + rval |= nx + } + if d&(1<<27) != 0 { + rval |= rdtscp + } + + /* Allow for selectively disabling SSE2 functions on AMD processors + with SSE2 support but not SSE4a. This includes Athlon64, some + Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster + than SSE2 often enough to utilize this special-case flag. + AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case + so that SSE2 is used unless explicitly disabled by checking + AV_CPU_FLAG_SSE2SLOW. */ + if vendorID() != intel && + rval&sse2 != 0 && (c&0x00000040) == 0 { + rval |= sse2slow + } + + /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be + * used unless the OS has AVX support. */ + if (rval & avx) != 0 { + if (c & 0x00000800) != 0 { + rval |= xop + } + if (c & 0x00010000) != 0 { + rval |= fma4 + } + } + + if vendorID() == intel { + family, model := familyModel() + if family == 6 && (model == 9 || model == 13 || model == 14) { + /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and + * 6/14 (core1 "yonah") theoretically support sse2, but it's + * usually slower than mmx. */ + if (rval & sse2) != 0 { + rval |= sse2slow + } + if (rval & sse3) != 0 { + rval |= sse3slow + } + } + /* The Atom processor has SSSE3 support, which is useful in many cases, + * but sometimes the SSSE3 version is slower than the SSE2 equivalent + * on the Atom, but is generally faster on other processors supporting + * SSSE3. This flag allows for selectively disabling certain SSSE3 + * functions on the Atom. */ + if family == 6 && model == 28 { + rval |= atom + } + } + } + return flags(rval) +} + +func valAsString(values ...uint32) []byte { + r := make([]byte, 4*len(values)) + for i, v := range values { + dst := r[i*4:] + dst[0] = byte(v & 0xff) + dst[1] = byte((v >> 8) & 0xff) + dst[2] = byte((v >> 16) & 0xff) + dst[3] = byte((v >> 24) & 0xff) + switch { + case dst[0] == 0: + return r[:i*4] + case dst[1] == 0: + return r[:i*4+1] + case dst[2] == 0: + return r[:i*4+2] + case dst[3] == 0: + return r[:i*4+3] + } + } + return r +} diff --git a/vendor/github.com/klauspost/cpuid/private/cpuid_386.s b/vendor/github.com/klauspost/cpuid/private/cpuid_386.s new file mode 100644 index 00000000..4d731711 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/private/cpuid_386.s @@ -0,0 +1,42 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// +build 386,!gccgo + +// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuid(SB), 7, $0 + XORL CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+4(FP) + MOVL BX, ebx+8(FP) + MOVL CX, ecx+12(FP) + MOVL DX, edx+16(FP) + RET + +// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func xgetbv(index uint32) (eax, edx uint32) +TEXT ·asmXgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+4(FP) + MOVL DX, edx+8(FP) + RET + +// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) +TEXT ·asmRdtscpAsm(SB), 7, $0 + BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP + MOVL AX, eax+0(FP) + MOVL BX, ebx+4(FP) + MOVL CX, ecx+8(FP) + MOVL DX, edx+12(FP) + RET diff --git a/vendor/github.com/klauspost/cpuid/private/cpuid_amd64.s b/vendor/github.com/klauspost/cpuid/private/cpuid_amd64.s new file mode 100644 index 00000000..3c1d60e4 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/private/cpuid_amd64.s @@ -0,0 +1,42 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +//+build amd64,!gccgo + +// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuid(SB), 7, $0 + XORQ CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·asmCpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func asmXgetbv(index uint32) (eax, edx uint32) +TEXT ·asmXgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+8(FP) + MOVL DX, edx+12(FP) + RET + +// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) +TEXT ·asmRdtscpAsm(SB), 7, $0 + BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP + MOVL AX, eax+0(FP) + MOVL BX, ebx+4(FP) + MOVL CX, ecx+8(FP) + MOVL DX, edx+12(FP) + RET diff --git a/vendor/github.com/klauspost/cpuid/private/cpuid_detect_intel.go b/vendor/github.com/klauspost/cpuid/private/cpuid_detect_intel.go new file mode 100644 index 00000000..a5f04dd6 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/private/cpuid_detect_intel.go @@ -0,0 +1,17 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// +build 386,!gccgo amd64,!gccgo + +package cpuid + +func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32) +func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +func asmXgetbv(index uint32) (eax, edx uint32) +func asmRdtscpAsm() (eax, ebx, ecx, edx uint32) + +func initCPU() { + cpuid = asmCpuid + cpuidex = asmCpuidex + xgetbv = asmXgetbv + rdtscpAsm = asmRdtscpAsm +} diff --git a/vendor/github.com/klauspost/cpuid/private/cpuid_detect_ref.go b/vendor/github.com/klauspost/cpuid/private/cpuid_detect_ref.go new file mode 100644 index 00000000..909c5d9a --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/private/cpuid_detect_ref.go @@ -0,0 +1,23 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// +build !amd64,!386 gccgo + +package cpuid + +func initCPU() { + cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 + } + + cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 + } + + xgetbv = func(index uint32) (eax, edx uint32) { + return 0, 0 + } + + rdtscpAsm = func() (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 + } +} diff --git a/vendor/manifest b/vendor/manifest index a44a451e..b8d13653 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -117,6 +117,14 @@ "path": "/basic", "notests": true }, + { + "importpath": "github.com/klauspost/cpuid", + "repository": "https://github.com/klauspost/cpuid", + "vcs": "git", + "revision": "ae832f27941af41db13bd6d8efd2493e3b22415a", + "branch": "master", + "notests": true + }, { "importpath": "github.com/lucas-clemente/aes12", "repository": "https://github.com/lucas-clemente/aes12",