/* Copyright (C) 2018 Wildfire Games. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* * detection of CPU and cache topology */ #include "precompiled.h" #include "lib/sysdep/arch/x86_x64/topology.h" #include #include "lib/bits.h" #include "lib/module_init.h" #include "lib/sysdep/cpu.h" // ERR::CPU_FEATURE_MISSING #include "lib/sysdep/os_cpu.h" #include "lib/sysdep/numa.h" #include "lib/sysdep/arch/x86_x64/x86_x64.h" #include "lib/sysdep/arch/x86_x64/cache.h" #include "lib/sysdep/arch/x86_x64/apic.h" namespace topology { //--------------------------------------------------------------------------------------------------------------------- // detect *maximum* number of cores/packages/caches. // note: some of them may be disabled by the OS or BIOS. // note: Intel Appnote 485 assures us that they are uniform across packages. static size_t MaxCoresPerPackage() { // assume single-core unless one of the following applies: size_t maxCoresPerPackage = 1; x86_x64::CpuidRegs regs = { 0 }; switch(x86_x64::Vendor()) { case x86_x64::VENDOR_INTEL: regs.eax = 0xB; regs.ebx = 0; regs.ecx = 1; regs.edx = 0; // if 0xB supported if(x86_x64::cpuid(®s)){ printf("core B output regs: eax:%08x ebx:%08x ecx:%08x edx:%08x\n", regs.eax,regs.ebx,regs.ecx,regs.edx); printf("core B ebx 0,15:%08x\n",bits(regs.ebx,0,15)); maxCoresPerPackage=bits(regs.ebx,0,15); break; } regs.eax = 4; regs.ebx = 0; regs.ecx = 0; regs.edx = 0; if(x86_x64::cpuid(®s)){ printf("core 4 output regs: eax:%08x ebx:%08x ecx:%08x edx:%08x\n", regs.eax,regs.ebx,regs.ecx,regs.edx); printf("core 4 eax 26,31 +1:%08x\n",bits(regs.eax,26,31)+1); maxCoresPerPackage = bits(regs.eax, 26, 31)+1; } break; case x86_x64::VENDOR_AMD: regs.eax = 0x80000008; if(x86_x64::cpuid(®s)) maxCoresPerPackage = bits(regs.ecx, 0, 7)+1; break; default: break; } if (maxCoresPerPackage>0) return maxCoresPerPackage; return 1; } static size_t MaxLogicalPerCore() { size_t logicalPerPackage = 0; size_t maxLogicalPerCore = 0; const size_t maxCoresPerPackage = MaxCoresPerPackage(); struct IsHyperthreadingCapable { bool operator()() const { // definitely not if(!x86_x64::Cap(x86_x64::CAP_HT)) return false; // multi-core AMD systems falsely set the HT bit for reasons of // compatibility. we'll just ignore it, because clearing it might // confuse other callers. if(x86_x64::Vendor() == x86_x64::VENDOR_AMD && x86_x64::Cap(x86_x64::CAP_AMD_CMP_LEGACY)) return false; return true; } }; if(IsHyperthreadingCapable()()) { x86_x64::CpuidRegs regs = { 0 }; if(x86_x64::Vendor() == x86_x64::VENDOR_INTEL){ regs.eax = 0xB; regs.ebx = 0; regs.ecx = 0; regs.edx = 0; // if 0xB supported if(x86_x64::cpuid(®s)) { printf("log B output regs: eax:%08x ebx:%08x ecx:%08x edx:%08x\n", regs.eax,regs.ebx,regs.ecx,regs.edx); printf("log B ebx 0,15:%08x\n",bits(regs.ebx,0,15)); maxLogicalPerCore = bits(regs.ebx,0,15); logicalPerPackage = maxCoresPerPackage * maxLogicalPerCore ; printf("log B maxCoresPerPackage:%i\n",maxCoresPerPackage); printf("log B maxLogicalPerCore:%i\n",maxLogicalPerCore); printf("log B logicalPerPackage:%i\n",logicalPerPackage ); if( (logicalPerPackage % maxCoresPerPackage) == 0) return maxLogicalPerCore; } } regs.eax = 1; regs.ebx = 0; regs.ecx = 0; regs.edx = 0; // if supported if(x86_x64::cpuid(®s)) { printf("log 1 output regs: eax:%08x ebx:%08x ecx:%08x edx:%08x\n", regs.eax,regs.ebx,regs.ecx,regs.edx); printf("log 1 ebx 16,23:%08x\n",bits(regs.ebx,16,23)); logicalPerPackage = bits(regs.ebx, 16, 23); maxLogicalPerCore= logicalPerPackage /maxCoresPerPackage; printf("log 1 maxCoresPerPackage:%i\n",maxCoresPerPackage); printf("log 1 maxLogicalPerCore:%i\n",maxLogicalPerCore); printf("log 1 logicalPerPackage:%i\n",logicalPerPackage ); if( (logicalPerPackage % maxCoresPerPackage) == 0) return maxLogicalPerCore; } else { //DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING); } return 1; }else{ return 1; } } static size_t MaxLogicalPerCache() { return x86_x64::Caches(x86_x64::L2D)->m_SharedBy; } //--------------------------------------------------------------------------------------------------------------------- // CPU topology interface // APIC IDs consist of variable-length bit fields indicating the logical, // core, package and cache IDs. Vol3a says they aren't guaranteed to be // contiguous, but that also applies to the individual fields. // for example, quad-core E5630 CPUs report 4-bit core IDs 0, 1, 6, 7. struct ApicField // POD { size_t operator()(size_t bits) const { return (bits >> shift) & mask; } size_t mask; // zero for zero-width fields size_t shift; }; struct CpuTopology // POD { size_t numProcessors; // total reported by OS ApicField logical; ApicField core; ApicField package; // how many are actually enabled size_t logicalPerCore; size_t coresPerPackage; size_t numPackages; }; static CpuTopology cpuTopology; static ModuleInitState cpuInitState; static Status InitCpuTopology() { cpuTopology.numProcessors = os_cpu_NumProcessors(); const size_t maxLogicalPerCore = MaxLogicalPerCore(); const size_t maxCoresPerPackage = MaxCoresPerPackage(); const size_t maxPackages = 256; // "enough" const size_t logicalWidth = ceil_log2(maxLogicalPerCore); const size_t coreWidth = ceil_log2(maxCoresPerPackage); const size_t packageWidth = ceil_log2(maxPackages); cpuTopology.logical.mask = bit_mask(logicalWidth); cpuTopology.core.mask = bit_mask(coreWidth); cpuTopology.package.mask = bit_mask(packageWidth); cpuTopology.logical.shift = 0; cpuTopology.core.shift = logicalWidth; cpuTopology.package.shift = logicalWidth + coreWidth; if(AreApicIdsReliable()) { struct NumUniqueValuesInField { size_t operator()(const ApicField& apicField) const { std::bitset values; for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++) { const ApicId apicId = ApicIdFromProcessor(processor); const size_t value = apicField(apicId); values.set(value); } return values.count(); } }; cpuTopology.logicalPerCore = NumUniqueValuesInField()(cpuTopology.logical); cpuTopology.coresPerPackage = NumUniqueValuesInField()(cpuTopology.core); cpuTopology.numPackages = NumUniqueValuesInField()(cpuTopology.package); } else // processor lacks an xAPIC, or IDs are invalid { struct MinPackages { size_t operator()(size_t maxCoresPerPackage, size_t maxLogicalPerCore) const { const size_t numNodes = numa_NumNodes(); const size_t logicalPerNode = PopulationCount(numa_ProcessorMaskFromNode(0)); // NB: some cores or logical processors may be disabled. const size_t maxLogicalPerPackage = maxCoresPerPackage*maxLogicalPerCore; const size_t minPackagesPerNode = DivideRoundUp(logicalPerNode, maxLogicalPerPackage); return minPackagesPerNode*numNodes; } }; // we can't differentiate between cores and logical processors. // since the former are less likely to be disabled, we seek the // maximum feasible number of cores and minimal number of packages: const size_t minPackages = MinPackages()(maxCoresPerPackage, maxLogicalPerCore); for(size_t numPackages = minPackages; numPackages <= cpuTopology.numProcessors; numPackages++) { if(cpuTopology.numProcessors % numPackages != 0) continue; const size_t logicalPerPackage = cpuTopology.numProcessors / numPackages; const size_t minCoresPerPackage = DivideRoundUp(logicalPerPackage, maxLogicalPerCore); for(size_t coresPerPackage = maxCoresPerPackage; coresPerPackage >= minCoresPerPackage; coresPerPackage--) { if(logicalPerPackage % coresPerPackage != 0) continue; const size_t logicalPerCore = logicalPerPackage / coresPerPackage; if(logicalPerCore <= maxLogicalPerCore) { ENSURE(cpuTopology.numProcessors == numPackages*coresPerPackage*logicalPerCore); cpuTopology.logicalPerCore = logicalPerCore; cpuTopology.coresPerPackage = coresPerPackage; cpuTopology.numPackages = numPackages; return INFO::OK; } } } DEBUG_WARN_ERR(ERR::LOGIC); // didn't find a feasible topology } return INFO::OK; } size_t NumPackages() { ModuleInit(&cpuInitState, InitCpuTopology); return cpuTopology.numPackages; } size_t CoresPerPackage() { ModuleInit(&cpuInitState, InitCpuTopology); return cpuTopology.coresPerPackage; } size_t LogicalPerCore() { ModuleInit(&cpuInitState, InitCpuTopology); return cpuTopology.logicalPerCore; } size_t LogicalFromApicId(ApicId apicId) { const size_t contiguousId = ContiguousIdFromApicId(apicId); return contiguousId % cpuTopology.logicalPerCore; } size_t CoreFromApicId(ApicId apicId) { const size_t contiguousId = ContiguousIdFromApicId(apicId); return (contiguousId / cpuTopology.logicalPerCore) % cpuTopology.coresPerPackage; } size_t PackageFromApicId(ApicId apicId) { const size_t contiguousId = ContiguousIdFromApicId(apicId); return contiguousId / (cpuTopology.logicalPerCore * cpuTopology.coresPerPackage); } ApicId ApicIdFromIndices(size_t idxLogical, size_t idxCore, size_t idxPackage) { ModuleInit(&cpuInitState, InitCpuTopology); size_t contiguousId = 0; ENSURE(idxPackage < cpuTopology.numPackages); contiguousId += idxPackage; contiguousId *= cpuTopology.coresPerPackage; ENSURE(idxCore < cpuTopology.coresPerPackage); contiguousId += idxCore; contiguousId *= cpuTopology.logicalPerCore; ENSURE(idxLogical < cpuTopology.logicalPerCore); contiguousId += idxLogical; ENSURE(contiguousId < cpuTopology.numProcessors); return ApicIdFromContiguousId(contiguousId); } //--------------------------------------------------------------------------------------------------------------------- // cache topology // note: Windows 2003 GetLogicalProcessorInformation provides similar // functionality but returns incorrect results. (it claims all cores in // an Intel Core2 Quad processor share a single L2 cache.) class CacheRelations { public: /** * add processor to the processor mask owned by cache identified by \ **/ void Add(u8 cacheId, size_t processor) { SharedCache* cache = Find(cacheId); if(!cache) { m_caches.push_back(cacheId); cache = &m_caches.back(); } cache->Add(processor); } size_t NumCaches() const { return m_caches.size(); } /** * store topology in an array (one entry per cache) of masks * representing the processors that share a cache. **/ void StoreProcessorMasks(uintptr_t* cachesProcessorMask) { for(size_t i = 0; i < NumCaches(); i++) cachesProcessorMask[i] = m_caches[i].ProcessorMask(); } private: /** * stores ID and tracks which processors share this cache **/ class SharedCache { public: SharedCache(u8 cacheId) : m_cacheId(cacheId), m_processorMask(0) { } bool Matches(u8 cacheId) const { return m_cacheId == cacheId; } void Add(size_t processor) { m_processorMask |= uintptr_t(1) << processor; } uintptr_t ProcessorMask() const { return m_processorMask; } private: u8 m_cacheId; uintptr_t m_processorMask; }; SharedCache* Find(u8 cacheId) { for(size_t i = 0; i < m_caches.size(); i++) { if(m_caches[i].Matches(cacheId)) return &m_caches[i]; } return 0; } std::vector m_caches; }; static void DetermineCachesProcessorMask(uintptr_t* cachesProcessorMask, size_t& numCaches) { CacheRelations cacheRelations; if(AreApicIdsReliable()) { const size_t numBits = ceil_log2(MaxLogicalPerCache()); const u8 cacheIdMask = u8((0xFF << numBits) & 0xFF); for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++) { const ApicId apicId = ApicIdFromProcessor(processor); const u8 cacheId = u8(apicId & cacheIdMask); cacheRelations.Add(cacheId, processor); } } else { for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++) { // assume each processor has exactly one cache with matching IDs const u8 cacheId = (u8)processor; cacheRelations.Add(cacheId, processor); } } numCaches = cacheRelations.NumCaches(); cacheRelations.StoreProcessorMasks(cachesProcessorMask); } static void DetermineProcessorsCache(const uintptr_t* cachesProcessorMask, size_t numCaches, size_t* processorsCache, size_t numProcessors) { for(size_t cache = 0; cache < numCaches; cache++) { // write to all entries that share this cache const uintptr_t processorMask = cachesProcessorMask[cache]; for(size_t processor = 0; processor < numProcessors; processor++) { if(IsBitSet(processorMask, processor)) { ENSURE(processorsCache[processor] == 0); processorsCache[processor] = cache; } } } } //--------------------------------------------------------------------------------------------------------------------- // cache topology interface struct CacheTopology // POD { size_t numCaches; size_t processorsCache[os_cpu_MaxProcessors]; uintptr_t cachesProcessorMask[os_cpu_MaxProcessors]; }; static CacheTopology cacheTopology; static ModuleInitState cacheInitState; static Status InitCacheTopology() { ModuleInit(&cpuInitState, InitCpuTopology); DetermineCachesProcessorMask(cacheTopology.cachesProcessorMask, cacheTopology.numCaches); DetermineProcessorsCache(cacheTopology.cachesProcessorMask, cacheTopology.numCaches, cacheTopology.processorsCache, os_cpu_NumProcessors()); return INFO::OK; } size_t NumCaches() { ModuleInit(&cacheInitState, InitCacheTopology); return cacheTopology.numCaches; } size_t CacheFromProcessor(size_t processor) { ModuleInit(&cacheInitState, InitCacheTopology); ENSURE(processor < os_cpu_NumProcessors()); return cacheTopology.processorsCache[processor]; } uintptr_t ProcessorMaskFromCache(size_t cache) { ModuleInit(&cacheInitState, InitCacheTopology); ENSURE(cache < cacheTopology.numCaches); return cacheTopology.cachesProcessorMask[cache]; } } // namespace topology