diff --git a/src/cpu.cpp b/src/cpu.cpp index a095b6b6f5c0..b7ebbba57b6f 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -875,9 +875,50 @@ static int get_cpucount() else count = 1; #elif defined _WIN32 - SYSTEM_INFO system_info; - GetSystemInfo(&system_info); - count = system_info.dwNumberOfProcessors; + typedef BOOL(WINAPI * LPFN_GLPIEX)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD); + LPFN_GLPIEX glpiex = (LPFN_GLPIEX)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformationEx"); + if (glpiex != NULL) + { + DWORD length = 0; + glpiex(RelationAll, NULL, &length); + + if (length > 0) + { + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(length); + + if (buffer && glpiex(RelationAll, buffer, &length)) + { + count = 0; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX ptr = buffer; + DWORD offset = 0; + + while (offset < length) + { + if (ptr->Relationship == RelationProcessorCore) + { + for (WORD i = 0; i < ptr->Processor.GroupCount; i++) + { + count += _popcnt64(ptr->Processor.GroupMask[i].Mask); + } + } + offset += ptr->Size; + ptr = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char*)ptr + ptr->Size); + } + } + + if (buffer) + { + free(buffer); + } + } + } + // If no CPU count was obtained above (count == 0), fall back to the previous GetSystemInfo path. 
+ if (count == 0) + { + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + count = system_info.dwNumberOfProcessors; + } #elif defined __ANDROID__ || defined __linux__ // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/cpuinfo", "rb"); @@ -1352,6 +1393,57 @@ static ncnn::CpuSet get_smt_cpu_mask() { ncnn::CpuSet smt_cpu_mask; + typedef BOOL(WINAPI * LPFN_GLPIEX)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD); + LPFN_GLPIEX glpiex = (LPFN_GLPIEX)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformationEx"); + if (glpiex != NULL) // prefer the group-aware Ex API whenever kernel32 exports it + { + DWORD length = 0; + glpiex(RelationProcessorCore, NULL, &length); + + if (length > 0) + { + std::vector buffer(length); + if (glpiex(RelationProcessorCore, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data(), &length)) + { + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data(); + + while ((char*)current < buffer.data() + length) + { + if (current->Relationship == RelationProcessorCore) + { + int total_logical_count = 0; + for (WORD group = 0; group < current->Processor.GroupCount; group++) + { + total_logical_count += _popcnt64(current->Processor.GroupMask[group].Mask); + } + + if (total_logical_count > 1) + { + for (WORD group = 0; group < current->Processor.GroupCount; group++) + { + KAFFINITY mask = current->Processor.GroupMask[group].Mask; + for (int cpu = 0; cpu < 64 && mask; cpu++) + { + if (mask & (1ULL << cpu)) + { + int global_cpu = group * 64 + cpu; + smt_cpu_mask.enable(global_cpu); + mask &= ~(1ULL << cpu); + } + } + } + } + } + + current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char*)current + current->Size); + } + + return smt_cpu_mask; + } + } + } + + // Ex API unavailable or its query failed: fall back to the legacy single-group API typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); 
if (glpi == NULL) @@ -1372,12 +1464,18 @@ static ncnn::CpuSet get_smt_cpu_mask() { if (ptr->Relationship == RelationProcessorCore) { - ncnn::CpuSet smt_set; - smt_set.mask = ptr->ProcessorMask; - if (smt_set.num_enabled() > 1) + int logical_count = _popcnt64(ptr->ProcessorMask); + if (logical_count > 1) { - // this core is smt - smt_cpu_mask.mask |= smt_set.mask; + ULONG_PTR mask = ptr->ProcessorMask; + for (int cpu = 0; cpu < 64 && mask; cpu++) + { + if (mask & (1ULL << cpu)) + { + smt_cpu_mask.enable(cpu); + mask &= ~(1ULL << cpu); + } + } } } @@ -1386,7 +1484,6 @@ static ncnn::CpuSet get_smt_cpu_mask() } free(buffer); - return smt_cpu_mask; } @@ -1432,13 +1529,25 @@ static std::vector get_max_freq_mhz() static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask) { - DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask); - if (prev_mask == 0) + for (int group = 0; group < thread_affinity_mask.active_groups; group++) { - NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError()); - return -1; + if (thread_affinity_mask.masks[group] != 0) + { + GROUP_AFFINITY groupAffinity; + groupAffinity.Mask = thread_affinity_mask.masks[group]; + groupAffinity.Group = (WORD)group; + groupAffinity.Reserved[0] = 0; + groupAffinity.Reserved[1] = 0; + groupAffinity.Reserved[2] = 0; + + if (!SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL)) + { + NCNN_LOGE("SetThreadGroupAffinity failed %d", GetLastError()); + return -1; + } + break; + } } - return 0; } #endif // defined _WIN32 @@ -2273,34 +2382,82 @@ CpuSet::CpuSet() void CpuSet::enable(int cpu) { - mask |= ((ULONG_PTR)1 << cpu); + if (cpu < 0 || cpu >= max_cpus) return; + + int group = cpu / 64; + int bit = cpu % 64; + + if (group < MAX_CPU_GROUPS) + { + masks[group] |= (1ULL << bit); + } } void CpuSet::disable(int cpu) { - mask &= ~((ULONG_PTR)1 << cpu); + if (cpu < 0 || cpu >= max_cpus) return; + + int group = cpu / 64; + int bit = cpu % 64; + + if (group < 
MAX_CPU_GROUPS) + { + masks[group] &= ~(1ULL << bit); + } } void CpuSet::disable_all() { - mask = 0; + for (int i = 0; i < MAX_CPU_GROUPS; i++) + { + masks[i] = 0; + } } bool CpuSet::is_enabled(int cpu) const { - return mask & ((ULONG_PTR)1 << cpu); + if (cpu < 0 || cpu >= max_cpus) return false; + + int group = cpu / 64; + int bit = cpu % 64; + + if (group < MAX_CPU_GROUPS) + { + return (masks[group] & (1ULL << bit)) != 0; + } + return false; } int CpuSet::num_enabled() const { - int num_enabled = 0; - for (int i = 0; i < (int)sizeof(mask) * 8; i++) + int count = 0; + for (int i = 0; i < MAX_CPU_GROUPS; i++) { - if (is_enabled(i)) - num_enabled++; + count += __builtin_popcountll(masks[i]); } + return count; +} - return num_enabled; +ULONG_PTR CpuSet::get_group_mask(int group) const +{ + if (group < 0 || group >= MAX_CPU_GROUPS) + { + return 0; + } + return masks[group]; +} + +int CpuSet::get_active_group_count() const +{ + int count = 0; + for (int i = 0; i < MAX_CPU_GROUPS; i++) + { + if (masks[i] != 0) + { + count++; + } + } + return count; } #elif defined __ANDROID__ || defined __linux__ CpuSet::CpuSet() diff --git a/src/cpu.h b/src/cpu.h index cbf417111f6d..1717904b2f32 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -28,9 +28,21 @@ class NCNN_EXPORT CpuSet bool is_enabled(int cpu) const; int num_enabled() const; +#if defined _WIN32 + int get_max_cpus() const + { + return max_cpus; + } + ULONG_PTR get_group_mask(int group) const; + int get_active_group_count() const; +#endif + public: #if defined _WIN32 - ULONG_PTR mask; + static const int MAX_CPU_GROUPS = 20; + ULONG_PTR masks[MAX_CPU_GROUPS]; + int max_cpus; + int active_groups; #endif #if defined __ANDROID__ || defined __linux__ cpu_set_t cpu_set; @@ -118,7 +130,6 @@ NCNN_EXPORT int cpu_support_riscv_xtheadvector(); // vlenb = riscv vector length in bytes NCNN_EXPORT int cpu_riscv_vlenb(); -// cpu info NCNN_EXPORT int get_cpu_count(); NCNN_EXPORT int get_little_cpu_count(); NCNN_EXPORT int get_big_cpu_count(); 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5a0940e88c6b..fa7a4d564413 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -166,3 +166,7 @@ ncnn_add_layer_test(Tile) ncnn_add_layer_test(UnaryOp) ncnn_add_layer_test(Unfold) ncnn_add_layer_test(Yolov3DetectionOutput) + +if(WIN32) + ncnn_add_test(ncnn_cpu_cores) +endif() diff --git a/tests/test_ncnn_cpu_cores.cpp b/tests/test_ncnn_cpu_cores.cpp new file mode 100644 index 000000000000..6577dffb5416 --- /dev/null +++ b/tests/test_ncnn_cpu_cores.cpp @@ -0,0 +1,297 @@ +#include +#include +#include +#include +#include +#include "cpu.h" + +static void print_separator(const char* title) +{ + printf("\n=== %s ===\n", title); +} + +static int test_basic_cpu_info() +{ + print_separator("Basic CPU Information Test"); + + int cpu_count = ncnn::get_cpu_count(); + int big_cpu_count = ncnn::get_big_cpu_count(); + int little_cpu_count = ncnn::get_little_cpu_count(); + int physical_cpu_count = ncnn::get_physical_cpu_count(); + + printf("CPU Count: %d\n", cpu_count); + printf("Big CPU Count: %d\n", big_cpu_count); + printf("Little CPU Count: %d\n", little_cpu_count); + printf("Physical CPU Count: %d\n", physical_cpu_count); + + if (cpu_count <= 0) + { + printf("ERROR: Invalid CPU count\n"); + return -1; + } + + return 0; +} + +static int test_windows_api_comparison() +{ + print_separator("Windows API Comparison Test"); + + // Get ncnn detected CPU count + int ncnn_cpu_count = ncnn::get_cpu_count(); + + // Get Windows API CPU count + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + DWORD win_cpu_count = sysinfo.dwNumberOfProcessors; + + printf("NCNN detected CPUs: %d\n", ncnn_cpu_count); + printf("Windows GetSystemInfo CPUs: %d\n", win_cpu_count); + + // Test GetLogicalProcessorInformationEx for >64 core support + DWORD buffer_size = 0; + GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size); + + if (buffer_size > 0) + { + std::vector buffer(buffer_size); + if 
(GetLogicalProcessorInformationEx(RelationProcessorCore, + (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data(), &buffer_size)) + { + int core_count = 0; + int group_count = 0; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data(); + + while ((char*)current < buffer.data() + buffer_size) + { + if (current->Relationship == RelationProcessorCore) + { + core_count++; + group_count = max(group_count, (int)current->Processor.GroupCount); + } + current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char*)current + current->Size); + } + + printf("GetLogicalProcessorInformationEx cores: %d\n", core_count); + printf("Processor groups detected: %d\n", group_count); + + if (core_count > 64) + { + printf("SUCCESS: Detected >64 core system\n"); + } + } + } + + return 0; +} + +static int test_cpuset_basic_operations() +{ + print_separator("CpuSet Basic Operations Test"); + + ncnn::CpuSet cpuset; + + // Test initial state + int initial_enabled = cpuset.num_enabled(); + printf("Initial enabled CPUs: %d\n", initial_enabled); + + // Test enabling specific CPUs + int cpu_count = ncnn::get_cpu_count(); + for (int i = 0; i < min(cpu_count, 8); i++) + { + cpuset.enable(i); + if (!cpuset.is_enabled(i)) + { + printf("ERROR: Failed to enable CPU %d\n", i); + return -1; + } + } + + printf("Enabled first 8 CPUs, total enabled: %d\n", cpuset.num_enabled()); + + // Test disabling + cpuset.disable(0); + if (cpuset.is_enabled(0)) + { + printf("ERROR: Failed to disable CPU 0\n"); + return -1; + } + + printf("Disabled CPU 0, total enabled: %d\n", cpuset.num_enabled()); + + // Test disable_all + cpuset.disable_all(); + if (cpuset.num_enabled() != 0) + { + printf("ERROR: disable_all failed\n"); + return -1; + } + + printf("After disable_all, enabled CPUs: %d\n", cpuset.num_enabled()); + + return 0; +} + +static int test_cpuset_large_core_numbers() +{ + print_separator("CpuSet Large Core Numbers Test"); + + ncnn::CpuSet cpuset; + int cpu_count 
= ncnn::get_cpu_count(); + + // Test enabling all available CPUs + for (int i = 0; i < cpu_count; i++) + { + cpuset.enable(i); + } + + int enabled_count = cpuset.num_enabled(); + printf("Enabled all %d CPUs, actual enabled: %d\n", cpu_count, enabled_count); + + if (enabled_count != cpu_count) + { + printf("WARNING: Mismatch between expected and actual enabled CPUs\n"); + } + + // Test boundary conditions + if (cpu_count > 64) + { + printf("Testing >64 core boundary...\n"); + + cpuset.disable_all(); + + // Enable CPUs around the 64-core boundary + for (int i = 60; i < min(cpu_count, 68); i++) + { + cpuset.enable(i); + if (!cpuset.is_enabled(i)) + { + printf("ERROR: Failed to enable CPU %d (around 64-core boundary)\n", i); + return -1; + } + } + + printf("Successfully enabled CPUs around 64-core boundary\n"); + } + + return 0; +} + +#ifdef _WIN32 +static int test_windows_specific_features() +{ + print_separator("Windows Specific Features Test"); + + ncnn::CpuSet cpuset; + + // Test Windows-specific methods + int max_cpus = cpuset.get_max_cpus(); + int active_groups = cpuset.get_active_group_count(); + + printf("Max CPUs: %d\n", max_cpus); + printf("Active processor groups: %d\n", active_groups); + + // Test group masks + for (int group = 0; group < active_groups && group < 4; group++) + { + ULONG_PTR mask = cpuset.get_group_mask(group); + printf("Group %d mask: 0x%llx\n", group, (unsigned long long)mask); + } + + // Test enabling CPUs in different groups + if (active_groups > 1) + { + printf("Testing multi-group CPU enabling...\n"); + + cpuset.disable_all(); + + // Enable some CPUs in group 0 + for (int i = 0; i < min(4, max_cpus); i++) + { + cpuset.enable(i); + } + + // Enable some CPUs in group 1 (if exists) + if (max_cpus > 64) + { + for (int i = 64; i < min(68, max_cpus); i++) + { + cpuset.enable(i); + } + } + + printf("Multi-group test completed, enabled CPUs: %d\n", cpuset.num_enabled()); + } + + return 0; +} +#endif + +static int test_thread_affinity() +{ + 
print_separator("Thread Affinity Test"); + + // Test getting thread affinity masks + const ncnn::CpuSet& mask_all = ncnn::get_cpu_thread_affinity_mask(0); + const ncnn::CpuSet& mask_little = ncnn::get_cpu_thread_affinity_mask(1); + const ncnn::CpuSet& mask_big = ncnn::get_cpu_thread_affinity_mask(2); + + printf("All cores mask enabled CPUs: %d\n", mask_all.num_enabled()); + printf("Little cores mask enabled CPUs: %d\n", mask_little.num_enabled()); + printf("Big cores mask enabled CPUs: %d\n", mask_big.num_enabled()); + + // Test setting thread affinity + ncnn::CpuSet custom_mask; + int cpu_count = ncnn::get_cpu_count(); + + // Enable every other CPU + for (int i = 0; i < cpu_count; i += 2) + { + custom_mask.enable(i); + } + + printf("Setting custom affinity with %d CPUs...\n", custom_mask.num_enabled()); + int result = ncnn::set_cpu_thread_affinity(custom_mask); + + if (result == 0) + { + printf("Thread affinity set successfully\n"); + } + else + { + printf("Thread affinity setting failed with code: %d\n", result); + } + + return 0; +} + +int main() +{ + printf("NCNN CPU Core Support Test for Windows 64+ Cores\n"); + printf("================================================\n"); + + int result = 0; + + result |= test_basic_cpu_info(); + result |= test_windows_api_comparison(); + result |= test_cpuset_basic_operations(); + result |= test_cpuset_large_core_numbers(); + +#ifdef _WIN32 + result |= test_windows_specific_features(); +#endif + + result |= test_thread_affinity(); + + print_separator("Test Summary"); + if (result == 0) + { + printf("All tests PASSED\n"); + } + else + { + printf("Some tests FAILED (return code: %d)\n", result); + } + + return result; +} \ No newline at end of file