diff --git a/src/cpu.cpp b/src/cpu.cpp
index a095b6b6f5c0..b7ebbba57b6f 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -875,9 +875,50 @@ static int get_cpucount()
     else
         count = 1;
 #elif defined _WIN32
-    SYSTEM_INFO system_info;
-    GetSystemInfo(&system_info);
-    count = system_info.dwNumberOfProcessors;
+    typedef BOOL(WINAPI * LPFN_GLPIEX)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
+    LPFN_GLPIEX glpiex = (LPFN_GLPIEX)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformationEx");
+    if (glpiex != NULL)
+    {
+        DWORD length = 0;
+        glpiex(RelationAll, NULL, &length);
+
+        if (length > 0)
+        {
+            PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(length);
+
+            if (buffer && glpiex(RelationAll, buffer, &length))
+            {
+                count = 0;
+                PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX ptr = buffer;
+                DWORD offset = 0;
+
+                while (offset < length)
+                {
+                    if (ptr->Relationship == RelationProcessorCore)
+                    {
+                        for (WORD i = 0; i < ptr->Processor.GroupCount; i++)
+                        {
+                            count += _popcnt64(ptr->Processor.GroupMask[i].Mask);
+                        }
+                    }
+                    offset += ptr->Size;
+                    ptr = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char*)ptr + ptr->Size);
+                }
+            }
+
+            if (buffer)
+            {
+                free(buffer);
+            }
+        }
+    }
+    // Fall back to GetSystemInfo when the extended query did not produce a count.
+    if (count == 0)
+    {
+        SYSTEM_INFO system_info;
+        GetSystemInfo(&system_info);
+        count = system_info.dwNumberOfProcessors;
+    }
 #elif defined __ANDROID__ || defined __linux__
     // get cpu count from /proc/cpuinfo
     FILE* fp = fopen("/proc/cpuinfo", "rb");
@@ -1352,6 +1393,57 @@ static ncnn::CpuSet get_smt_cpu_mask()
 {
     ncnn::CpuSet smt_cpu_mask;
 
+    // Prefer GetLogicalProcessorInformationEx when kernel32 exports it; its records carry a processor group number per mask.
+    typedef BOOL(WINAPI * LPFN_GLPIEX)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
+    LPFN_GLPIEX glpiex = (LPFN_GLPIEX)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformationEx");
+    if (glpiex != NULL)
+    {
+        // first call only asks for the required buffer size
+        DWORD length = 0;
+        glpiex(RelationProcessorCore, NULL, &length);
+        if (length > 0)
+        {
+            std::vector<char> buffer(length);
+            if (glpiex(RelationProcessorCore, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data(), &length))
+            {
+                // records are variable-sized; walk them by their Size field
+                PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data();
+                while ((char*)current < buffer.data() + length)
+                {
+                    if (current->Relationship == RelationProcessorCore)
+                    {
+                        int total_logical_count = 0;
+                        for (WORD group = 0; group < current->Processor.GroupCount; group++)
+                        {
+                            total_logical_count += _popcnt64(current->Processor.GroupMask[group].Mask);
+                        }
+                        // more than one logical processor on a core means the core is smt
+                        if (total_logical_count > 1)
+                        {
+                            for (WORD i = 0; i < current->Processor.GroupCount; i++)
+                            {
+                                // GroupMask[i].Group is the real processor group number; the array index i is not
+                                const int cpu_base = current->Processor.GroupMask[i].Group * 64;
+                                KAFFINITY mask = current->Processor.GroupMask[i].Mask;
+                                for (int cpu = 0; cpu < 64 && mask; cpu++)
+                                {
+                                    if (mask & (1ULL << cpu))
+                                    {
+                                        smt_cpu_mask.enable(cpu_base + cpu);
+                                        mask &= ~(1ULL << cpu);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char*)current + current->Size);
+                }
+                return smt_cpu_mask;
+            }
+        }
+    }
+
+    // Fall back to the legacy GetLogicalProcessorInformation API when the Ex variant is unavailable or fails.
     typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
     LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
     if (glpi == NULL)
@@ -1372,12 +1464,18 @@ static ncnn::CpuSet get_smt_cpu_mask()
     {
         if (ptr->Relationship == RelationProcessorCore)
         {
-            ncnn::CpuSet smt_set;
-            smt_set.mask = ptr->ProcessorMask;
-            if (smt_set.num_enabled() > 1)
+            int logical_count = _popcnt64(ptr->ProcessorMask);
+            if (logical_count > 1)
             {
-                // this core is smt
-                smt_cpu_mask.mask |= smt_set.mask;
+                ULONG_PTR mask = ptr->ProcessorMask;
+                for (int cpu = 0; cpu < 64 && mask; cpu++)
+                {
+                    if (mask & (1ULL << cpu))
+                    {
+                        smt_cpu_mask.enable(cpu);
+                        mask &= ~(1ULL << cpu);
+                    }
+                }
             }
         }
 
@@ -1386,7 +1484,6 @@ static ncnn::CpuSet get_smt_cpu_mask()
     }
 
     free(buffer);
-
     return smt_cpu_mask;
 }
 
@@ -1432,13 +1529,25 @@ static std::vector<int> get_max_freq_mhz()
 
 static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
 {
-    DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
-    if (prev_mask == 0)
+    for (int g = 0; g < thread_affinity_mask.active_groups; g++)
     {
-        NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
-        return -1;
+        // skip processor groups without any selected cpu
+        if (thread_affinity_mask.masks[g] == 0)
+            continue;
+
+        GROUP_AFFINITY affinity;
+        affinity.Group = (WORD)g;
+        affinity.Mask = thread_affinity_mask.masks[g];
+        for (int r = 0; r < 3; r++)
+            affinity.Reserved[r] = 0;
+
+        // only the first non-empty group is applied to the calling thread
+        if (SetThreadGroupAffinity(GetCurrentThread(), &affinity, NULL))
+            break;
+
+        NCNN_LOGE("SetThreadGroupAffinity failed %d", GetLastError());
+        return -1;
+    }
-
     return 0;
 }
 #endif // defined _WIN32
@@ -2273,34 +2382,82 @@ CpuSet::CpuSet()
 
 void CpuSet::enable(int cpu)
 {
-    mask |= ((ULONG_PTR)1 << cpu);
+    // cpu ids outside [0, max_cpus) are ignored
+    if (cpu < 0 || cpu >= max_cpus)
+        return;
+
+    // every masks[] entry holds 64 consecutive cpu ids
+    const int group_index = cpu / 64;
+    if (group_index >= MAX_CPU_GROUPS)
+        return;
+    masks[group_index] |= (1ULL << (cpu % 64));
 }
 
 void CpuSet::disable(int cpu)
 {
-    mask &= ~((ULONG_PTR)1 << cpu);
+    // cpu ids outside [0, max_cpus) are ignored
+    if (cpu < 0 || cpu >= max_cpus)
+        return;
+
+    // every masks[] entry holds 64 consecutive cpu ids
+    const int group_index = cpu / 64;
+    if (group_index >= MAX_CPU_GROUPS)
+        return;
+    masks[group_index] &= ~(1ULL << (cpu % 64));
 }
 
 void CpuSet::disable_all()
 {
-    mask = 0;
+    // clear the mask of every processor group
+    int group = MAX_CPU_GROUPS;
+    while (group-- > 0)
+        masks[group] = 0;
 }
 
 bool CpuSet::is_enabled(int cpu) const
 {
-    return mask & ((ULONG_PTR)1 << cpu);
+    // cpu ids outside [0, max_cpus) are never reported as enabled
+    if (cpu < 0 || cpu >= max_cpus)
+        return false;
+
+    // every masks[] entry holds 64 consecutive cpu ids
+    const int group_index = cpu / 64;
+    if (group_index >= MAX_CPU_GROUPS)
+        return false;
+
+    return (masks[group_index] & (1ULL << (cpu % 64))) != 0;
 }
 
 int CpuSet::num_enabled() const
 {
-    int num_enabled = 0;
-    for (int i = 0; i < (int)sizeof(mask) * 8; i++)
+    int count = 0; // total set bits over all processor-group masks
+    for (int i = 0; i < MAX_CPU_GROUPS; i++)
     {
-        if (is_enabled(i))
-            num_enabled++;
+        for (ULONG_PTR bits = masks[i]; bits != 0; bits &= bits - 1) count++; // clears the lowest set bit per step; no GCC-only __builtin_popcountll
     }
+    return count;
+}
 
-    return num_enabled;
+// Return the raw affinity bits stored for one processor group.
+// An out-of-range group index yields an empty mask (0).
+ULONG_PTR CpuSet::get_group_mask(int group) const
+{
+    const bool in_range = group >= 0 && group < MAX_CPU_GROUPS;
+
+    return in_range ? masks[group] : 0;
+}
+
+// Count the processor groups that have at least one cpu enabled.
+int CpuSet::get_active_group_count() const
+{
+    int non_empty_groups = 0;
+
+    for (int group = 0; group < MAX_CPU_GROUPS; group++)
+    {
+        // a group counts as soon as any bit of its mask is set
+        non_empty_groups += (masks[group] != 0) ? 1 : 0;
+    }
+    return non_empty_groups;
 }
 #elif defined __ANDROID__ || defined __linux__
 CpuSet::CpuSet()
diff --git a/src/cpu.h b/src/cpu.h
index cbf417111f6d..1717904b2f32 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -28,9 +28,21 @@ class NCNN_EXPORT CpuSet
     bool is_enabled(int cpu) const;
     int num_enabled() const;
 
+#if defined _WIN32
+    int get_max_cpus() const
+    {
+        return max_cpus;
+    }
+    ULONG_PTR get_group_mask(int group) const;
+    int get_active_group_count() const;
+#endif
+
 public:
 #if defined _WIN32
-    ULONG_PTR mask;
+    static const int MAX_CPU_GROUPS = 20;
+    ULONG_PTR masks[MAX_CPU_GROUPS];
+    int max_cpus;
+    int active_groups;
 #endif
 #if defined __ANDROID__ || defined __linux__
     cpu_set_t cpu_set;
@@ -118,7 +130,6 @@ NCNN_EXPORT int cpu_support_riscv_xtheadvector();
 // vlenb = riscv vector length in bytes
 NCNN_EXPORT int cpu_riscv_vlenb();
 
-// cpu info
 NCNN_EXPORT int get_cpu_count();
 NCNN_EXPORT int get_little_cpu_count();
 NCNN_EXPORT int get_big_cpu_count();
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 5a0940e88c6b..fa7a4d564413 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -166,3 +166,7 @@ ncnn_add_layer_test(Tile)
 ncnn_add_layer_test(UnaryOp)
 ncnn_add_layer_test(Unfold)
 ncnn_add_layer_test(Yolov3DetectionOutput)
+
+if(WIN32)
+    ncnn_add_test(ncnn_cpu_cores)
+endif()
diff --git a/tests/test_ncnn_cpu_cores.cpp b/tests/test_ncnn_cpu_cores.cpp
new file mode 100644
index 000000000000..6577dffb5416
--- /dev/null
+++ b/tests/test_ncnn_cpu_cores.cpp
@@ -0,0 +1,297 @@
+#include <stdio.h>
+#include <windows.h>
+#include <vector>
+#include <thread>
+#include <chrono>
+#include "cpu.h"
+
+static void print_separator(const char* title)
+{
+    fprintf(stdout, "\n=== %s ===\n", title); // section banner, same stream as printf
+}
+
+// Print the cpu counts reported by ncnn; fails (-1) when the total count is not positive.
+static int test_basic_cpu_info()
+{
+    print_separator("Basic CPU Information Test");
+
+    const int total = ncnn::get_cpu_count();
+    const int big = ncnn::get_big_cpu_count();
+    const int little = ncnn::get_little_cpu_count();
+    const int physical = ncnn::get_physical_cpu_count();
+
+    printf("CPU Count: %d\n", total);
+    printf("Big CPU Count: %d\n", big);
+    printf("Little CPU Count: %d\n", little);
+    printf("Physical CPU Count: %d\n", physical);
+
+    // the only check here: the total count must be positive
+    if (total > 0)
+        return 0;
+
+    printf("ERROR: Invalid CPU count\n");
+    return -1;
+}
+
+// Print ncnn's cpu count next to the counts reported directly by the Windows APIs (informational, always returns 0).
+static int test_windows_api_comparison()
+{
+    print_separator("Windows API Comparison Test");
+
+    const int ncnn_cpu_count = ncnn::get_cpu_count();
+
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo(&sysinfo);
+    const DWORD win_cpu_count = sysinfo.dwNumberOfProcessors;
+
+    printf("NCNN detected CPUs: %d\n", ncnn_cpu_count);
+    // DWORD is an unsigned long, so print it with %lu rather than %d
+    printf("Windows GetSystemInfo CPUs: %lu\n", (unsigned long)win_cpu_count);
+
+    // first call only asks for the required buffer size
+    DWORD buffer_size = 0;
+    GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size);
+    if (buffer_size == 0)
+        return 0;
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore,
+                                          (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data(), &buffer_size))
+        return 0;
+
+    int core_count = 0;
+    int group_count = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data();
+    while ((char*)current < buffer.data() + buffer_size)
+    {
+        if (current->Relationship == RelationProcessorCore)
+        {
+            core_count++;
+            // a core record's GroupCount does not say how many groups exist; derive that from the highest group number seen
+            for (WORD i = 0; i < current->Processor.GroupCount; i++)
+            {
+                const int groups_so_far = (int)current->Processor.GroupMask[i].Group + 1;
+                if (groups_so_far > group_count)
+                    group_count = groups_so_far;
+            }
+        }
+        current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char*)current + current->Size);
+    }
+
+    printf("GetLogicalProcessorInformationEx cores: %d\n", core_count);
+    printf("Processor groups detected: %d\n", group_count);
+    if (core_count > 64)
+        printf("SUCCESS: Detected >64 core system\n");
+    return 0;
+}
+
+// Exercise enable/is_enabled/disable/disable_all on a CpuSet; returns -1 on the first failed check.
+static int test_cpuset_basic_operations()
+{
+    print_separator("CpuSet Basic Operations Test");
+
+    ncnn::CpuSet cpuset;
+    const int initial_enabled = cpuset.num_enabled();
+    printf("Initial enabled CPUs: %d\n", initial_enabled);
+
+    // enable at most the first 8 cpus; plain comparison instead of the min() macro, which <windows.h> omits under NOMINMAX
+    const int cpu_count = ncnn::get_cpu_count();
+    const int enable_limit = cpu_count < 8 ? cpu_count : 8;
+    for (int i = 0; i < enable_limit; i++)
+    {
+        cpuset.enable(i);
+        if (cpuset.is_enabled(i))
+            continue;
+
+        printf("ERROR: Failed to enable CPU %d\n", i);
+        return -1;
+    }
+
+    printf("Enabled first 8 CPUs, total enabled: %d\n", cpuset.num_enabled());
+
+    // a disabled cpu must no longer be reported as enabled
+    cpuset.disable(0);
+    if (cpuset.is_enabled(0))
+    {
+        printf("ERROR: Failed to disable CPU 0\n");
+        return -1;
+    }
+
+    printf("Disabled CPU 0, total enabled: %d\n", cpuset.num_enabled());
+
+    // disable_all must leave no cpu enabled
+    cpuset.disable_all();
+    if (cpuset.num_enabled() != 0)
+    {
+        printf("ERROR: disable_all failed\n");
+        return -1;
+    }
+
+    printf("After disable_all, enabled CPUs: %d\n", cpuset.num_enabled());
+
+    return 0;
+}
+
+// Enable every cpu reported by ncnn and, on machines with more than 64 cpus, the ids around the 64-cpu boundary.
+static int test_cpuset_large_core_numbers()
+{
+    print_separator("CpuSet Large Core Numbers Test");
+
+    ncnn::CpuSet cpuset;
+    const int cpu_count = ncnn::get_cpu_count();
+
+    for (int i = 0; i < cpu_count; i++)
+    {
+        cpuset.enable(i);
+    }
+
+    const int enabled_count = cpuset.num_enabled();
+    printf("Enabled all %d CPUs, actual enabled: %d\n", cpu_count, enabled_count);
+
+    // a mismatch is only reported, it does not fail the test
+    if (enabled_count != cpu_count)
+    {
+        printf("WARNING: Mismatch between expected and actual enabled CPUs\n");
+    }
+
+    // the boundary check below only applies to machines with more than 64 cpus
+    if (cpu_count <= 64)
+        return 0;
+
+    printf("Testing >64 core boundary...\n");
+    cpuset.disable_all();
+
+    // ids 60..67 straddle the boundary; plain comparison instead of the min() macro, which <windows.h> omits under NOMINMAX
+    const int boundary_end = cpu_count < 68 ? cpu_count : 68;
+    for (int i = 60; i < boundary_end; i++)
+    {
+        cpuset.enable(i);
+        if (!cpuset.is_enabled(i))
+        {
+            printf("ERROR: Failed to enable CPU %d (around 64-core boundary)\n", i);
+            return -1;
+        }
+    }
+
+    printf("Successfully enabled CPUs around 64-core boundary\n");
+    return 0;
+}
+
+#ifdef _WIN32
+// Print the Windows-only CpuSet accessors and, when several groups are active, enable cpu ids 0..3 and 64..67.
+static int test_windows_specific_features()
+{
+    print_separator("Windows Specific Features Test");
+
+    ncnn::CpuSet cpuset;
+
+    const int max_cpus = cpuset.get_max_cpus();
+    const int active_groups = cpuset.get_active_group_count();
+
+    printf("Max CPUs: %d\n", max_cpus);
+    printf("Active processor groups: %d\n", active_groups);
+
+    // dump the raw mask of up to four leading groups
+    for (int group = 0; group < active_groups && group < 4; group++)
+    {
+        const ULONG_PTR mask = cpuset.get_group_mask(group);
+        printf("Group %d mask: 0x%llx\n", group, (unsigned long long)mask);
+    }
+
+    // the multi-group part needs more than one active group
+    if (active_groups <= 1)
+        return 0;
+
+    printf("Testing multi-group CPU enabling...\n");
+    cpuset.disable_all();
+
+    // plain comparisons instead of the min() macro, which <windows.h> omits under NOMINMAX
+    const int group0_end = max_cpus < 4 ? max_cpus : 4;
+    const int group1_end = max_cpus < 68 ? max_cpus : 68;
+
+    // ids 0..3 fall into the first 64-id block
+    for (int i = 0; i < group0_end; i++)
+    {
+        cpuset.enable(i);
+    }
+
+    // ids 64..67 fall into the second 64-id block; the loop is empty when max_cpus <= 64
+    for (int i = 64; i < group1_end; i++)
+    {
+        cpuset.enable(i);
+    }
+
+    printf("Multi-group test completed, enabled CPUs: %d\n", cpuset.num_enabled());
+
+    return 0;
+}
+#endif
+
+// Print ncnn's predefined affinity masks, then try to pin the calling thread to every other cpu (always returns 0).
+static int test_thread_affinity()
+{
+    print_separator("Thread Affinity Test");
+
+    // fetch the masks for modes 0, 1 and 2 before printing anything
+    const ncnn::CpuSet& all_set = ncnn::get_cpu_thread_affinity_mask(0);
+    const ncnn::CpuSet& little_set = ncnn::get_cpu_thread_affinity_mask(1);
+    const ncnn::CpuSet& big_set = ncnn::get_cpu_thread_affinity_mask(2);
+
+    printf("All cores mask enabled CPUs: %d\n", all_set.num_enabled());
+    printf("Little cores mask enabled CPUs: %d\n", little_set.num_enabled());
+    printf("Big cores mask enabled CPUs: %d\n", big_set.num_enabled());
+
+    // build a custom mask with the even-numbered cpus
+    ncnn::CpuSet custom_mask;
+    const int cpu_count = ncnn::get_cpu_count();
+    int cpu = 0;
+    while (cpu < cpu_count)
+    {
+        custom_mask.enable(cpu);
+        cpu += 2;
+    }
+
+    printf("Setting custom affinity with %d CPUs...\n", custom_mask.num_enabled());
+    const int result = ncnn::set_cpu_thread_affinity(custom_mask);
+
+    // a failure is only reported; it does not fail the test
+    if (result != 0)
+    {
+        printf("Thread affinity setting failed with code: %d\n", result);
+        return 0;
+    }
+
+    printf("Thread affinity set successfully\n");
+    return 0;
+}
+
+// Run every test in order, OR the results together and use the combined value as the process exit code.
+int main()
+{
+    printf("NCNN CPU Core Support Test for Windows 64+ Cores\n");
+    printf("================================================\n");
+
+    int failures = 0;
+    failures |= test_basic_cpu_info();
+    failures |= test_windows_api_comparison();
+    failures |= test_cpuset_basic_operations();
+    failures |= test_cpuset_large_core_numbers();
+
+#ifdef _WIN32
+    failures |= test_windows_specific_features();
+#endif
+
+    failures |= test_thread_affinity();
+
+    print_separator("Test Summary");
+
+    // every test returns 0 on success, so a non-zero value means at least one failure
+    if (failures != 0)
+    {
+        printf("Some tests FAILED (return code: %d)\n", failures);
+        return failures;
+    }
+
+    printf("All tests PASSED\n");
+    return 0;
+}
\ No newline at end of file