Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 180 additions & 23 deletions src/cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -875,9 +875,50 @@ static int get_cpucount()
else
count = 1;
#elif defined _WIN32
SYSTEM_INFO system_info;
GetSystemInfo(&system_info);
count = system_info.dwNumberOfProcessors;
typedef BOOL(WINAPI * LPFN_GLPIEX)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
LPFN_GLPIEX glpiex = (LPFN_GLPIEX)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformationEx");
if (glpiex != NULL)
{
DWORD length = 0;
glpiex(RelationAll, NULL, &length);

if (length > 0)
{
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(length);

if (buffer && glpiex(RelationAll, buffer, &length))
{
count = 0;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX ptr = buffer;
DWORD offset = 0;

while (offset < length)
{
if (ptr->Relationship == RelationProcessorCore)
{
for (WORD i = 0; i < ptr->Processor.GroupCount; i++)
{
count += _popcnt64(ptr->Processor.GroupMask[i].Mask);
}
}
offset += ptr->Size;
ptr = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char*)ptr + ptr->Size);
}
}

if (buffer)
{
free(buffer);
}
}
}
// If no CPU count was obtained above (count == 0), fall back to the previous GetSystemInfo-based version.
if (count == 0)
{
SYSTEM_INFO system_info;
GetSystemInfo(&system_info);
count = system_info.dwNumberOfProcessors;
}
#elif defined __ANDROID__ || defined __linux__
// get cpu count from /proc/cpuinfo
FILE* fp = fopen("/proc/cpuinfo", "rb");
Expand Down Expand Up @@ -1352,6 +1393,57 @@ static ncnn::CpuSet get_smt_cpu_mask()
{
ncnn::CpuSet smt_cpu_mask;

typedef BOOL(WINAPI * LPFN_GLPIEX)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
LPFN_GLPIEX glpiex = (LPFN_GLPIEX)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformationEx");
if (glpiex != NULL) //CPU core > 64
{
DWORD length = 0;
glpiex(RelationProcessorCore, NULL, &length);

if (length > 0)
{
std::vector<char> buffer(length);
if (glpiex(RelationProcessorCore, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data(), &length))
{
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data();

while ((char*)current < buffer.data() + length)
{
if (current->Relationship == RelationProcessorCore)
{
int total_logical_count = 0;
for (WORD group = 0; group < current->Processor.GroupCount; group++)
{
total_logical_count += _popcnt64(current->Processor.GroupMask[group].Mask);
}

if (total_logical_count > 1)
{
for (WORD group = 0; group < current->Processor.GroupCount; group++)
{
KAFFINITY mask = current->Processor.GroupMask[group].Mask;
for (int cpu = 0; cpu < 64 && mask; cpu++)
{
if (mask & (1ULL << cpu))
{
int global_cpu = group * 64 + cpu;
smt_cpu_mask.enable(global_cpu);
mask &= ~(1ULL << cpu);
}
}
}
}
}

current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char*)current + current->Size);
}

return smt_cpu_mask;
}
}
}

// Extended query unavailable or unsuccessful: use the old API, which handles at most 64 logical CPUs
typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
if (glpi == NULL)
Expand All @@ -1372,12 +1464,18 @@ static ncnn::CpuSet get_smt_cpu_mask()
{
if (ptr->Relationship == RelationProcessorCore)
{
ncnn::CpuSet smt_set;
smt_set.mask = ptr->ProcessorMask;
if (smt_set.num_enabled() > 1)
int logical_count = _popcnt64(ptr->ProcessorMask);
if (logical_count > 1)
{
// this core is smt
smt_cpu_mask.mask |= smt_set.mask;
ULONG_PTR mask = ptr->ProcessorMask;
for (int cpu = 0; cpu < 64 && mask; cpu++)
{
if (mask & (1ULL << cpu))
{
smt_cpu_mask.enable(cpu);
mask &= ~(1ULL << cpu);
}
}
}
}

Expand All @@ -1386,7 +1484,6 @@ static ncnn::CpuSet get_smt_cpu_mask()
}

free(buffer);

return smt_cpu_mask;
}

Expand Down Expand Up @@ -1432,13 +1529,25 @@ static std::vector<int> get_max_freq_mhz()

// Applies thread_affinity_mask to the calling thread (GetCurrentThread()).
// Returns 0 on the fall-through path and -1 after logging a failure via NCNN_LOGE.
// NOTE(review): this listing appears to interleave the removed and added sides of a
// diff without +/- markers. The SetThreadAffinityMask lines read the single-word
// `.mask`, while the loop reads `.masks[]` / `.active_groups` -- confirm which side
// is current before relying on the exact control flow shown here.
static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
{
DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
if (prev_mask == 0)
for (int group = 0; group < thread_affinity_mask.active_groups; group++)
{
NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
return -1;
if (thread_affinity_mask.masks[group] != 0)
{
// Build the request for this processor group; the Reserved fields are zeroed explicitly.
GROUP_AFFINITY groupAffinity;
groupAffinity.Mask = thread_affinity_mask.masks[group];
groupAffinity.Group = (WORD)group;
groupAffinity.Reserved[0] = 0;
groupAffinity.Reserved[1] = 0;
groupAffinity.Reserved[2] = 0;

if (!SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL))
{
NCNN_LOGE("SetThreadGroupAffinity failed %d", GetLastError());
return -1;
}
// Stop after the first group with a non-zero mask; later groups are not visited.
break;
}
}

return 0;
}
#endif // defined _WIN32
Expand Down Expand Up @@ -2273,34 +2382,82 @@ CpuSet::CpuSet()

// Marks logical CPU `cpu` as enabled.
// Indices outside [0, max_cpus) are rejected by the range check; otherwise bit
// (cpu % 64) of masks[cpu / 64] is set, provided the group index is below MAX_CPU_GROUPS.
// NOTE(review): the single-word `mask |= ...` line runs before the range check and
// looks like the removed side of a diff -- confirm it is not meant to remain.
void CpuSet::enable(int cpu)
{
mask |= ((ULONG_PTR)1 << cpu);
if (cpu < 0 || cpu >= max_cpus) return;

// Split the flat cpu index into a 64-wide group and a bit position inside it.
int group = cpu / 64;
int bit = cpu % 64;

if (group < MAX_CPU_GROUPS)
{
masks[group] |= (1ULL << bit);
}
}

// Marks logical CPU `cpu` as disabled.
// Indices outside [0, max_cpus) are rejected by the range check; otherwise bit
// (cpu % 64) of masks[cpu / 64] is cleared, provided the group index is below MAX_CPU_GROUPS.
// NOTE(review): the single-word `mask &= ...` line runs before the range check and
// looks like the removed side of a diff -- confirm it is not meant to remain.
void CpuSet::disable(int cpu)
{
mask &= ~((ULONG_PTR)1 << cpu);
if (cpu < 0 || cpu >= max_cpus) return;

// Split the flat cpu index into a 64-wide group and a bit position inside it.
int group = cpu / 64;
int bit = cpu % 64;

if (group < MAX_CPU_GROUPS)
{
masks[group] &= ~(1ULL << bit);
}
}

// Disables every CPU by zeroing all MAX_CPU_GROUPS entries of masks.
// NOTE(review): `mask = 0;` writes the single-word field and looks like the removed
// side of a diff -- confirm it is not meant to remain.
void CpuSet::disable_all()
{
mask = 0;
for (int i = 0; i < MAX_CPU_GROUPS; i++)
{
masks[i] = 0;
}
}

// Reports whether logical CPU `cpu` is enabled.
// The per-group path returns false for indices outside [0, max_cpus) or for a group
// index at or above MAX_CPU_GROUPS, and otherwise tests bit (cpu % 64) of masks[cpu / 64].
// NOTE(review): the first `return` reads the single-word `mask` and, as listed, makes
// everything after it unreachable; it looks like the removed side of a diff -- confirm.
bool CpuSet::is_enabled(int cpu) const
{
return mask & ((ULONG_PTR)1 << cpu);
if (cpu < 0 || cpu >= max_cpus) return false;

// Split the flat cpu index into a 64-wide group and a bit position inside it.
int group = cpu / 64;
int bit = cpu % 64;

if (group < MAX_CPU_GROUPS)
{
return (masks[group] & (1ULL << bit)) != 0;
}
return false;
}

int CpuSet::num_enabled() const
{
int num_enabled = 0;
for (int i = 0; i < (int)sizeof(mask) * 8; i++)
int count = 0;
for (int i = 0; i < MAX_CPU_GROUPS; i++)
{
if (is_enabled(i))
num_enabled++;
count += __builtin_popcountll(masks[i]);
}
return count;
}

return num_enabled;
// Returns the affinity bits stored for processor group `group`.
// A group index outside [0, MAX_CPU_GROUPS) yields 0.
ULONG_PTR CpuSet::get_group_mask(int group) const
{
    const bool in_range = (group >= 0) && (group < MAX_CPU_GROUPS);
    return in_range ? masks[group] : 0;
}

// Returns how many of the MAX_CPU_GROUPS group masks have at least one bit set.
int CpuSet::get_active_group_count() const
{
    int active = 0;
    for (int g = 0; g != MAX_CPU_GROUPS; ++g)
    {
        // A group counts as active as soon as any cpu bit in it is set.
        active += (masks[g] != 0) ? 1 : 0;
    }
    return active;
}
#elif defined __ANDROID__ || defined __linux__
CpuSet::CpuSet()
Expand Down
15 changes: 13 additions & 2 deletions src/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,21 @@ class NCNN_EXPORT CpuSet
bool is_enabled(int cpu) const;
int num_enabled() const;

#if defined _WIN32
int get_max_cpus() const
{
return max_cpus;
}
ULONG_PTR get_group_mask(int group) const;
int get_active_group_count() const;
#endif

public:
#if defined _WIN32
ULONG_PTR mask;
static const int MAX_CPU_GROUPS = 20;
ULONG_PTR masks[MAX_CPU_GROUPS];
int max_cpus;
int active_groups;
#endif
#if defined __ANDROID__ || defined __linux__
cpu_set_t cpu_set;
Expand Down Expand Up @@ -118,7 +130,6 @@ NCNN_EXPORT int cpu_support_riscv_xtheadvector();
// vlenb = riscv vector length in bytes
NCNN_EXPORT int cpu_riscv_vlenb();

// cpu info
NCNN_EXPORT int get_cpu_count();
NCNN_EXPORT int get_little_cpu_count();
NCNN_EXPORT int get_big_cpu_count();
Expand Down
4 changes: 4 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,7 @@ ncnn_add_layer_test(Tile)
ncnn_add_layer_test(UnaryOp)
ncnn_add_layer_test(Unfold)
ncnn_add_layer_test(Yolov3DetectionOutput)

# Register the ncnn_cpu_cores test only when configuring for Windows.
if(WIN32)
ncnn_add_test(ncnn_cpu_cores)
endif()
Loading
Loading