 
 #if !NCNN_SIMPLESTL
 #include <algorithm>
-#include <cstdint>
+#include <stdint.h>
 #include <utility>
 #include <vector>
 #endif
@@ -1767,7 +1767,7 @@ static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::Cp
     if (glpie != NULL)
     {
         DWORD bufferSize = 0;
-        glpie(RelationProcessorCore, nullptr, &bufferSize);
+        glpie(RelationProcessorCore, NULL, &bufferSize);
         std::vector<BYTE> buffer(bufferSize);
         if (!GetLogicalProcessorInformationEx(RelationProcessorCore,
                                               (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(buffer.data()), &bufferSize))
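The call above follows the usual Win32 two-step query: the first `GetLogicalProcessorInformationEx` call (through the `glpie` function pointer) is made with a NULL buffer so the API reports the required size in `bufferSize`, and the second call fills the allocated buffer. A minimal standalone sketch of that pattern, independent of ncnn's surrounding code; the `count_physical_cores` name is made up for illustration:

```cpp
// Sketch only: the standard two-call pattern for GetLogicalProcessorInformationEx.
// count_physical_cores is a hypothetical helper, not part of ncnn.
#include <windows.h>
#include <vector>

static int count_physical_cores()
{
    DWORD bufferSize = 0;

    // First call passes a NULL buffer; it fails but reports the needed size.
    GetLogicalProcessorInformationEx(RelationProcessorCore, NULL, &bufferSize);

    std::vector<BYTE> buffer(bufferSize);
    if (!GetLogicalProcessorInformationEx(RelationProcessorCore,
                                          (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)buffer.data(), &bufferSize))
        return -1;

    // Each record is variable-length; advance by its Size field.
    int cores = 0;
    BYTE* p = buffer.data();
    while (p < buffer.data() + bufferSize)
    {
        const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* info = (const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)p;
        if (info->Relationship == RelationProcessorCore)
            cores++;
        p += info->Size;
    }
    return cores;
}
```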
@@ -2418,15 +2418,15 @@ namespace ncnn {
 
 // New unified CpuSet implementation supporting >64 CPUs
 CpuSet::CpuSet()
-    : fast_mask(0), extended_mask(nullptr), extended_capacity(0), use_extended(false)
+    : fast_mask(0), extended_mask(NULL), extended_capacity(0), use_extended(false)
 #if defined _WIN32
       ,
       legacy_mask_cache(0),
       legacy_mask_valid(false)
 #endif
 #if defined __ANDROID__ || defined __linux__
       ,
-      cpu_set_cache(nullptr),
+      cpu_set_cache(NULL),
       cpu_set_valid(false)
 #endif
 #if __APPLE__
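For context on the members being initialized here, a speculative sketch of how a `fast_mask` / `extended_mask` split like this can support more than 64 CPUs: indices 0..63 stay in a single 64-bit word, and only a higher index forces a heap-allocated bitmap. This is not ncnn's actual implementation, just an illustration of the field names that appear in the initializer list.

```cpp
// Speculative sketch, not ncnn's code: a two-tier CPU set.
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct TinyCpuSet
{
    uint64_t fast_mask;      // bits for CPUs 0..63
    uint64_t* extended_mask; // heap bitmap, used only when needed
    int extended_capacity;   // number of 64-bit words in extended_mask
    bool use_extended;

    TinyCpuSet() : fast_mask(0), extended_mask(NULL), extended_capacity(0), use_extended(false) {}
    ~TinyCpuSet() { free(extended_mask); }

    void enable(int cpu)
    {
        if (!use_extended && cpu < 64)
        {
            fast_mask |= (uint64_t)1 << cpu;
            return;
        }
        int words = cpu / 64 + 1;
        if (words > extended_capacity)
        {
            uint64_t* grown = (uint64_t*)calloc(words, sizeof(uint64_t));
            if (!grown) return;
            if (extended_mask) memcpy(grown, extended_mask, extended_capacity * sizeof(uint64_t));
            else grown[0] = fast_mask; // migrate the fast-path bits once
            free(extended_mask);
            extended_mask = grown;
            extended_capacity = words;
            use_extended = true;
        }
        extended_mask[cpu / 64] |= (uint64_t)1 << (cpu % 64);
    }
};
```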
@@ -2438,15 +2438,15 @@ CpuSet::CpuSet()
 }
 
 CpuSet::CpuSet(const CpuSet& other)
-    : fast_mask(0), extended_mask(nullptr), extended_capacity(0), use_extended(false)
+    : fast_mask(0), extended_mask(NULL), extended_capacity(0), use_extended(false)
 #if defined _WIN32
       ,
       legacy_mask_cache(0),
       legacy_mask_valid(false)
 #endif
 #if defined __ANDROID__ || defined __linux__
       ,
-      cpu_set_cache(nullptr),
+      cpu_set_cache(NULL),
       cpu_set_valid(false)
 #endif
 #if __APPLE__
@@ -2487,7 +2487,7 @@ void CpuSet::copy_from(const CpuSet& other)
     if (extended_mask)
    {
         free(extended_mask);
-        extended_mask = nullptr;
+        extended_mask = NULL;
     }
     extended_capacity = 0;
 
@@ -2515,7 +2515,7 @@ void CpuSet::copy_from(const CpuSet& other)
     if (cpu_set_cache)
     {
         CPU_FREE(cpu_set_cache);
-        cpu_set_cache = nullptr;
+        cpu_set_cache = NULL;
     }
 #endif
 #if __APPLE__
@@ -2666,18 +2666,20 @@ bool CpuSet::is_enabled(int cpu) const
 // Helper function to count bits in a 64-bit integer
 static int popcount64(uint64_t x)
 {
-#if defined(__GNUC__) || defined(__clang__)
-    return __builtin_popcountll(x);
-#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
     // __popcnt64 is only available on x86/x64, not on ARM
     return (int)__popcnt64(x);
+#elif (defined(__GNUC__) || defined(__clang__)) && defined(__POPCNT__) && !defined(__FREESTANDING__) && !NCNN_SIMPLESTL
+    // Only use builtin if POPCNT instruction is available
+    return __builtin_popcountll(x);
 #else
-    // Fallback implementation for ARM and other architectures
+    // Fallback implementation for compatibility
+    // Use Brian Kernighan's algorithm for better performance
     int count = 0;
     while (x)
     {
-        count += x & 1;
-        x >>= 1;
+        x &= x - 1; // Clear the lowest set bit
+        count++;
     }
     return count;
 #endif
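The new fallback clears one set bit per iteration (Brian Kernighan's trick), so the loop runs once per set bit instead of once per bit position. A self-contained check of the same idea; the `popcount64_fallback` name is illustrative, not ncnn's:

```cpp
// Sketch: Brian Kernighan's bit count, with a small sanity check against known values.
#include <assert.h>
#include <stdint.h>

static int popcount64_fallback(uint64_t x)
{
    int count = 0;
    while (x)
    {
        x &= x - 1; // clears the lowest set bit, so the loop runs popcount(x) times
        count++;
    }
    return count;
}

int main()
{
    assert(popcount64_fallback(0) == 0);
    assert(popcount64_fallback(0xFFFFFFFFFFFFFFFFull) == 64);
    assert(popcount64_fallback(0x8000000000000001ull) == 2);
    return 0;
}
```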
@@ -2835,7 +2837,7 @@ const cpu_set_t* CpuSet::get_cpu_set() const
     {
         cpu_set_cache = CPU_ALLOC(CPU_SETSIZE);
         if (!cpu_set_cache)
-            return nullptr;
+            return NULL;
     }
 
     CPU_ZERO_S(CPU_ALLOC_SIZE(CPU_SETSIZE), cpu_set_cache);
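CPU_ALLOC, CPU_ALLOC_SIZE, CPU_ZERO_S, and CPU_FREE are glibc's dynamically sized cpu_set_t interface: CPU_ALLOC(n) returns a set sized for n CPUs, CPU_ALLOC_SIZE(n) gives the byte size the *_S macros expect, and CPU_FREE releases it. A minimal standalone sketch of that allocation pattern (Linux-only, assumes _GNU_SOURCE; the `build_cpu_set` name is hypothetical, not an ncnn function):

```cpp
// Sketch: glibc's dynamic cpu_set_t allocation pattern.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>
#include <stdio.h>

static cpu_set_t* build_cpu_set(int num_cpus, int enable_cpu)
{
    cpu_set_t* set = CPU_ALLOC(num_cpus);
    if (!set)
        return NULL;

    size_t size = CPU_ALLOC_SIZE(num_cpus); // byte size the *_S macros expect
    CPU_ZERO_S(size, set);
    CPU_SET_S(enable_cpu, size, set);
    return set;
}

int main()
{
    cpu_set_t* set = build_cpu_set(CPU_SETSIZE, 0);
    if (set)
    {
        printf("cpu0 enabled: %d\n", CPU_ISSET_S(0, CPU_ALLOC_SIZE(CPU_SETSIZE), set));
        CPU_FREE(set);
    }
    return 0;
}
```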