@@ -42,11 +42,19 @@ static int avx_component_register(void);
4242
4343#include <immintrin.h>
4444
45- static int has_intel_AVX512f_features (void )
45+ static uint32_t has_intel_AVX_features (void ) /* Intel-compiler variant: returns the OR of the OMPI_OP_AVX_HAS_*_FLAG bits for every feature _may_i_use_cpu_feature() reports as usable (0 if none) */
4646{
47- const unsigned long avx512_features = _FEATURE_AVX512F ;
47+ uint32_t flags = 0 ;
4848
49- return _may_i_use_cpu_feature ( avx512_features );
49+ flags |= _may_i_use_cpu_feature (_FEATURE_AVX512F ) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0 ;
50+ flags |= _may_i_use_cpu_feature (_FEATURE_AVX512BW ) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0 ; /* same flag name the CPUID-based variant sets for AVX512BW */
51+ flags |= _may_i_use_cpu_feature (_FEATURE_AVX2 ) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0 ;
52+ flags |= _may_i_use_cpu_feature (_FEATURE_AVX ) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0 ;
53+ flags |= _may_i_use_cpu_feature (_FEATURE_SSE4_1 ) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0 ;
54+ flags |= _may_i_use_cpu_feature (_FEATURE_SSE3 ) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0 ;
55+ flags |= _may_i_use_cpu_feature (_FEATURE_SSE2 ) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0 ;
56+ flags |= _may_i_use_cpu_feature (_FEATURE_SSE ) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0 ;
57+ return flags ;
5058}
5159#else /* non-Intel compiler */
5260#include <stdint.h>
@@ -60,35 +68,49 @@ static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
6068#if defined(_MSC_VER )
6169 __cpuidex (abcd , eax , ecx );
6270#else
63- uint32_t ebx , edx ;
71+ uint32_t ebx = 0 , edx = 0 ;
6472#if defined( __i386__ ) && defined ( __PIC__ )
6573 /* in case of PIC under 32-bit EBX cannot be clobbered */
6674 __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx ),
67- "+a" (eax ), "=c" (ecx ), "=d" (edx ) );
6875#else
69- __asm__ ( "cpuid" : "=b" (ebx ),
70- "+a" (eax ), "+c" (ecx ), "=d" (edx ) );
76+ __asm__ ( "cpuid" : "+b" (ebx ),
7177#endif /* defined( __i386__ ) && defined ( __PIC__ ) */
72- abcd [0 ] = eax ; abcd [1 ] = ebx ; abcd [3 ] = ecx ; abcd [3 ] = edx ;
78+ "+a" (eax ), "+c" (ecx ), "=d" (edx ) );
79+ abcd [0 ] = eax ; abcd [1 ] = ebx ; abcd [2 ] = ecx ; abcd [3 ] = edx ;
7380#endif
7481}
7582
76- static int has_intel_AVX512f_features (void )
83+ static uint32_t has_intel_AVX_features (void ) /* CPUID-based variant: returns the OR of the OMPI_OP_AVX_HAS_*_FLAG bits for every feature bit found set in CPUID leaves 1 and 7 (0 if none) */
7784{
78- uint32_t abcd [4 ];
79- //uint32_t avx2_mask = (1 << 5); // AVX2
80- uint32_t avx2f_mask = (1 << 16 ); // AVX2F
85+ /* Bit positions from https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */
86+ const uint32_t avx512f_mask = (1U << 16 ); // AVX512F (EAX = 7, ECX = 0) : EBX
87+ const uint32_t avx512_bw_mask = (1U << 30 ); // AVX512BW (EAX = 7, ECX = 0) : EBX
88+ const uint32_t avx2_mask = (1U << 5 ); // AVX2 (EAX = 7, ECX = 0) : EBX
89+ const uint32_t avx_mask = (1U << 28 ); // AVX (EAX = 1, ECX = 0) : ECX
90+ const uint32_t sse4_1_mask = (1U << 19 ); // SSE4.1 (EAX = 1, ECX = 0) : ECX
91+ const uint32_t sse3_mask = (1U << 0 ); // SSE3 (EAX = 1, ECX = 0) : ECX
92+ const uint32_t sse2_mask = (1U << 26 ); // SSE2 (EAX = 1, ECX = 0) : EDX
93+ const uint32_t sse_mask = (1U << 25 ); // SSE (EAX = 1, ECX = 0) : EDX (bit 15 of EDX is CMOV, not SSE)
94+ uint32_t flags = 0 , abcd [4 ];
8195
96+ run_cpuid ( 1 , 0 , abcd ); /* leaf 1: abcd[2] holds ECX, abcd[3] holds EDX */
97+ flags |= (abcd [2 ] & avx_mask ) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0 ;
98+ flags |= (abcd [2 ] & sse4_1_mask ) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0 ;
99+ flags |= (abcd [2 ] & sse3_mask ) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0 ;
100+ flags |= (abcd [3 ] & sse2_mask ) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0 ;
101+ flags |= (abcd [3 ] & sse_mask ) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0 ;
82102#if defined(__APPLE__ )
83- uint32_t osxsave_mask = (1 << 27 ); // OSX.
84- run_cpuid ( 1 , 0 , abcd );
85- // OS supports extended processor state management ?
86- if ( (abcd [2 ] & osxsave_mask ) != osxsave_mask )
87- return 0 ;
103+ uint32_t fma_movbe_osxsave_mask = ((1U << 12 ) | (1U << 22 ) | (1U << 27 )); /* FMA(12) + MOVBE (22) OSXSAVE (27) */
104+ // Require FMA, MOVBE and OSXSAVE (OS manages extended processor state) all together
105+ if ( (abcd [2 ] & fma_movbe_osxsave_mask ) != fma_movbe_osxsave_mask )
106+ return 0 ; /* reports no feature at all: the SSE/AVX bits gathered above are dropped too */
88107#endif /* defined(__APPLE__) */
89108
90- run_cpuid ( 7 , 0 , abcd );
91- return ((abcd [1 ] & avx2f_mask ) == avx2f_mask );
109+ run_cpuid ( 7 , 0 , abcd ); /* leaf 7, subleaf 0: abcd[1] holds EBX */
110+ flags |= (abcd [1 ] & avx512f_mask ) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0 ;
111+ flags |= (abcd [1 ] & avx512_bw_mask ) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0 ;
112+ flags |= (abcd [1 ] & avx2_mask ) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0 ;
113+ return flags ;
92114}
93115#endif /* non-Intel compiler */
94116
@@ -119,16 +141,14 @@ ompi_op_avx_component_t mca_op_avx_component = {
119141 */
120142static int avx_component_open (void )
121143{
122- /* A first level check to see if avx is even available in this
123- process. E.g., you may want to do a first-order check to see
124- if hardware is available. If so, return OMPI_SUCCESS. If not,
125- return anything other than OMPI_SUCCESS and the component will
126- silently be ignored.
127-
128- Note that if this function returns non-OMPI_SUCCESS, then this
129- component won't even be shown in ompi_info output (which is
130- probably not what you want).
131- */
144+ /* The capability flags are filled in by avx_component_register (),
145+ * which also caps them; the MCA framework registers a component before
146+ * opening it, so recomputing them here would discard that cap.
147+ *
148+ * Note that if this function returns non-OMPI_SUCCESS, then this
149+ * component won't even be shown in ompi_info output (which is
150+ * probably not what you want).
151+ */
132152 return OMPI_SUCCESS ;
133153}
134154
@@ -153,26 +173,36 @@ static int avx_component_close(void)
153173static int
154174avx_component_register (void )
155175{
156- mca_op_avx_component .double_supported = true;
176+ int32_t requested_flags ; /* despite the name, holds the feature mask detected on this machine */
177+ requested_flags = mca_op_avx_component .flags = has_intel_AVX_features (); /* detected mask doubles as the variable's initial value */
157178 (void ) mca_base_component_var_register (& mca_op_avx_component .super .opc_version ,
158- "double_supported " ,
159- "Whether the double precision data types are supported or not " ,
160- MCA_BASE_VAR_TYPE_BOOL , NULL , 0 , 0 ,
179+ "support " ,
180+ "Level of SSE/MMX/AVX support to be used (combination of processor capabilities as follow SSE 0x01, SSE2 0x02, SSE3 0x04, SSE4.1 0x08, AVX 0x010, AVX2 0x020, AVX512F 0x100, AVX512BW 0x200) capped by the local architecture capabilities " ,
181+ MCA_BASE_VAR_TYPE_INT , NULL , 0 , 0 ,
161182 OPAL_INFO_LVL_9 ,
162- MCA_BASE_VAR_SCOPE_READONLY ,
163- & mca_op_avx_component .double_supported );
164-
183+ MCA_BASE_VAR_SCOPE_LOCAL ,
184+ & mca_op_avx_component .flags ); /* presumably the registration may overwrite flags with a user-supplied value - TODO confirm */
185+ mca_op_avx_component . flags &= requested_flags ; /* keep only the bits that were also detected on this machine */
165186 return OMPI_SUCCESS ;
166187}
188+ #define OMPI_OP_AVX_HAS_AVX512BW_FLAG 0x00000200 /* AVX512BW detected. NOTE(review): in this view these definitions follow their first use - confirm they are visible before has_intel_AVX_features () */
189+ #define OMPI_OP_AVX_HAS_AVX512F_FLAG 0x00000100 /* AVX512F detected */
190+ #define OMPI_OP_AVX_HAS_AVX2_FLAG 0x00000020 /* AVX2 detected */
191+ #define OMPI_OP_AVX_HAS_AVX_FLAG 0x00000010 /* AVX detected */
192+ #define OMPI_OP_AVX_HAS_SSE4_1_FLAG 0x00000008 /* SSE4.1 detected */
193+ #define OMPI_OP_AVX_HAS_SSE3_FLAG 0x00000004 /* SSE3 detected */
194+ #define OMPI_OP_AVX_HAS_SSE2_FLAG 0x00000002 /* SSE2 detected */
195+ #define OMPI_OP_AVX_HAS_SSE_FLAG 0x00000001 /* SSE detected */
196+
167197
168198/*
169199 * Query whether this component wants to be used in this process: it declines with OMPI_ERR_NOT_SUPPORTED when no bit is set in mca_op_avx_component.flags and accepts with OMPI_SUCCESS otherwise. Neither argument is consulted.
170200 */
171201static int
172202avx_component_init_query (bool enable_progress_threads ,
173- bool enable_mpi_thread_multiple )
203+ bool enable_mpi_thread_multiple )
174204{
175- if ( ! has_intel_AVX512f_features () )
205+ if ( 0 == mca_op_avx_component . flags ) /* no usable feature bit left in the component flags */
176206 return OMPI_ERR_NOT_SUPPORTED ;
177207 return OMPI_SUCCESS ;
178208}
@@ -202,10 +232,10 @@ avx_component_op_query(struct ompi_op_t *op, int *priority)
202232 case OMPI_OP_BASE_FORTRAN_BXOR :
203233 module = OBJ_NEW (ompi_op_base_module_t );
204234 for (int i = 0 ; i < OMPI_OP_BASE_TYPE_MAX ; ++ i ) {
205- module -> opm_fns [i ] = ompi_op_avx_functions [op -> o_f_to_c_index ][i ];
206- OBJ_RETAIN (module );
235+ module -> opm_fns [i ] = ompi_op_avx_functions [op -> o_f_to_c_index ][i ];
236+ OBJ_RETAIN (module );
207237 module -> opm_3buff_fns [i ] = ompi_op_avx_3buff_functions [op -> o_f_to_c_index ][i ];
208- OBJ_RETAIN (module );
238+ OBJ_RETAIN (module );
209239 }
210240 break ;
211241 case OMPI_OP_BASE_FORTRAN_LAND :
0 commit comments