Skip to content

Commit 9ab2fbe

Browse files
committed
Enable support for Intel Vector extensions.
Configure the MPI_Op support according to the architecture capabilities. On Intel processors detect at runtime the level of hardware support for vector instructions and use them accordingly (SSE/AVX/AVX2/AVX512). Signed-off-by: George Bosilca <[email protected]>
1 parent 37a96e9 commit 9ab2fbe

File tree

6 files changed

+988
-656
lines changed

6 files changed

+988
-656
lines changed

ompi/mca/op/avx/configure.m4

Lines changed: 85 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
2020
AC_CONFIG_FILES([ompi/mca/op/avx/Makefile])
2121

2222
op_avx_support=0
23+
op_avx2_support=0
24+
op_avx512_support=0
2325
OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])
2426

2527
AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
@@ -34,11 +36,11 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
3436
__m512 vA, vB;
3537
_mm512_add_ps(vA, vB)
3638
]])],
37-
[op_avx_support=1
39+
[op_avx512_support=1
3840
AC_MSG_RESULT([yes])],
3941
[AC_MSG_RESULT([no])])
4042

41-
AS_IF([test $op_avx_support -eq 0],
43+
AS_IF([test $op_avx512_support -eq 0],
4244
[AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
4345
op_avx_cflags_save="$CFLAGS"
4446
CFLAGS="$CFLAGS -march=skylake-avx512"
@@ -48,18 +50,97 @@ AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
4850
__m512 vA, vB;
4951
_mm512_add_ps(vA, vB)
5052
]])],
51-
[op_avx_support=1
53+
[op_avx512_support=1
5254
op_avx_CPPFLAGS="-march=skylake-avx512"
5355
AC_MSG_RESULT([yes])],
5456
[AC_MSG_RESULT([no])])
5557
CFLAGS="$op_avx_cflags_save"
5658
])
59+
#
60+
# No support for the AVX512 instruction set. Let's see if we can fall back
61+
# to an earlier instruction set (AVX2).
62+
#
63+
AS_IF([test $op_avx512_support -eq 0],
64+
[AC_MSG_CHECKING([for AVX2 support (no additional flags)])
65+
op_avx_cflags_save="$CFLAGS"
66+
AC_LINK_IFELSE(
67+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
68+
[[
69+
__m256 vA, vB;
70+
_mm256_add_ps(vA, vB)
71+
]])],
72+
[op_avx2_support=1
73+
AC_MSG_RESULT([yes])],
74+
[AC_MSG_RESULT([no])])
75+
CFLAGS="$op_avx_cflags_save"
76+
AS_IF([test $op_avx2_support -eq 0],
77+
[AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
78+
op_avx_cflags_save="$CFLAGS"
79+
CFLAGS="$CFLAGS -mavx2"
80+
AC_LINK_IFELSE(
81+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
82+
[[
83+
__m256 vA, vB;
84+
_mm256_add_ps(vA, vB)
85+
]])],
86+
[op_avx2_support=1
87+
op_avx_CPPFLAGS="-mavx2"
88+
AC_MSG_RESULT([yes])],
89+
[AC_MSG_RESULT([no])])
90+
CFLAGS="$op_avx_cflags_save"
91+
])
92+
],
93+
[AC_MSG_NOTICE([Assume support for AVX2 (implied by support for AVX512)])
94+
op_avx2_support=1])
95+
#
96+
# No support for the AVX512 nor AVX2 instruction sets. Fall back
97+
# to an even earlier instruction set (AVX).
98+
#
99+
AS_IF([test $op_avx2_support -eq 0],
100+
[AC_MSG_CHECKING([for AVX support (no additional flags)])
101+
op_avx_cflags_save="$CFLAGS"
102+
AC_LINK_IFELSE(
103+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
104+
[[
105+
__m128 vA, vB;
106+
_mm128_add_ps(vA, vB)
107+
]])],
108+
[op_avx_support=1
109+
AC_MSG_RESULT([yes])],
110+
[AC_MSG_RESULT([no])])
111+
CFLAGS="$op_avx_cflags_save"
112+
AS_IF([test $op_avx_support -eq 0],
113+
[AC_MSG_CHECKING([for AVX support (with -mavx)])
114+
op_avx_cflags_save="$CFLAGS"
115+
CFLAGS="$CFLAGS -mavx"
116+
AC_LINK_IFELSE(
117+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
118+
[[
119+
__m128 vA, vB;
120+
_mm128_add_ps(vA, vB)
121+
]])],
122+
[op_avx_support=1
123+
op_avx_CPPFLAGS="-mavx"
124+
AC_MSG_RESULT([yes])],
125+
[AC_MSG_RESULT([no])])
126+
CFLAGS="$op_avx_cflags_save"
127+
])
128+
],
129+
[AC_MSG_NOTICE([Assume support for AVX (implied by support for AVX2)])
130+
op_avx_support=1])
131+
57132
AC_LANG_POP([C])
58133
])
59134

60135
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX512],
136+
[$op_avx512_support],
137+
[Whetever AVX512 is supported in the current build])
138+
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX2],
139+
[$op_avx2_support],
140+
[Whetever AVX2 is supported in the current build])
141+
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX],
61142
[$op_avx_support],
62-
[Whetever AVX512 is supported in the current compilation context])
143+
[Whetever AVX is supported in the current build])
63144
AC_SUBST([op_avx_CPPFLAGS])
64145
OPAL_VAR_SCOPE_POP
65146
AS_IF([test $op_avx_support -eq 1],

ompi/mca/op/avx/op_avx.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,15 @@
2121

2222
BEGIN_C_DECLS
2323

24+
#define OMPI_OP_AVX_HAS_AVX512BW_FLAG 0x00000200
25+
#define OMPI_OP_AVX_HAS_AVX512F_FLAG 0x00000100
26+
#define OMPI_OP_AVX_HAS_AVX2_FLAG 0x00000020
27+
#define OMPI_OP_AVX_HAS_AVX_FLAG 0x00000010
28+
#define OMPI_OP_AVX_HAS_SSE4_1_FLAG 0x00000008
29+
#define OMPI_OP_AVX_HAS_SSE3_FLAG 0x00000004
30+
#define OMPI_OP_AVX_HAS_SSE2_FLAG 0x00000002
31+
#define OMPI_OP_AVX_HAS_SSE_FLAG 0x00000001
32+
2433
/**
2534
* Derive a struct from the base op component struct, allowing us to
2635
* cache some component-specific information on our well-known
@@ -37,9 +46,7 @@ typedef struct {
3746
avxs; replace them with whatever is relevant for your
3847
component. */
3948

40-
/** A simple boolean indicating whether double precision is
41-
supported. */
42-
bool double_supported;
49+
uint32_t flags; /* AVX capabilities supported by the processor */
4350
} ompi_op_avx_component_t;
4451

4552
/**

ompi/mca/op/avx/op_avx_component.c

Lines changed: 71 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,19 @@ static int avx_component_register(void);
4242

4343
#include <immintrin.h>
4444

45-
static int has_intel_AVX512f_features(void)
45+
static uint32_t has_intel_AVX_features(void)
4646
{
47-
const unsigned long avx512_features = _FEATURE_AVX512F;
47+
uint32_t flags = 0;
4848

49-
return _may_i_use_cpu_feature( avx512_features );
49+
flags |= _may_i_use_cpu_feature(_FEATURE_AVX512F) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0;
50+
flags |= _may_i_use_cpu_feature(_FEATURE_AVX512BW) ? OMPI_OP_AVX_HAS_AVX512FBW_FLAG : 0;
51+
flags |= _may_i_use_cpu_feature(_FEATURE_AVX2) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0;
52+
flags |= _may_i_use_cpu_feature(_FEATURE_AVX) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0;
53+
flags |= _may_i_use_cpu_feature(_FEATURE_SSE4_1) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0;
54+
flags |= _may_i_use_cpu_feature(_FEATURE_SSE3) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0;
55+
flags |= _may_i_use_cpu_feature(_FEATURE_SSE2) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0;
56+
flags |= _may_i_use_cpu_feature(_FEATURE_SSE) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0;
57+
return flags;
5058
}
5159
#else /* non-Intel compiler */
5260
#include <stdint.h>
@@ -60,35 +68,49 @@ static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
6068
#if defined(_MSC_VER)
6169
__cpuidex(abcd, eax, ecx);
6270
#else
63-
uint32_t ebx, edx;
71+
uint32_t ebx = 0, edx = 0;
6472
#if defined( __i386__ ) && defined ( __PIC__ )
6573
/* in case of PIC under 32-bit EBX cannot be clobbered */
6674
__asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
67-
"+a" (eax), "=c" (ecx), "=d" (edx) );
6875
#else
69-
__asm__ ( "cpuid" : "=b" (ebx),
70-
"+a" (eax), "+c" (ecx), "=d" (edx) );
76+
__asm__ ( "cpuid" : "+b" (ebx),
7177
#endif /* defined( __i386__ ) && defined ( __PIC__ ) */
72-
abcd[0] = eax; abcd[1] = ebx; abcd[3] = ecx; abcd[3] = edx;
78+
"+a" (eax), "+c" (ecx), "=d" (edx) );
79+
abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
7380
#endif
7481
}
7582

76-
static int has_intel_AVX512f_features(void)
83+
static uint32_t has_intel_AVX_features(void)
7784
{
78-
uint32_t abcd[4];
79-
//uint32_t avx2_mask = (1 << 5); // AVX2
80-
uint32_t avx2f_mask = (1 << 16); // AVX2F
85+
/* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */
86+
const uint32_t avx512f_mask = (1U << 16); // AVX512F (EAX = 7, ECX = 0) : EBX
87+
const uint32_t avx512_bw_mask = (1U << 30); // AVX512BW (EAX = 7, ECX = 0) : EBX
88+
const uint32_t avx2_mask = (1U << 5); // AVX2 (EAX = 7, ECX = 0) : EBX
89+
const uint32_t avx_mask = (1U << 28); // AVX (EAX = 1, ECX = 0) : ECX
90+
const uint32_t sse4_1_mask = (1U << 19); // SSE4.1 (EAX = 1, ECX = 0) : ECX
91+
const uint32_t sse3_mask = (1U << 0); // SSE3 (EAX = 1, ECX = 0) : ECX
92+
const uint32_t sse2_mask = (1U << 26); // SSE2 (EAX = 1, ECX = 0) : EDX
93+
const uint32_t sse_mask = (1U << 15); // SSE (EAX = 1, ECX = 0) : EDX
94+
uint32_t flags = 0, abcd[4];
8195

96+
run_cpuid( 1, 0, abcd );
97+
flags |= (abcd[2] & avx_mask) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0;
98+
flags |= (abcd[2] & sse4_1_mask) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0;
99+
flags |= (abcd[2] & sse3_mask) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0;
100+
flags |= (abcd[3] & sse2_mask) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0;
101+
flags |= (abcd[3] & sse_mask) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0;
82102
#if defined(__APPLE__)
83-
uint32_t osxsave_mask = (1 << 27); // OSX.
84-
run_cpuid( 1, 0, abcd );
85-
// OS supports extended processor state management ?
86-
if ( (abcd[2] & osxsave_mask) != osxsave_mask )
87-
return 0;
103+
uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27)); /* FMA(12) + MOVBE (22) OSXSAVE (27) */
104+
// OS supports extended processor state management ?
105+
if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask )
106+
return 0;
88107
#endif /* defined(__APPLE__) */
89108

90-
run_cpuid( 7, 0, abcd );
91-
return ((abcd[1] & avx2f_mask) == avx2f_mask);
109+
run_cpuid( 7, 0, abcd );
110+
flags |= (abcd[1] & avx512f_mask) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0;
111+
flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
112+
flags |= (abcd[1] & avx2_mask) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0;
113+
return flags;
92114
}
93115
#endif /* non-Intel compiler */
94116

@@ -119,16 +141,14 @@ ompi_op_avx_component_t mca_op_avx_component = {
119141
*/
120142
static int avx_component_open(void)
121143
{
122-
/* A first level check to see if avx is even available in this
123-
process. E.g., you may want to do a first-order check to see
124-
if hardware is available. If so, return OMPI_SUCCESS. If not,
125-
return anything other than OMPI_SUCCESS and the component will
126-
silently be ignored.
127-
128-
Note that if this function returns non-OMPI_SUCCESS, then this
129-
component won't even be shown in ompi_info output (which is
130-
probably not what you want).
131-
*/
144+
mca_op_avx_component.flags = has_intel_AVX_features();
145+
/* A first level check to see what level of AVX is available on the
146+
* hardware.
147+
*
148+
* Note that if this function returns non-OMPI_SUCCESS, then this
149+
* component won't even be shown in ompi_info output (which is
150+
* probably not what you want).
151+
*/
132152
return OMPI_SUCCESS;
133153
}
134154

@@ -153,26 +173,36 @@ static int avx_component_close(void)
153173
static int
154174
avx_component_register(void)
155175
{
156-
mca_op_avx_component.double_supported = true;
176+
int32_t requested_flags;
177+
requested_flags = mca_op_avx_component.flags = has_intel_AVX_features();
157178
(void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
158-
"double_supported",
159-
"Whether the double precision data types are supported or not",
160-
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
179+
"support",
180+
"Level of SSE/MMX/AVX support to be used (combination of processor capabilities as follow SSE 0x01, SSE2 0x02, SSE3 0x04, SSE4.1 0x08, AVX 0x010, AVX2 0x020, AVX512F 0x100, AVX512BW 0x200) capped by the local architecture capabilities",
181+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
161182
OPAL_INFO_LVL_9,
162-
MCA_BASE_VAR_SCOPE_READONLY,
163-
&mca_op_avx_component.double_supported);
164-
183+
MCA_BASE_VAR_SCOPE_LOCAL,
184+
&mca_op_avx_component.flags);
185+
mca_op_avx_component.flags &= requested_flags;
165186
return OMPI_SUCCESS;
166187
}
188+
#define OMPI_OP_AVX_HAS_AVX512BW_FLAG 0x00000200
189+
#define OMPI_OP_AVX_HAS_AVX512F_FLAG 0x00000100
190+
#define OMPI_OP_AVX_HAS_AVX2_FLAG 0x00000020
191+
#define OMPI_OP_AVX_HAS_AVX_FLAG 0x00000010
192+
#define OMPI_OP_AVX_HAS_SSE4_1_FLAG 0x00000008
193+
#define OMPI_OP_AVX_HAS_SSE3_FLAG 0x00000004
194+
#define OMPI_OP_AVX_HAS_SSE2_FLAG 0x00000002
195+
#define OMPI_OP_AVX_HAS_SSE_FLAG 0x00000001
196+
167197

168198
/*
169199
* Query whether this component wants to be used in this process.
170200
*/
171201
static int
172202
avx_component_init_query(bool enable_progress_threads,
173-
bool enable_mpi_thread_multiple)
203+
bool enable_mpi_thread_multiple)
174204
{
175-
if( !has_intel_AVX512f_features() )
205+
if( 0 == mca_op_avx_component.flags )
176206
return OMPI_ERR_NOT_SUPPORTED;
177207
return OMPI_SUCCESS;
178208
}
@@ -202,10 +232,10 @@ avx_component_op_query(struct ompi_op_t *op, int *priority)
202232
case OMPI_OP_BASE_FORTRAN_BXOR:
203233
module = OBJ_NEW(ompi_op_base_module_t);
204234
for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
205-
module->opm_fns[i] = ompi_op_avx_functions[op->o_f_to_c_index][i];
206-
OBJ_RETAIN(module);
235+
module->opm_fns[i] = ompi_op_avx_functions[op->o_f_to_c_index][i];
236+
OBJ_RETAIN(module);
207237
module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions[op->o_f_to_c_index][i];
208-
OBJ_RETAIN(module);
238+
OBJ_RETAIN(module);
209239
}
210240
break;
211241
case OMPI_OP_BASE_FORTRAN_LAND:

0 commit comments

Comments
 (0)