diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index 4c6a7a7b4fa..649979746d6 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -401,11 +401,10 @@ int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group, /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ -/* -** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub). -*/ -int ompi_comm_split( ompi_communicator_t* comm, int color, int key, - ompi_communicator_t **newcomm, bool pass_on_topo ) + +int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key, + opal_info_t *info, + ompi_communicator_t **newcomm, bool pass_on_topo ) { int myinfo[2]; int size, my_size; @@ -611,7 +610,11 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT FROM %d", newcomp->c_contextid, comm->c_contextid ); - + /* Copy info if there is one */ + if (info) { + newcomp->super.s_info = OBJ_NEW(opal_info_t); + opal_info_dup(info, &(newcomp->super.s_info)); + } /* Activate the communicator and init coll-component */ rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); @@ -638,6 +641,15 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, } +/* +** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub). 
+*/ +int ompi_comm_split( ompi_communicator_t* comm, int color, int key, + ompi_communicator_t **newcomm, bool pass_on_topo ) +{ + return ompi_comm_split_with_info(comm, color, key, NULL, newcomm, pass_on_topo); +} + /**********************************************************************/ /**********************************************************************/ /**********************************************************************/ diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 8936b7f1df9..01c02614885 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -463,6 +463,21 @@ int ompi_topo_dist_graph_create_adjacent(ompi_communicator_t *old_comm, OMPI_DECLSPEC int ompi_comm_split (ompi_communicator_t *comm, int color, int key, ompi_communicator_t** newcomm, bool pass_on_topo); +/** + * split a communicator based on color and key. Parameters + * are identical to the MPI-counterpart of the function. + * Similar to \see ompi_comm_split with an additional info parameter. + * + * @param comm: input communicator + * @param color + * @param key + * + * @ + */ +OMPI_DECLSPEC int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key, + opal_info_t *info, + ompi_communicator_t **newcomm, bool pass_on_topo ); + /** * split a communicator based on type and key. Parameters * are identical to the MPI-counterpart of the function. diff --git a/ompi/group/group.c b/ompi/group/group.c index f5cc88be98c..9e368c96da9 100644 --- a/ompi/group/group.c +++ b/ompi/group/group.c @@ -578,3 +578,31 @@ bool ompi_group_have_remote_peers (ompi_group_t *group) return false; } + +/** + * Count the number of processes on this group that share the same node as + * this process. 
+ */ +int ompi_group_count_local_peers (ompi_group_t *group) +{ + int local_peers = 0; + for (int i = 0 ; i < group->grp_proc_count ; ++i) { + ompi_proc_t *proc = NULL; +#if OMPI_GROUP_SPARSE + proc = ompi_group_peer_lookup (group, i); +#else + proc = ompi_group_get_proc_ptr_raw (group, i); + if (ompi_proc_is_sentinel (proc)) { + /* the proc must be stored in the group or cached in the proc + * hash table if the process resides in the local node + * (see ompi_proc_complete_init) */ + continue; + } +#endif + if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) { + local_peers++; + } + } + + return local_peers; +} diff --git a/ompi/group/group.h b/ompi/group/group.h index 661666246e9..d1cf7d99ae8 100644 --- a/ompi/group/group.h +++ b/ompi/group/group.h @@ -420,8 +420,16 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t return ompi_group_get_proc_ptr (group, peer_id, false); } +/** + * Return true if all processes in the group are not on the local node. + */ bool ompi_group_have_remote_peers (ompi_group_t *group); +/** + * Count the number of processes on the local node. 
+ */ +int ompi_group_count_local_peers (ompi_group_t *group); + /** * Function to print the group info */ diff --git a/ompi/mca/coll/adapt/coll_adapt_ibcast.c b/ompi/mca/coll/adapt/coll_adapt_ibcast.c index b22982c0114..605d6262303 100644 --- a/ompi/mca/coll/adapt/coll_adapt_ibcast.c +++ b/ompi/mca/coll/adapt/coll_adapt_ibcast.c @@ -178,7 +178,7 @@ static int send_cb(ompi_request_t * req) || (context->con->tree->tree_nextsize > 0 && rank != context->con->root && num_sent == context->con->tree->tree_nextsize * context->con->num_segs && num_recv_fini == context->con->num_segs)) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n", + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in send\n", ompi_comm_rank(context->con->comm))); ibcast_request_fini(context); } @@ -306,7 +306,7 @@ static int recv_cb(ompi_request_t * req) && num_recv_fini == context->con->num_segs) || (context->con->tree->tree_nextsize == 0 && num_recv_fini == context->con->num_segs)) { - OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n", + OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in recv\n", ompi_comm_rank(context->con->comm))); ibcast_request_fini(context); } diff --git a/ompi/mca/coll/base/coll_base_comm_select.c b/ompi/mca/coll/base/coll_base_comm_select.c index 405bd6b388e..8c6023d411d 100644 --- a/ompi/mca/coll/base/coll_base_comm_select.c +++ b/ompi/mca/coll/base/coll_base_comm_select.c @@ -38,6 +38,7 @@ #include "mpi.h" #include "ompi/communicator/communicator.h" #include "opal/util/output.h" +#include "opal/util/argv.h" #include "opal/util/show_help.h" #include "opal/class/opal_list.h" #include "opal/class/opal_object.h" @@ -312,6 +313,20 @@ static int avail_coll_compare (opal_list_item_t **a, return 0; } +static inline int +component_in_argv(char **argv, const char* component_name) +{ + if( NULL != argv ) { + while( NULL != *argv ) { + if( 0 == 
strcmp(component_name, *argv) ) { + return 1; + } + argv++; /* move to the next argument */ + } + } + return 0; +} + /* * For each module in the list, check and see if it wants to run, and * do the resulting priority comparison. Make a list of modules to be @@ -321,13 +336,66 @@ static int avail_coll_compare (opal_list_item_t **a, static opal_list_t *check_components(opal_list_t * components, ompi_communicator_t * comm) { - int priority; + int priority, flag; const mca_base_component_t *component; mca_base_component_list_item_t *cli; mca_coll_base_module_2_3_0_t *module; opal_list_t *selectable; mca_coll_base_avail_coll_t *avail; - + char info_val[OPAL_MAX_INFO_VAL+1]; + char **coll_argv = NULL, **coll_exclude = NULL, **coll_include = NULL; + + /* Check if this communicator comes with restrictions on the collective modules + * it wants to use. The restrictions are consistent with the MCA parameter + * to limit the collective components loaded, but it applies for each + * communicator and is provided as an info key during the communicator + * creation. Unlike the MCA param, this info key is used not to select + * components but either to prevent components from being used or to + * force a change in the component priority. 
+ */ + if( NULL != comm->super.s_info) { + opal_info_get(comm->super.s_info, "ompi_comm_coll_preference", + sizeof(info_val), info_val, &flag); + if( !flag ) { + goto proceed_to_select; + } + coll_argv = opal_argv_split(info_val, ','); + if(NULL == coll_argv) { + goto proceed_to_select; + } + int idx2, count_include = opal_argv_count(coll_argv); + /* Allocate the coll_include argv */ + coll_include = (char**)malloc((count_include + 1) * sizeof(char*)); + coll_include[count_include] = NULL; /* NULL terminated array */ + /* Dispatch the include/exclude in the corresponding arrays */ + for( int idx = 0; NULL != coll_argv[idx]; idx++ ) { + if( '^' == coll_argv[idx][0] ) { + coll_include[idx] = NULL; /* NULL terminated array */ + + /* Allocate the coll_exclude argv */ + coll_exclude = (char**)malloc((count_include - idx + 1) * sizeof(char*)); + /* save the exclude components */ + for( idx2 = idx; NULL != coll_argv[idx2]; idx2++ ) { + coll_exclude[idx2 - idx] = coll_argv[idx2]; + } + coll_exclude[idx2 - idx] = NULL; /* NULL-terminated array */ + coll_exclude[0] = coll_exclude[0] + 1; /* get rid of the ^ */ + count_include = idx; + break; + } + coll_include[idx] = coll_argv[idx]; + } + /* Reverse the order of the coll_inclide argv to faciliate the ordering of + * the selected components reverse. 
+ */ + for( idx2 = 0; idx2 < (count_include - 1); idx2++ ) { + char* temp = coll_include[idx2]; + coll_include[idx2] = coll_include[count_include - 1]; + coll_include[count_include - 1] = temp; + count_include--; + } + } + proceed_to_select: /* Make a list of the components that query successfully */ selectable = OBJ_NEW(opal_list_t); @@ -335,6 +403,13 @@ static opal_list_t *check_components(opal_list_t * components, OPAL_LIST_FOREACH(cli, &ompi_coll_base_framework.framework_components, mca_base_component_list_item_t) { component = cli->cli_component; + /* dont bother is we have this component in the exclusion list */ + if( component_in_argv(coll_exclude, component->mca_component_name) ) { + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:base:comm_select: component disqualified: %s (due to communicator info key)", + component->mca_component_name ); + continue; + } priority = check_one_component(comm, component, &module); if (priority >= 0) { /* We have a component that indicated that it wants to run @@ -370,6 +445,27 @@ static opal_list_t *check_components(opal_list_t * components, /* Put this list in priority order */ opal_list_sort(selectable, avail_coll_compare); + /* For all valid component reorder them not on their provided priorities but on + * the order requested in the info key. As at this point the coll_include is + * already ordered backward we can simply prepend the components. 
+ */ + mca_coll_base_avail_coll_t *item, *item_next; + OPAL_LIST_FOREACH_SAFE(item, item_next, + selectable, mca_coll_base_avail_coll_t) { + if( component_in_argv(coll_include, item->ac_component_name) ) { + opal_list_remove_item(selectable, &item->super); + opal_list_prepend(selectable, &item->super); + } + } + + opal_argv_free(coll_argv); + if( NULL != coll_exclude ) { + free(coll_exclude); + } + if( NULL != coll_include ) { + free(coll_include); + } + /* All done */ return selectable; } @@ -403,7 +499,6 @@ static int check_one_component(ompi_communicator_t * comm, return priority; } - /************************************************************************** * Query functions **************************************************************************/ diff --git a/ompi/mca/coll/base/coll_base_util.c b/ompi/mca/coll/base/coll_base_util.c index 29b4a70caca..e6b1fde3d6e 100644 --- a/ompi/mca/coll/base/coll_base_util.c +++ b/ompi/mca/coll/base/coll_base_util.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -29,6 +29,8 @@
 #include "ompi/mca/topo/base/base.h"
 #include "ompi/mca/pml/pml.h"
 #include "coll_base_util.h"
+#include "coll_base_functions.h"
+#include <ctype.h>
 
 int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount,
                                     ompi_datatype_t* sdatatype,
@@ -268,7 +270,7 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *req,
     } else {
         scount = rcount = OMPI_COMM_IS_INTER(comm)?ompi_comm_remote_size(comm):ompi_comm_size(comm);
     }
-
+
     for (int i=0; icb.req_complete_cb = NULL;
     req->req_complete_cb_data = NULL;
     req->data.objs.objs[0] = NULL;
@@ -309,35 +312,274 @@ OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, N
 /* File reading functions */
 static void skiptonewline (FILE *fptr, int *fileline)
 {
-    do {
-        char val;
-        int rc;
+    char val;
+    int rc;
 
+    do {
         rc = fread(&val, 1, 1, fptr);
-        if (0 == rc) return;
-        if ((1 == rc)&&('\n' == val)) {
+        if (0 == rc) {
+            return;
+        }
+        if ('\n' == val) {
             (*fileline)++;
             return;
-        }
+        }
     } while (1);
 }
 
-long ompi_coll_base_file_getnext (FILE *fptr, int *fileline)
+/**
+ * Read the next integer ("%li") from fptr into *val, discarding '#' comments
+ * and any character that cannot start a number; *fileline is bumped for each
+ * newline discarded that way. Returns 0 on success, -1 when input runs out.
+ */
+int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val)
 {
+    char trash;
+    int rc;
+
     do {
-        long val;
-        int rc;
-        char trash;
-
-        rc = fscanf(fptr, "%li", &val);
-        if (rc == EOF) return MYEOF;
-        if (1 == rc) return val;
-        /* in all other cases, skip to the end */
+        rc = fscanf(fptr, "%li", val);
+        if (rc == EOF) {
+            return -1;
+        }
+        if (1 == rc) {
+            return 0;
+        }
+        /* in all other cases, skip to the end of the token */
+        rc = fread(&trash, sizeof(char), 1, fptr);
+        if (1 != rc) {  /* fread reports a short count, never EOF */
+            return -1;
+        }
+        if ('\n' == trash) (*fileline)++;
+        if ('#' == trash) {
+            skiptonewline (fptr, fileline);
+        }
+    } while (1);
+}
+
+/**
+ * Read the next whitespace-delimited token from fptr, skipping '#' comments.
+ * At most 31 characters are taken per call so the token fits token[32].
+ * On success *val is a malloc'ed copy owned by the caller and 0 is returned;
+ * on end of file, read error or allocation failure *val is NULL, result -1.
+ */
+int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val)
+{
+    char trash, token[32];
+    int rc;
+
+    *val = NULL;  /* security in case we fail */
+    do {
+        rc = fscanf(fptr, "%31s", token);  /* width excludes the trailing NUL */
+        if (rc == EOF) {
+            return -1;
+        }
+        if (1 == rc) {
+            if( '#' == token[0] ) {
+                skiptonewline(fptr, fileline);
+                continue;
+            }
+            *val = (char*)malloc(strlen(token) + 1);
+            if( NULL == *val ) {
+                return -1;
+            }
+            strcpy(*val, token);
+            return 0;
+        }
+        /* in all other cases, skip to the end of the token */
+        rc = fread(&trash, sizeof(char), 1, fptr);
+        if (1 != rc) {  /* fread reports a short count, never EOF */
+            return -1;
+        }
+        if ('\n' == trash) (*fileline)++;
+        if ('#' == trash) {
+            skiptonewline (fptr, fileline);
+        }
+    } while (1);
+}
+
+/**
+ * Same contract as ompi_coll_base_file_getnext_long, but the value is parsed
+ * with the PRIsize_t conversion and stored in the size_t pointed to by val.
+ * Returns 0 on success, -1 when input runs out.
+ */
+int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val)
+{
+    char trash;
+    int rc;
+
+    do {
+        rc = fscanf(fptr, "%" PRIsize_t, val);
+        if (rc == EOF) {
+            return -1;
+        }
+        if (1 == rc) {
+            return 0;
+        }
+        /* in all other cases, skip to the end of the token */
         rc = fread(&trash, sizeof(char), 1, fptr);
-        if (rc == EOF) return MYEOF;
+        if (1 != rc) {  /* fread reports a short count, never EOF */
+            return -1;
+        }
         if ('\n' == trash) (*fileline)++;
         if ('#' == trash) {
             skiptonewline (fptr, fileline);
-        }
+        }
+    } while (1);
+}
+
+/**
+ * Consume newlines (counted in *fileline), '#' comments and blanks, then test
+ * the first remaining character against expected. Returns 1 and consumes the
+ * character on a match, 0 after stepping back one byte with fseek otherwise,
+ * and -1 on end of file or if the fseek fails.
+ */
+int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected)
+{
+    char trash;
+    int rc;
+
+    do {
+        rc = fread(&trash, sizeof(char), 1, fptr);
+        if (0 == rc) {  /* hit the end of the file */
+            return -1;
+        }
+        if ('\n' == trash) {
+            (*fileline)++;
+            continue;
+        }
+        if ('#' == trash) {
+            skiptonewline (fptr, fileline);
+            continue;
+        }
+        if( trash == expected )
+            return 1;  /* return true and eat the char */
+        if( isblank((unsigned char)trash) )  /* skip all spaces if that's not what we were looking for */
+            continue;
+        if( 0 != fseek(fptr, -1, SEEK_CUR) )
+            return -1;
+        return 0;
     } while (1);
 }
+
+/**
+ * There are certainly simpler implementation for this function when performance
+ * is not a critical point. But, as this function is used during the collective
+ * configuration, and we can do this configurations once for each communicator,
+ * I would rather have a more complex but faster implementation.
+ * The approach here is to search for the largest common denominators, to create
+ * something similar to a dichotomic search.
+ * Returns the collective id whose name is exactly name, or -1 if there is none. */
+int mca_coll_base_name_to_colltype(const char* name)
+{
+    if( 'n' == name[0] ) {  /* only the neighbor_all* collectives start with 'n' */
+        if( 0 == strncmp(name, "neighbor_all", 12) ) {
+            if( 't' != name[12] ) {
+                if( 0 == strncmp(name+12, "gather", 6) ) {
+                    if('\0' == name[18]) return NEIGHBOR_ALLGATHER;
+                    if( 0 == strcmp(name+18, "v") ) return NEIGHBOR_ALLGATHERV;  /* strcmp rejects trailing characters */
+                }
+            } else {
+                if( 0 == strncmp(name+12, "toall", 5) ) {
+                    if( '\0' == name[17] ) return NEIGHBOR_ALLTOALL;
+                    if( 0 == strcmp(name+17, "v") ) return NEIGHBOR_ALLTOALLV;
+                    if( 0 == strcmp(name+17, "w") ) return NEIGHBOR_ALLTOALLW;
+                }
+            }
+        }
+        return -1;
+    }
+    if( 'a' == name[0] ) {  /* all{reduce,gather[v],toall[v|w]} */
+        if( 0 != strncmp(name, "all", 3) ) {
+            return -1;
+        }
+        if( 't' != name[3] ) {
+            if( 'r' == name[3] ) {
+                if( 0 == strcmp(name+3, "reduce") )
+                    return ALLREDUCE;
+            } else {
+                if( 0 == strncmp(name+3, "gather", 6) ) {
+                    if( '\0' == name[9] ) return ALLGATHER;
+                    if( 0 == strcmp(name+9, "v") ) return ALLGATHERV;
+                }
+            }
+        } else {
+            if( 0 == strncmp(name+3, "toall", 5) ) {
+                if( '\0' == name[8] ) return ALLTOALL;
+                if( 0 == strcmp(name+8, "v") ) return ALLTOALLV;
+                if( 0 == strcmp(name+8, "w") ) return ALLTOALLW;
+            }
+        }
+        return -1;
+    }
+    if( 'r' > name[0] ) {  /* first letter sorts before 'r': barrier, bcast, gather[v], exscan */
+        if( 'b' == name[0] ) {
+            if( 0 == strcmp(name, "barrier") )
+                return BARRIER;
+            if( 0 == strcmp(name, "bcast") )
+                return BCAST;
+        } else if( 'g'== name[0] ) {
+            if( 0 == strncmp(name, "gather", 6) ) {
+                if( '\0' == name[6] ) return GATHER;
+                if( 0 == strcmp(name+6, "v") ) return GATHERV;
+            }
+        }
+        if( 0 == strcmp(name, "exscan") )
+            return EXSCAN;
+        return -1;
+    }
+    if( 's' > name[0] ) {  /* first letter is 'r': reduce, reduce_scatter[_block] */
+        if( 0 == strncmp(name, "reduce", 6) ) {
+            if( '\0' == name[6] ) return REDUCE;
+            if( '_' == name[6] ) {
+                if( 0 == strncmp(name+7, "scatter", 7) ) {
+                    if( '\0' == name[14] ) return REDUCESCATTER;
+                    if( 0 == strcmp(name+14, "_block") ) return REDUCESCATTERBLOCK;
+                }
+            }
+        }
+        return -1;
+    }
+    if( 0 == strcmp(name, "scan") )
+        return SCAN;
+    if( 0 == strcmp(name, "scatterv") )
+        return SCATTERV;
+    if( 0 == strcmp(name, "scatter") )
+        return SCATTER;
+    return -1;
+}
+
+/* conversion table for all
COLLTYPE_T values defined in ompi/mca/coll/base/coll_base_functions.h */ +static const char* colltype_translation_table[] = { + [ALLGATHER] = "allgather", + [ALLGATHERV] = "allgatherv", + [ALLREDUCE] = "allreduce", + [ALLTOALL] = "alltoall", + [ALLTOALLV] = "alltoallv", + [ALLTOALLW] = "alltoallw", + [BARRIER] = "barrier", + [BCAST] = "bcast", + [EXSCAN] = "exscan", + [GATHER] = "gather", + [GATHERV] = "gatherv", + [REDUCE] = "reduce", + [REDUCESCATTER] = "reduce_scatter", + [REDUCESCATTERBLOCK] = "reduce_scatter_block", + [SCAN] = "scan", + [SCATTER] = "scatter", + [SCATTERV] = "scatterv", + [NEIGHBOR_ALLGATHER] = "neighbor_allgather", + [NEIGHBOR_ALLGATHERV] = "neighbor_allgatherv", + [NEIGHBOR_ALLTOALL] = "neighbor_alltoall", + [NEIGHBOR_ALLTOALLV] = "neighbor_alltoallv", + [NEIGHBOR_ALLTOALLW] = "neighbor_alltoallw", + [COLLCOUNT] = NULL +}; + +const char* mca_coll_base_colltype_to_str(int collid) +{ + if( (collid < 0) || (collid >= COLLCOUNT) ) { + return NULL; + } + return colltype_translation_table[collid]; +} diff --git a/ompi/mca/coll/base/coll_base_util.h b/ompi/mca/coll/base/coll_base_util.h index 239322b022c..e20ed6652cc 100644 --- a/ompi/mca/coll/base/coll_base_util.h +++ b/ompi/mca/coll/base/coll_base_util.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, @@ -178,8 +178,17 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *request, ompi_datatype_t * const rtypes[]); /* File reading function */ -#define MYEOF -999 -long ompi_coll_base_file_getnext(FILE *fptr, int *fileline); +int ompi_coll_base_file_getnext_long(FILE *fptr, int *fileline, long* val); +int ompi_coll_base_file_getnext_size_t(FILE *fptr, int *fileline, size_t* val); +int ompi_coll_base_file_getnext_string(FILE *fptr, int *fileline, char** val); +/* peek at the next valid token to see if it begins with the expected value. If yes + * eat the value, otherwise put it back into the file. + */ +int ompi_coll_base_file_peek_next_char_is(FILE *fptr, int *fileline, int expected); + +/* Miscelaneous function */ +const char* mca_coll_base_colltype_to_str(int collid); +int mca_coll_base_name_to_colltype(const char* name); END_C_DECLS #endif /* MCA_COLL_BASE_UTIL_EXPORT_H */ diff --git a/ompi/mca/coll/han/Makefile.am b/ompi/mca/coll/han/Makefile.am index 55892512e3b..61b40d97c51 100644 --- a/ompi/mca/coll/han/Makefile.am +++ b/ompi/mca/coll/han/Makefile.am @@ -26,8 +26,7 @@ coll_han_trigger.c \ coll_han_dynamic.c \ coll_han_dynamic_file.c \ coll_han_topo.c \ -coll_han_subcomms.c \ -coll_han_utils.c +coll_han_subcomms.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h index 1af75ffec30..16efcbe8e5a 100644 --- a/ompi/mca/coll/han/coll_han.h +++ b/ompi/mca/coll/han/coll_han.h @@ -20,9 +20,7 @@ #include "opal/util/output.h" #include "ompi/mca/coll/base/coll_base_functions.h" #include "coll_han_trigger.h" -#include "ompi/mca/coll/han/coll_han_dynamic.h" - -BEGIN_C_DECLS +#include "ompi/mca/coll/han/coll_han_dynamic.h" /* * Today; @@ -33,131 +31,125 @@ BEGIN_C_DECLS #define COLL_HAN_LOW_MODULES 2 #define COLL_HAN_UP_MODULES 2 -typedef struct { - uint32_t umod; - 
uint32_t lmod; - uint32_t fs; - uint32_t ualg; - uint32_t us; -} selection; - -struct mca_bcast_argu_s { +struct mca_coll_han_bcast_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; void *buff; + ompi_datatype_t *dtype; int seg_count; - struct ompi_datatype_t *dtype; int root_low_rank; int root_up_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int num_segments; int cur_seg; int w_rank; int last_seg_count; bool noop; }; -typedef struct mca_bcast_argu_s mca_bcast_argu_t; +typedef struct mca_coll_han_bcast_args_s mca_coll_han_bcast_args_t; -struct mca_reduce_argu_s { +struct mca_coll_han_reduce_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; void *sbuf; void *rbuf; + ompi_op_t *op; + ompi_datatype_t *dtype; int seg_count; - struct ompi_datatype_t *dtype; - struct ompi_op_t *op; int root_low_rank; int root_up_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int num_segments; int cur_seg; int w_rank; int last_seg_count; bool noop; + bool is_tmp_rbuf; }; -typedef struct mca_reduce_argu_s mca_reduce_argu_t; +typedef struct mca_coll_han_reduce_args_s mca_coll_han_reduce_args_t; -struct mca_allreduce_argu_s { +struct mca_coll_han_allreduce_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *rbuf; + ompi_op_t *op; + ompi_datatype_t *dtype; int seg_count; - struct ompi_datatype_t *dtype; - struct ompi_op_t *op; int root_up_rank; int root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int num_segments; int cur_seg; int w_rank; int last_seg_count; bool noop; - ompi_request_t *req; int *completed; }; -typedef struct mca_allreduce_argu_s mca_allreduce_argu_t; +typedef struct mca_coll_han_allreduce_args_s mca_coll_han_allreduce_args_t; -struct mca_scatter_argu_s { +struct 
mca_coll_han_scatter_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *sbuf_inter_free; void *sbuf_reorder_free; - int scount; - struct ompi_datatype_t *sdtype; void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; int rcount; - struct ompi_datatype_t *rdtype; int root; int root_up_rank; int root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int w_rank; bool noop; - ompi_request_t *req; }; -typedef struct mca_scatter_argu_s mca_scatter_argu_t; +typedef struct mca_coll_han_scatter_args_s mca_coll_han_scatter_args_t; -struct mca_gather_argu_s { +struct mca_coll_han_gather_args_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *sbuf_inter_free; - int scount; - struct ompi_datatype_t *sdtype; void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; int rcount; - struct ompi_datatype_t *rdtype; int root; int root_up_rank; int root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int w_rank; bool noop; - ompi_request_t *req; + bool is_mapbycore; }; -typedef struct mca_gather_argu_s mca_gather_argu_t; +typedef struct mca_coll_han_gather_args_s mca_coll_han_gather_args_t; -struct mca_allgather_argu_s { +struct mca_coll_han_allgather_s { mca_coll_task_t *cur_task; + ompi_communicator_t *up_comm; + ompi_communicator_t *low_comm; + ompi_request_t *req; void *sbuf; void *sbuf_inter_free; - int scount; - struct ompi_datatype_t *sdtype; void *rbuf; + ompi_datatype_t *sdtype; + ompi_datatype_t *rdtype; + int scount; int rcount; - struct ompi_datatype_t *rdtype; int root_low_rank; - struct ompi_communicator_t *up_comm; - struct ompi_communicator_t *low_comm; int w_rank; bool noop; bool is_mapbycore; int *topo; - ompi_request_t *req; }; -typedef struct mca_allgather_argu_s 
mca_allgather_argu_t; +typedef struct mca_coll_han_allgather_s mca_coll_han_allgather_t; /** * Structure to hold the han coll component. First it holds the @@ -184,7 +176,7 @@ typedef struct mca_coll_han_component_t { /* up level module for reduce */ uint32_t han_reduce_up_module; /* low level module for reduce */ - uint32_t han_reduce_low_module; + uint32_t han_reduce_low_module; /* segment size for allreduce */ uint32_t han_allreduce_segsize; /* up level module for allreduce */ @@ -203,21 +195,10 @@ typedef struct mca_coll_han_component_t { uint32_t han_scatter_up_module; /* low level module for scatter */ uint32_t han_scatter_low_module; - /* whether enable auto tune */ - uint32_t han_auto_tune; /* whether we need reproducible results * (but disables topological optimisations) */ uint32_t han_reproducible; - /* create a 3D array - * num_processes (n): 2 4 8 16 32 64 (6) - * num_core (c): 2 4 8 12 (4) - * message size (m): 1 - 4194304 (23) - */ - uint32_t han_auto_tune_n; - uint32_t han_auto_tune_c; - uint32_t han_auto_tune_m; - selection *han_auto_tuned; bool use_simple_algorithm[COLLCOUNT]; /* Dynamic configuration rules */ @@ -228,7 +209,6 @@ typedef struct mca_coll_han_component_t { mca_coll_han_dynamic_rules_t dynamic_rules; /* Dynamic rules from mca parameter */ COMPONENT_T mca_rules[COLLCOUNT][NB_TOPO_LVL]; - int topo_level; /* Define maximum dynamic errors printed by rank 0 with a 0 verbosity level */ int max_dynamic_errors; @@ -240,7 +220,7 @@ typedef void (*previous_dummy_fn_t) (void); * Structure used to store what is necessary for the collective operations * routines in case of fallback. 
*/ -typedef struct collective_fallback_t { +typedef struct mca_coll_han_single_collective_fallback_s { union { mca_coll_base_module_allgather_fn_t allgather; mca_coll_base_module_allgatherv_fn_t allgatherv; @@ -250,9 +230,24 @@ typedef struct collective_fallback_t { mca_coll_base_module_reduce_fn_t reduce; mca_coll_base_module_scatter_fn_t scatter; previous_dummy_fn_t dummy; - } previous_routine; - mca_coll_base_module_t *previous_module; -} collective_fallback_t; + }; + mca_coll_base_module_t* module; +} mca_coll_han_single_collective_fallback_t; + +/* + * The structure containing a replacement for all collective supported + * by HAN. This structure is used as a fallback during subcommunicator + * creation. + */ +typedef struct mca_coll_han_collectives_fallback_s { + mca_coll_han_single_collective_fallback_t allgather; + mca_coll_han_single_collective_fallback_t allgatherv; + mca_coll_han_single_collective_fallback_t allreduce; + mca_coll_han_single_collective_fallback_t bcast; + mca_coll_han_single_collective_fallback_t reduce; + mca_coll_han_single_collective_fallback_t gather; + mca_coll_han_single_collective_fallback_t scatter; +} mca_coll_han_collectives_fallback_t; /** Coll han module */ typedef struct mca_coll_han_module_t { @@ -262,7 +257,6 @@ typedef struct mca_coll_han_module_t { /* Whether this module has been lazily initialized or not yet */ bool enabled; - struct ompi_communicator_t *cached_comm; struct ompi_communicator_t **cached_low_comms; struct ompi_communicator_t **cached_up_comms; int *cached_vranks; @@ -271,7 +265,7 @@ typedef struct mca_coll_han_module_t { bool are_ppn_imbalanced; /* To be able to fallback when the cases are not supported */ - struct collective_fallback_t previous_routines[COLLCOUNT]; + struct mca_coll_han_collectives_fallback_s fallback; /* To be able to fallback on reproducible algorithm */ mca_coll_base_module_reduce_fn_t reproducible_reduce; @@ -280,7 +274,7 @@ typedef struct mca_coll_han_module_t { mca_coll_base_module_t 
*reproducible_allreduce_module; /* Topological level of this communicator */ - int topologic_level; + TOPO_LVL_T topologic_level; /* Collective module storage for module choice */ mca_coll_han_collective_modules_storage_t modules_storage; @@ -302,21 +296,53 @@ OBJ_CLASS_DECLARATION(mca_coll_han_module_t); * Some defines to stick to the naming used in the other components in terms of * fallback routines */ -#define previous_allgather previous_routines[ALLGATHER].previous_routine.allgather -#define previous_allgatherv previous_routines[ALLGATHERV].previous_routine.allgatherv -#define previous_allreduce previous_routines[ALLREDUCE].previous_routine.allreduce -#define previous_bcast previous_routines[BCAST].previous_routine.bcast -#define previous_gather previous_routines[GATHER].previous_routine.gather -#define previous_reduce previous_routines[REDUCE].previous_routine.reduce -#define previous_scatter previous_routines[SCATTER].previous_routine.scatter - -#define previous_allgather_module previous_routines[ALLGATHER].previous_module -#define previous_allgatherv_module previous_routines[ALLGATHERV].previous_module -#define previous_allreduce_module previous_routines[ALLREDUCE].previous_module -#define previous_bcast_module previous_routines[BCAST].previous_module -#define previous_gather_module previous_routines[GATHER].previous_module -#define previous_reduce_module previous_routines[REDUCE].previous_module -#define previous_scatter_module previous_routines[SCATTER].previous_module +#define previous_allgather fallback.allgather.allgather +#define previous_allgather_module fallback.allgather.module + +#define previous_allgatherv fallback.allgatherv.allgatherv +#define previous_allgatherv_module fallback.allgatherv.module + +#define previous_allreduce fallback.allreduce.allreduce +#define previous_allreduce_module fallback.allreduce.module + +#define previous_bcast fallback.bcast.bcast +#define previous_bcast_module fallback.bcast.module + +#define previous_reduce 
fallback.reduce.reduce
+#define previous_reduce_module fallback.reduce.module
+
+#define previous_gather fallback.gather.gather
+#define previous_gather_module fallback.gather.module
+
+#define previous_scatter fallback.scatter.scatter
+#define previous_scatter_module fallback.scatter.module
+
+
+/* If COMM still routes COLL through HANM, restore the saved fallback function/module (retain new, release old) */
+#define HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, COLL) \
+    do { \
+        if ( ((COMM)->c_coll->coll_ ## COLL ## _module) == (mca_coll_base_module_t*)(HANM) ) { \
+            (COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL; \
+            mca_coll_base_module_t *coll_module = (COMM)->c_coll->coll_ ## COLL ## _module; \
+            (COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \
+            OBJ_RETAIN((COMM)->c_coll->coll_ ## COLL ## _module); \
+            OBJ_RELEASE(coll_module); \
+        } \
+    } while(0)
+
+/* load /all/ fallback collectives, then flag HANM itself (not a caller-scope variable) as disabled */
+#define HAN_LOAD_FALLBACK_COLLECTIVES(HANM, COMM) \
+    do { \
+        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, bcast); \
+        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, scatter); \
+        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, gather); \
+        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, reduce); \
+        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allreduce); \
+        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgather); \
+        HAN_LOAD_FALLBACK_COLLECTIVE(HANM, COMM, allgatherv); \
+        (HANM)->enabled = false; /* entire module set to pass-through from now on */ \
+    } while(0)
+
+
 /**
  * Global component instance
@@ -333,20 +359,30 @@ mca_coll_base_module_t *mca_coll_han_comm_query(struct ompi_communicator_t *comm
 int han_request_free(ompi_request_t ** request);
 
 /* Subcommunicator creation */
-void mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module);
-void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module);
-/* Gather topology information */
+int mca_coll_han_comm_create(struct ompi_communicator_t *comm, mca_coll_han_module_t
* han_module); +int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module); + +/** + * Gather topology information + * + * Returns a pointer to the (potentially already cached) topology. + * NOTE: if the rank distribution is imbalanced, no effort will be made to gather + * the topology at all ranks and instead NULL is returned and han_module->is_mapbycore + * is set to false. + * If HAN ever learns to deal with imbalanced topologies, this needs fixing! + */ int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, mca_coll_han_module_t * han_module, int num_topo_level); /* Utils */ -void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, - int *root_up_rank); -uint32_t han_auto_tuned_get_n(uint32_t n); -uint32_t han_auto_tuned_get_c(uint32_t c); -uint32_t han_auto_tuned_get_m(uint32_t m); +static inline void +mca_coll_han_get_ranks(int *vranks, int root, int low_size, + int *root_low_rank, int *root_up_rank) +{ + *root_up_rank = vranks[root] / low_size; + *root_low_rank = vranks[root] % low_size; +} -const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll); const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl); /** Dynamic component choice */ @@ -356,7 +392,7 @@ const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl); */ int mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module); + mca_coll_han_module_t *han_module); int mca_coll_han_allgather_intra_dynamic(ALLGATHER_BASE_ARGS, @@ -382,22 +418,13 @@ mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS, /* Bcast */ int mca_coll_han_bcast_intra_simple(void *buff, - int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); -void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, - int seg_count, struct ompi_datatype_t *dtype, - int root_up_rank, int root_low_rank, - 
struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop); + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); int mca_coll_han_bcast_intra(void *buff, int count, struct ompi_datatype_t *dtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_bcast_t0_task(void *task_argu); -int mca_coll_han_bcast_t1_task(void *task_argu); /* Reduce */ int @@ -422,145 +449,75 @@ mca_coll_han_reduce_reproducible(const void *sbuf, struct ompi_communicator_t *comm, mca_coll_base_module_t *module); - - -void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, - void *sbuf, - void *rbuf, int seg_count, struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root_up_rank, int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop); - -int mca_coll_han_reduce_intra(const void *sbuf, +int mca_coll_han_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, ompi_op_t* op, int root, - struct ompi_communicator_t *comm, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_reduce_t0_task(void *task_argu); -int mca_coll_han_reduce_t1_task(void *task_argu); - /* Allreduce */ int mca_coll_han_allreduce_intra_simple(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); int mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, mca_coll_base_module_t *module); int 
mca_coll_han_allreduce_reproducible(const void *sbuf, void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); -void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *rbuf, - int seg_count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, - int cur_seg, - int w_rank, - int last_seg_count, - bool noop, ompi_request_t * req, int *completed); int mca_coll_han_allreduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_allreduce_t0_task(void *task_argu); -int mca_coll_han_allreduce_t1_task(void *task_argu); -int mca_coll_han_allreduce_t2_task(void *task_argu); -int mca_coll_han_allreduce_t3_task(void *task_argu); /* Scatter */ int mca_coll_han_scatter_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_scatter_us_task(void *task_argu); -int mca_coll_han_scatter_ls_task(void *task_argu); -void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - void *sbuf_reorder_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * 
req); - -/* Gather */ -int -mca_coll_han_gather_intra(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int mca_coll_han_gather_lg_task(void *task_argu); -int mca_coll_han_gather_ug_task(void *task_argu); -void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req); + +/* Gather */ +int +mca_coll_han_gather_intra(const void *sbuf, int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module); int mca_coll_han_gather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); /* reordering after gather, for unordered ranks */ void ompi_coll_han_reorder_gather(const void *sbuf, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - int * topo); + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + int * topo); @@ -571,30 +528,12 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t * module); -int 
mca_coll_han_allgather_lg_task(void *task_argu); -int mca_coll_han_allgather_uag_task(void *task_argu); -int mca_coll_han_allgather_lb_task(void *task_argu); -void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, - bool noop, bool is_mapbycore, int *topo, ompi_request_t * req); int mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module); + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module); -END_C_DECLS #endif /* MCA_COLL_HAN_EXPORT_H */ diff --git a/ompi/mca/coll/han/coll_han_allgather.c b/ompi/mca/coll/han/coll_han_allgather.c index 50702d28ff9..cc7dfaff266 100644 --- a/ompi/mca/coll/han/coll_han_allgather.c +++ b/ompi/mca/coll/han/coll_han_allgather.c @@ -16,40 +16,45 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" -void mac_coll_han_set_allgather_argu(mca_allgather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, - bool noop, - bool is_mapbycore, - int *topo, - ompi_request_t * req) +static int mca_coll_han_allgather_lb_task(void *task_args); +static int mca_coll_han_allgather_lg_task(void *task_args); +static int mca_coll_han_allgather_uag_task(void *task_args); + +static inline void 
+mca_coll_han_set_allgather_args(mca_coll_han_allgather_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, + bool noop, + bool is_mapbycore, + int *topo, + ompi_request_t * req) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->sbuf_inter_free = sbuf_inter_free; - argu->scount = scount; - argu->sdtype = sdtype; - argu->rbuf = rbuf; - argu->rcount = rcount; - argu->rdtype = rdtype; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->w_rank = w_rank; - argu->noop = noop; - argu->is_mapbycore = is_mapbycore; - argu->topo = topo; - argu->req = req; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->is_mapbycore = is_mapbycore; + args->topo = topo; + args->req = req; } int @@ -60,44 +65,52 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - int w_rank; - w_rank = ompi_comm_rank(comm); - /* Create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - mca_coll_han_comm_create_new(comm, han_module); + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather within this communicator. 
Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; int low_rank = ompi_comm_rank(low_comm); + int w_rank = ompi_comm_rank(comm); + + /* Init topo */ + int *topo = mca_coll_han_topo_init(comm, han_module, 2); + /* unbalanced case needs algo adaptation */ + if (han_module->are_ppn_imbalanced) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather with this communicator (imbalance). Fall back on another component\n")); + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } ompi_request_t *temp_request = NULL; /* Set up request */ temp_request = OBJ_NEW(ompi_request_t); - OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; - temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = han_request_free; - temp_request->req_status.MPI_SOURCE = 0; - temp_request->req_status.MPI_TAG = 0; - temp_request->req_status.MPI_ERROR = 0; - temp_request->req_status._cancelled = 0; - temp_request->req_status._ucount = 0; - - /* Init topo */ - int *topo = mca_coll_han_topo_init(comm, han_module, 2); + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; int root_low_rank = 0; /* Create lg (lower level gather) task */ mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); /* Setup lg task arguments */ - mca_allgather_argu_t *lg_argu = malloc(sizeof(mca_allgather_argu_t)); - mac_coll_han_set_allgather_argu(lg_argu, lg, (char *) 
sbuf, NULL, scount, sdtype, rbuf, rcount, + mca_coll_han_allgather_t *lg_args = malloc(sizeof(mca_coll_han_allgather_t)); + mca_coll_han_set_allgather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, rbuf, rcount, rdtype, root_low_rank, up_comm, low_comm, w_rank, low_rank != root_low_rank, han_module->is_mapbycore, topo, temp_request); - /* Init lg task */ - init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_argu)); - /* Issure lg task */ + /* Init and issue lg task */ + init_task(lg, mca_coll_han_allgather_lg_task, (void *) (lg_args)); issue_task(lg); ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); @@ -105,48 +118,70 @@ mca_coll_han_allgather_intra(const void *sbuf, int scount, return OMPI_SUCCESS; } -/* lg: lower level (shared memory) gather task */ -int mca_coll_han_allgather_lg_task(void *task_argu) +/* lg: lower level gather task */ +int mca_coll_han_allgather_lg_task(void *task_args) { - mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; + char *tmp_buf = NULL, *tmp_rbuf = NULL; + char *tmp_send = NULL; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: lg\n", t->w_rank)); - OBJ_RELEASE(t->cur_task); /* If the process is one of the node leader */ - char *tmp_buf = NULL; - char *tmp_rbuf = NULL; + ptrdiff_t rlb, rext; + ompi_datatype_get_extent (t->rdtype, &rlb, &rext); + if (MPI_IN_PLACE == t->sbuf) { + t->sdtype = t->rdtype; + t->scount = t->rcount; + } if (!t->noop) { int low_size = ompi_comm_size(t->low_comm); ptrdiff_t rsize, rgap = 0; rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); tmp_buf = (char *) malloc(rsize); tmp_rbuf = tmp_buf - rgap; + if (MPI_IN_PLACE == t->sbuf) { + tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext; + ompi_datatype_copy_content_same_ddt(t->rdtype, t->rcount, tmp_rbuf, tmp_send); + } + } + /* Lower level (shared memory or 
intra-node) gather */ + if (MPI_IN_PLACE == t->sbuf) { + if (!t->noop) { + t->low_comm->c_coll->coll_gather(MPI_IN_PLACE, t->scount, t->sdtype, + tmp_rbuf, t->rcount, t->rdtype, t->root_low_rank, + t->low_comm, t->low_comm->c_coll->coll_gather_module); + } + else { + tmp_send = ((char*)t->rbuf) + (ptrdiff_t)t->w_rank * (ptrdiff_t)t->rcount * rext; + t->low_comm->c_coll->coll_gather(tmp_send, t->rcount, t->rdtype, + NULL, t->rcount, t->rdtype, t->root_low_rank, + t->low_comm, t->low_comm->c_coll->coll_gather_module); + } } - /* Shared memory gather */ - t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, - t->rdtype, t->root_low_rank, t->low_comm, - t->low_comm->c_coll->coll_gather_module); + else { + t->low_comm->c_coll->coll_gather((char *) t->sbuf, t->scount, t->sdtype, tmp_rbuf, t->rcount, + t->rdtype, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_gather_module); + } + t->sbuf = tmp_rbuf; t->sbuf_inter_free = tmp_buf; /* Create uag (upper level all-gather) task */ - mca_coll_task_t *uag = OBJ_NEW(mca_coll_task_t); - /* Setup uag task arguments */ - t->cur_task = uag; - /* Init uag task */ + mca_coll_task_t *uag = t->cur_task; + /* Init and issue uag task */ init_task(uag, mca_coll_han_allgather_uag_task, (void *) t); - /* Issure uag task */ issue_task(uag); return OMPI_SUCCESS; } /* uag: upper level (inter-node) all-gather task */ -int mca_coll_han_allgather_uag_task(void *task_argu) +int mca_coll_han_allgather_uag_task(void *task_args) { - mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; - OBJ_RELEASE(t->cur_task); + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; if (t->noop) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, @@ -213,21 +248,18 @@ int mca_coll_han_allgather_uag_task(void *task_argu) /* Create lb (low level broadcast) task */ - mca_coll_task_t *lb = OBJ_NEW(mca_coll_task_t); - /* Setup lb task arguments */ - t->cur_task = lb; - /* Init lb task 
*/ + mca_coll_task_t *lb = t->cur_task; + /* Init and issue lb task */ init_task(lb, mca_coll_han_allgather_lb_task, (void *) t); - /* Issure lb task */ issue_task(lb); return OMPI_SUCCESS; } -/* lb: low level (shared-memory) broadcast task */ -int mca_coll_han_allgather_lb_task(void *task_argu) +/* lb: low level broadcast task */ +int mca_coll_han_allgather_lb_task(void *task_args) { - mca_allgather_argu_t *t = (mca_allgather_argu_t *) task_argu; + mca_coll_han_allgather_t *t = (mca_coll_han_allgather_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allgather: uag noop\n", t->w_rank)); OBJ_RELEASE(t->cur_task); @@ -246,30 +278,41 @@ int mca_coll_han_allgather_lb_task(void *task_argu) int mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void* rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module){ + struct ompi_datatype_t *sdtype, + void* rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module){ /* create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - mca_coll_han_comm_create_new(comm, han_module); - ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; - ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allgather within this communicator. 
Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); + } /* discovery topology */ int *topo = mca_coll_han_topo_init(comm, han_module, 2); /* unbalanced case needs algo adaptation */ - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle allgather with this communicator. It need to fall back on another component\n")); - return han_module->previous_allgather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, - comm, han_module->previous_allgather_module); + "han cannot handle allgather within this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, allgather); + return comm->c_coll->coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype, + comm, comm->c_coll->coll_allgather_module); } + ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; + ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + int w_rank = ompi_comm_rank(comm); /* setup up/low coordinates */ int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); @@ -279,27 +322,54 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, /* allocate the intermediary buffer * to gather on leaders on the low sub communicator */ + ptrdiff_t rlb, rext; + ompi_datatype_get_extent (rdtype, &rlb, &rext); char *tmp_buf = NULL; char *tmp_buf_start = NULL; + char *tmp_send = NULL; + if (MPI_IN_PLACE == sbuf) { + scount = rcount; + sdtype = rdtype; + } if (low_rank == root_low_rank) { ptrdiff_t rsize, rgap = 0; /* Compute the size to receive all the local data, including datatypes empty gaps */ rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size, &rgap); - // intermediary buffer on node leaders to gather on low comm + /* intermediary buffer on node leaders to gather on low comm */ tmp_buf = (char *) malloc(rsize); tmp_buf_start = tmp_buf - rgap; + if (MPI_IN_PLACE == sbuf) { + tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext; + ompi_datatype_copy_content_same_ddt(rdtype, rcount, tmp_buf_start, tmp_send); + } } /* 1. 
low gather on node leaders into tmp_buf */ - low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype, - tmp_buf_start, rcount, rdtype, root_low_rank, - low_comm, low_comm->c_coll->coll_gather_module); + if (MPI_IN_PLACE == sbuf) { + if (low_rank == root_low_rank) { + low_comm->c_coll->coll_gather(MPI_IN_PLACE, scount, sdtype, + tmp_buf_start, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } + else { + tmp_send = ((char*)rbuf) + (ptrdiff_t)w_rank * (ptrdiff_t)rcount * rext; + low_comm->c_coll->coll_gather(tmp_send, rcount, rdtype, + NULL, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } + } + else { + low_comm->c_coll->coll_gather((char *)sbuf, scount, sdtype, + tmp_buf_start, rcount, rdtype, root_low_rank, + low_comm, low_comm->c_coll->coll_gather_module); + } /* 2. allgather between node leaders, from tmp_buf to reorder_buf */ if (low_rank == root_low_rank) { /* allocate buffer to store unordered result on node leaders - * * if the processes are mapped-by core, no need to reorder: - * * distribution of ranks on core first and node next, - * * in a increasing order for both patterns */ + * if the processes are mapped-by core, no need to reorder: + * distribution of ranks on core first and node next, + * in a increasing order for both patterns. 
+ */ char *reorder_buf = NULL; char *reorder_buf_start = NULL; if (han_module->is_mapbycore) { @@ -307,7 +377,7 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, } else { if (0 == low_rank && 0 == up_rank) { // first rank displays message OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Future Allgather needs reordering: ", w_rank)); + "[%d]: Future Allgather needs reordering: ", up_rank)); } ptrdiff_t rsize, rgap = 0; rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * low_size * up_size, &rgap); @@ -332,8 +402,8 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, */ if (!han_module->is_mapbycore) { ompi_coll_han_reorder_gather(reorder_buf_start, - rbuf, rcount, rdtype, - comm, topo); + rbuf, rcount, rdtype, + comm, topo); free(reorder_buf); reorder_buf = NULL; } @@ -347,4 +417,4 @@ mca_coll_han_allgather_intra_simple(const void *sbuf, int scount, return OMPI_SUCCESS; - } +} diff --git a/ompi/mca/coll/han/coll_han_allreduce.c b/ompi/mca/coll/han/coll_han_allreduce.c index 6a4fd6038f7..afa0e0a220e 100644 --- a/ompi/mca/coll/han/coll_han_allreduce.c +++ b/ompi/mca/coll/han/coll_han_allreduce.c @@ -17,46 +17,52 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" +static int mca_coll_han_allreduce_t0_task(void *task_args); +static int mca_coll_han_allreduce_t1_task(void *task_args); +static int mca_coll_han_allreduce_t2_task(void *task_args); +static int mca_coll_han_allreduce_t3_task(void *task_args); + /* Only work with regular situation (each node has equal number of processes) */ -void mac_coll_han_set_allreduce_argu(mca_allreduce_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *rbuf, - int seg_count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, - int cur_seg, - int w_rank, - int last_seg_count, - bool noop, ompi_request_t 
* req, int *completed) +static inline void +mca_coll_han_set_allreduce_args(mca_coll_han_allreduce_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *rbuf, + int seg_count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, + int cur_seg, + int w_rank, + int last_seg_count, + bool noop, ompi_request_t * req, int *completed) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->rbuf = rbuf; - argu->seg_count = seg_count; - argu->dtype = dtype; - argu->op = op; - argu->root_up_rank = root_up_rank; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->num_segments = num_segments; - argu->cur_seg = cur_seg; - argu->w_rank = w_rank; - argu->last_seg_count = last_seg_count; - argu->noop = noop; - argu->req = req; - argu->completed = completed; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->rbuf = rbuf; + args->seg_count = seg_count; + args->dtype = dtype; + args->op = op; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; + args->req = req; + args->completed = completed; } -/* - * Each segment of the messsage needs to go though 4 steps to perform MPI_Allreduce: +/* + * Each segment of the messsage needs to go though 4 steps to perform MPI_Allreduce: * lr: lower level (shared-memory or intra-node) reduce, * ur: upper level (inter-node) reduce, * ub: upper level (inter-node) bcast, @@ -80,72 +86,40 @@ mca_coll_han_allreduce_intra(const void *sbuf, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - // Fallback to another component if the op cannot commute mca_coll_han_module_t 
*han_module = (mca_coll_han_module_t *)module; - if (! ompi_op_is_commute(op)) { + + /* No support for non-commutative operations */ + if(!ompi_op_is_commute(op)) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this operation. Fall back on another component\n")); + goto prev_allreduce_intra; + } + + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle allreduce with this communicator." - "It need to fall back on another component\n")); - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, - comm, han_module->previous_allreduce_module); + "han cannot handle allreduce with this communicator. Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op, + comm, comm->c_coll->coll_reduce_module); } - ptrdiff_t extent, lb; + size_t dtype_size; ompi_datatype_get_extent(dtype, &lb, &extent); - int w_rank; + int seg_count = count, w_rank; w_rank = ompi_comm_rank(comm); - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); + ompi_datatype_type_size(dtype, &dtype_size); - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; - /* Auto tune is enabled */ - if (mca_coll_han_component.han_auto_tune && mca_coll_han_component.han_auto_tuned != NULL) { - uint32_t n = han_auto_tuned_get_n(ompi_comm_size(han_module->cached_up_comms[0])); - uint32_t c = han_auto_tuned_get_c(ompi_comm_size(han_module->cached_low_comms[0])); - uint32_t m = han_auto_tuned_get_m(typelng * count); - uint32_t id = - n * mca_coll_han_component.han_auto_tune_c * 
mca_coll_han_component.han_auto_tune_m + - c * mca_coll_han_component.han_auto_tune_m + m + - mca_coll_han_component.han_auto_tune_n * mca_coll_han_component.han_auto_tune_c * - mca_coll_han_component.han_auto_tune_m; - uint32_t umod = mca_coll_han_component.han_auto_tuned[id].umod; - uint32_t lmod = mca_coll_han_component.han_auto_tuned[id].lmod; - uint32_t fs = mca_coll_han_component.han_auto_tuned[id].fs; - /* ualg and us are only available when using ADAPT */ - /* - uint32_t ualg = mca_coll_han_component.han_auto_tuned[id].ualg; - uint32_t us = mca_coll_han_component.han_auto_tuned[id].us; - */ - /* Set up umod */ - up_comm = han_module->cached_up_comms[umod]; - /* Set up lmod */ - low_comm = han_module->cached_low_comms[lmod]; - /* Set up fs */ - COLL_BASE_COMPUTED_SEGCOUNT((size_t) fs, typelng, seg_count); - /* Set up ualg and us, which is only available when using ADAPT */ - /* - if (umod == 1) { - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_algorithm = ualg; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_algorithm = ualg; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_segment_size = us; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_segment_size = us; - } - */ - } else { - low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; - up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, typelng, - seg_count); - } + + /* use MCA parameters for now */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_allreduce_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_allreduce_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_allreduce_segsize, 
dtype_size, + seg_count); /* Determine number of elements sent per task. */ OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, @@ -161,8 +135,8 @@ mca_coll_han_allreduce_intra(const void *sbuf, /* Setup up t0 task arguments */ int *completed = (int *) malloc(sizeof(int)); completed[0] = 0; - mca_allreduce_argu_t *t = malloc(sizeof(mca_allreduce_argu_t)); - mac_coll_han_set_allreduce_argu(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op, + mca_coll_han_allreduce_args_t *t = malloc(sizeof(mca_coll_han_allreduce_args_t)); + mca_coll_han_set_allreduce_args(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, op, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, low_rank != root_low_rank, NULL, completed); @@ -208,35 +182,51 @@ mca_coll_han_allreduce_intra(const void *sbuf, init_task(t3, mca_coll_han_allreduce_t3_task, (void *) t); issue_task(t3); } - if (t->completed != NULL) { - free(t->completed); - t->completed = NULL; - } + free(t->completed); + t->completed = NULL; free(t); return OMPI_SUCCESS; + + prev_allreduce_intra: + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, + comm, han_module->previous_allreduce_module); } /* t0 task */ -int mca_coll_han_allreduce_t0_task(void *task_argu) +int mca_coll_han_allreduce_t0_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t0 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); OBJ_RELEASE(t->cur_task); ptrdiff_t extent, lb; ompi_datatype_get_extent(t->dtype, &lb, &extent); - t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, - t->op, t->root_low_rank, t->low_comm, - t->low_comm->c_coll->coll_reduce_module); + if (MPI_IN_PLACE == t->sbuf) { + if (!t->noop) { + 
t->low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + else { + t->low_comm->c_coll->coll_reduce((char *) t->rbuf, NULL, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } + } + else { + t->low_comm->c_coll->coll_reduce((char *) t->sbuf, (char *) t->rbuf, t->seg_count, t->dtype, + t->op, t->root_low_rank, t->low_comm, + t->low_comm->c_coll->coll_reduce_module); + } return OMPI_SUCCESS; } /* t1 task */ -int mca_coll_han_allreduce_t1_task(void *task_argu) +int mca_coll_han_allreduce_t1_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t1 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); @@ -270,16 +260,16 @@ int mca_coll_han_allreduce_t1_task(void *task_argu) } if (!t->noop) { - ompi_request_wait(&ireduce_req, MPI_STATUSES_IGNORE); + ompi_request_wait(&ireduce_req, MPI_STATUS_IGNORE); } return OMPI_SUCCESS; } /* t2 task */ -int mca_coll_han_allreduce_t2_task(void *task_argu) +int mca_coll_han_allreduce_t2_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] HAN Allreduce: t2 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); @@ -336,9 +326,9 @@ int mca_coll_han_allreduce_t2_task(void *task_argu) } /* t3 task */ -int mca_coll_han_allreduce_t3_task(void *task_argu) +int mca_coll_han_allreduce_t3_task(void *task_args) { - mca_allreduce_argu_t *t = (mca_allreduce_argu_t *) task_argu; + mca_coll_han_allreduce_args_t *t = (mca_coll_han_allreduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, 
mca_coll_han_component.han_output, "[%d] HAN Allreduce: t3 %d r_buf %d\n", t->w_rank, t->cur_seg, ((int *) t->rbuf)[0])); @@ -408,12 +398,12 @@ int mca_coll_han_allreduce_t3_task(void *task_argu) int mca_coll_han_allreduce_intra_simple(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; @@ -428,22 +418,43 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf, // Fallback to another component if the op cannot commute if (! ompi_op_is_commute(op)) { - OPAL_OUTPUT_VERBOSE((30, cs->han_output, - "han cannot handle allreduce with this operation." - "It need to fall back on another component\n")); + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this operation. Fall back on another component\n")); goto prev_allreduce; } - mca_coll_han_comm_create_new(comm, han_module); + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle allreduce with this communicator. 
Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_allreduce(sbuf, rbuf, count, dtype, op, + comm, comm->c_coll->coll_reduce_module); + } low_comm = han_module->sub_comm[INTRA_NODE]; up_comm = han_module->sub_comm[INTER_NODE]; low_rank = ompi_comm_rank(low_comm); /* Low_comm reduce */ - ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf, + if (MPI_IN_PLACE == sbuf) { + if (low_rank == root_low_rank) { + ret = low_comm->c_coll->coll_reduce(MPI_IN_PLACE, (char *)rbuf, count, dtype, op, root_low_rank, low_comm, low_comm->c_coll->coll_reduce_module); + } + else { + ret = low_comm->c_coll->coll_reduce((char *)rbuf, NULL, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + } + } + else { + ret = low_comm->c_coll->coll_reduce((char *)sbuf, (char *)rbuf, + count, dtype, op, root_low_rank, + low_comm, low_comm->c_coll->coll_reduce_module); + } if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((30, cs->han_output, "HAN/ALLREDUCE: low comm reduce failed. 
" @@ -480,9 +491,9 @@ mca_coll_han_allreduce_intra_simple(const void *sbuf, return OMPI_SUCCESS; -prev_allreduce: - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, comm, - han_module->previous_allreduce_module); + prev_allreduce: + return han_module->previous_allreduce(sbuf, rbuf, count, dtype, op, + comm, han_module->previous_allreduce_module); } /* Find a fallback on reproducible algorithm @@ -504,15 +515,14 @@ mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, int i; for (i=0; imodules_storage - .modules[fallback] - .module_handler; + mca_coll_base_module_t *fallback_module + = han_module->modules_storage.modules[fallback].module_handler; if (NULL != fallback_module && NULL != fallback_module->coll_allreduce) { if (0 == w_rank) { opal_output_verbose(30, mca_coll_han_component.han_output, "coll:han:allreduce_reproducible: " "fallback on %s\n", - components_name[fallback]); + available_components[fallback].component_name); } han_module->reproducible_allreduce_module = fallback_module; han_module->reproducible_allreduce = fallback_module->coll_allreduce; @@ -525,8 +535,7 @@ mca_coll_han_allreduce_reproducible_decision(struct ompi_communicator_t *comm, "coll:han:allreduce_reproducible_decision: " "no reproducible fallback\n"); } - han_module->reproducible_allreduce_module = - han_module->previous_allreduce_module; + han_module->reproducible_allreduce_module = han_module->previous_allreduce_module; han_module->reproducible_allreduce = han_module->previous_allreduce; return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_bcast.c b/ompi/mca/coll/han/coll_han_bcast.c index 6eebc3b7d38..c32ea745b03 100644 --- a/ompi/mca/coll/han/coll_han_bcast.c +++ b/ompi/mca/coll/han/coll_han_bcast.c @@ -16,31 +16,35 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" -void mac_coll_han_set_bcast_argu(mca_bcast_argu_t * argu, mca_coll_task_t * cur_task, void *buff, - int seg_count, struct ompi_datatype_t *dtype, - int 
root_up_rank, int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop) +static int mca_coll_han_bcast_t0_task(void *task_args); +static int mca_coll_han_bcast_t1_task(void *task_args); + +static inline void +mca_coll_han_set_bcast_args(mca_coll_han_bcast_args_t * args, mca_coll_task_t * cur_task, void *buff, + int seg_count, struct ompi_datatype_t *dtype, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop) { - argu->cur_task = cur_task; - argu->buff = buff; - argu->seg_count = seg_count; - argu->dtype = dtype; - argu->root_low_rank = root_low_rank; - argu->root_up_rank = root_up_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->num_segments = num_segments; - argu->cur_seg = cur_seg; - argu->w_rank = w_rank; - argu->last_seg_count = last_seg_count; - argu->noop = noop; + args->cur_task = cur_task; + args->buff = buff; + args->seg_count = seg_count; + args->dtype = dtype; + args->root_low_rank = root_low_rank; + args->root_up_rank = root_up_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; } -/* - * Each segment of the messsage needs to go though 2 steps to perform MPI_Bcast: +/* + * Each segment of the messsage needs to go though 2 steps to perform MPI_Bcast: * ub: upper level (inter-node) bcast * lb: low level (shared-memory or intra-node) bcast. * Hence, in each iteration, there is a combination of collective operations which is called a task. 
@@ -58,82 +62,57 @@ mca_coll_han_bcast_intra(void *buff, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - ptrdiff_t extent, lb; - ompi_datatype_get_extent(dtype, &lb, &extent); - int w_rank; - w_rank = ompi_comm_rank(comm); - int seg_count = count; - size_t typelng; mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + int err, seg_count = count, w_rank = ompi_comm_rank(comm); + ompi_communicator_t *low_comm, *up_comm; + ptrdiff_t extent, lb; + size_t dtype_size; + /* Create the subcommunicators */ + err = mca_coll_han_comm_create(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle bcast with this communicator. It need to fall back on another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, - comm, han_module->previous_bcast_module); + "han cannot handle bcast with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); } - ompi_datatype_type_size(dtype, &typelng); - - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); - ompi_communicator_t *low_comm; - ompi_communicator_t *up_comm; - /* Auto tune is enabled */ - if (mca_coll_han_component.han_auto_tune && mca_coll_han_component.han_auto_tuned != NULL) { - uint32_t n = han_auto_tuned_get_n(ompi_comm_size(han_module->cached_up_comms[0])); - uint32_t c = han_auto_tuned_get_c(ompi_comm_size(han_module->cached_low_comms[0])); - uint32_t m = han_auto_tuned_get_m(typelng * count); - uint32_t id = - n * mca_coll_han_component.han_auto_tune_c * mca_coll_han_component.han_auto_tune_m + - c * mca_coll_han_component.han_auto_tune_m + m; - uint32_t umod = mca_coll_han_component.han_auto_tuned[id].umod; - uint32_t lmod = mca_coll_han_component.han_auto_tuned[id].lmod; - uint32_t fs = mca_coll_han_component.han_auto_tuned[id].fs; - /* ualg and us are only available when using ADAPT */ - /* - uint32_t ualg = mca_coll_han_component.han_auto_tuned[id].ualg; - uint32_t us = mca_coll_han_component.han_auto_tuned[id].us; - */ - /* Set up umod */ - up_comm = han_module->cached_up_comms[umod]; - /* Set up lmod */ - low_comm = han_module->cached_low_comms[lmod]; - /* Set up fs */ - COLL_BASE_COMPUTED_SEGCOUNT((size_t) fs, typelng, seg_count); - /* Set up ualg and us, which is only available when using ADAPT */ - /* - if (umod == 1) { - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_algorithm = ualg; - ((mca_coll_adapt_module_t *) (up_comm->c_coll->coll_ibcast_module))->adapt_component-> - adapt_ibcast_segment_size = us; - } - */ + ompi_datatype_get_extent(dtype, &lb, &extent); + ompi_datatype_type_size(dtype, &dtype_size); - } else { - /* If auto tune is disabled, use MCA parameters */ - low_comm = 
han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; - up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, typelng, - seg_count); - } + /* use MCA parameters for now */ + low_comm = han_module->cached_low_comms[mca_coll_han_component.han_bcast_low_module]; + up_comm = han_module->cached_up_comms[mca_coll_han_component.han_bcast_up_module]; + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_bcast_segsize, dtype_size, + seg_count); int num_segments = (count + seg_count - 1) / seg_count; OPAL_OUTPUT_VERBOSE((20, mca_coll_han_component.han_output, - "In HAN seg_count %d count %d num_seg %d\n", + "In HAN seg_count %d count %d num_seg %d\n", seg_count, count, num_segments)); int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); - int root_low_rank; - int root_up_rank; + int root_low_rank, root_up_rank; mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, @@ -142,8 +121,8 @@ mca_coll_han_bcast_intra(void *buff, /* Create t0 tasks for the first segment */ mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); /* Setup up t0 task arguments */ - mca_bcast_argu_t *t = malloc(sizeof(mca_bcast_argu_t)); - mac_coll_han_set_bcast_argu(t, t0, (char *) buff, seg_count, dtype, + mca_coll_han_bcast_args_t *t = malloc(sizeof(mca_coll_han_bcast_args_t)); + mca_coll_han_set_bcast_args(t, t0, (char *) buff, seg_count, dtype, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, low_rank != root_low_rank); @@ -161,9 +140,7 @@ mca_coll_han_bcast_intra(void *buff, while (t->cur_seg <= t->num_segments - 2) { /* Create t1 task */ - mca_coll_task_t *t1 = OBJ_NEW(mca_coll_task_t); - /* Setup up t1 
task arguments */ - t->cur_task = t1; + t->cur_task = t1 = OBJ_NEW(mca_coll_task_t); t->buff = (char *) t->buff + extent * seg_count; t->cur_seg = t->cur_seg + 1; /* Init the t1 task */ @@ -177,43 +154,40 @@ mca_coll_han_bcast_intra(void *buff, } /* t0 task: issue and wait for the upper level ibcast of segment 0 */ -int mca_coll_han_bcast_t0_task(void *task_argu) +int mca_coll_han_bcast_t0_task(void *task_args) { - mca_bcast_argu_t *t = (mca_bcast_argu_t *) task_argu; + mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); if (t->noop) { return OMPI_SUCCESS; - } else { - ptrdiff_t extent, lb; - ompi_datatype_get_extent(t->dtype, &lb, &extent); - ompi_request_t *ibcast_req; - t->up_comm->c_coll->coll_ibcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank, - t->up_comm, &ibcast_req, t->up_comm->c_coll->coll_ibcast_module); - ompi_request_wait(&ibcast_req, MPI_STATUSES_IGNORE); - return OMPI_SUCCESS; } + t->up_comm->c_coll->coll_bcast((char *) t->buff, t->seg_count, t->dtype, t->root_up_rank, + t->up_comm, t->up_comm->c_coll->coll_bcast_module); + return OMPI_SUCCESS; } -/* t1 task: +/* t1 task: * 1. issue the upper level ibcast of segment cur_seg + 1 * 2. issue the low level bcast of segment cur_seg * 3. 
wait for the completion of the ibcast */ -int mca_coll_han_bcast_t1_task(void *task_argu) +int mca_coll_han_bcast_t1_task(void *task_args) { - mca_bcast_argu_t *t = (mca_bcast_argu_t *) task_argu; + mca_coll_han_bcast_args_t *t = (mca_coll_han_bcast_args_t *) task_args; + ompi_request_t *ibcast_req = NULL; + int tmp_count = t->seg_count; + ptrdiff_t extent, lb; + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); - ptrdiff_t extent, lb; ompi_datatype_get_extent(t->dtype, &lb, &extent); - ompi_request_t *ibcast_req = NULL; - int tmp_count = t->seg_count; if (!t->noop) { if (t->cur_seg <= t->num_segments - 2 ) { - if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + if (t->cur_seg == t->num_segments - 2) { tmp_count = t->last_seg_count; } t->up_comm->c_coll->coll_ibcast((char *) t->buff + extent * t->seg_count, @@ -223,12 +197,14 @@ int mca_coll_han_bcast_t1_task(void *task_argu) } } + /* are we the last segment to be pushed downstream ? */ + tmp_count = (t->cur_seg == (t->num_segments - 1)) ? 
t->last_seg_count : t->seg_count; t->low_comm->c_coll->coll_bcast((char *) t->buff, - t->seg_count, t->dtype, t->root_low_rank, t->low_comm, + tmp_count, t->dtype, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_bcast_module); - if (!t->noop && ibcast_req != NULL) { - ompi_request_wait(&ibcast_req, MPI_STATUSES_IGNORE); + if (NULL != ibcast_req) { + ompi_request_wait(&ibcast_req, MPI_STATUS_IGNORE); } return OMPI_SUCCESS; @@ -242,51 +218,64 @@ mca_coll_han_bcast_intra_simple(void *buff, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int w_rank; - w_rank = ompi_comm_rank(comm); - /* create the subcommunicators */ mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - mca_coll_han_comm_create_new(comm, han_module); - ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; - ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; - - int *vranks = han_module->cached_vranks; - int low_rank = ompi_comm_rank(low_comm); - int low_size = ompi_comm_size(low_comm); - int root_low_rank; - int root_up_rank; + ompi_communicator_t *low_comm, *up_comm; + int err, w_rank = ompi_comm_rank(comm); + /* Create the subcommunicators */ + err = mca_coll_han_comm_create_new(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle bcast with this communicator. Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); + } /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle bcast with this communicator. It need to fall back on another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, - comm, han_module->previous_bcast_module); - } else { - OPAL_OUTPUT_VERBOSE((10, mca_coll_han_component.han_output, - "[OMPI][han] in mca_coll_han_bcast_intra_simple\n")); + "han cannot handle bcast with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, bcast); + return comm->c_coll->coll_bcast(buff, count, dtype, root, + comm, comm->c_coll->coll_bcast_module); } + low_comm = han_module->sub_comm[INTRA_NODE]; + up_comm = han_module->sub_comm[INTER_NODE]; + + int *vranks = han_module->cached_vranks; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); + int root_low_rank, root_up_rank; + mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: root_low_rank %d root_up_rank %d\n", + "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, root_up_rank)); if (low_rank == root_low_rank) { - up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, up_comm, up_comm->c_coll->coll_bcast_module); + up_comm->c_coll->coll_bcast(buff, count, dtype, root_up_rank, + up_comm, up_comm->c_coll->coll_bcast_module); /* To remove when han has better sub-module selection. For now switching to ibcast enables to make runs with libnbc. 
*/ //ompi_request_t req; - //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, up_comm, &req, up_comm->c_coll->coll_ibcast_module); + //up_comm->c_coll->coll_ibcast(buff, count, dtype, root_up_rank, + // up_comm, &req, up_comm->c_coll->coll_ibcast_module); //ompi_request_wait(&req, MPI_STATUS_IGNORE); } - low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, low_comm, low_comm->c_coll->coll_bcast_module); + low_comm->c_coll->coll_bcast(buff, count, dtype, root_low_rank, + low_comm, low_comm->c_coll->coll_bcast_module); return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index cfb40c7da02..ef55a6ac99d 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -25,13 +25,24 @@ #include "coll_han.h" #include "coll_han_dynamic.h" #include "coll_han_dynamic_file.h" +#include "ompi/mca/coll/base/coll_base_util.h" /* * Public string showing the coll ompi_han component version number */ const char *mca_coll_han_component_version_string = - "Open MPI han collective MCA component version " OMPI_VERSION; - + "Open MPI HAN collective MCA component version " OMPI_VERSION; + +ompi_coll_han_components available_components[COMPONENTS_COUNT] = { + { SELF, "self", NULL }, + { BASIC, "basic", NULL }, + { LIBNBC, "libnbc", NULL }, + { TUNED, "tuned", NULL }, + { SM, "sm", NULL }, + { SHARED, "shared", NULL }, + { ADAPT, "adapt", NULL }, + { HAN, "han", NULL } +}; /* * Local functions @@ -46,35 +57,33 @@ static int han_register(void); */ mca_coll_han_component_t mca_coll_han_component = { - /* First, fill in the super */ - { - /* First, the mca_component_t struct containing meta - information about the component itself */ + /* First, the mca_component_t struct containing meta + information about the component itself */ - .collm_version = { - MCA_COLL_BASE_VERSION_2_0_0, + .collm_version = { + MCA_COLL_BASE_VERSION_2_0_0, - /* Component name and version */ - 
.mca_component_name = "han", - MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, - OMPI_RELEASE_VERSION), + /* Component name and version */ + .mca_component_name = "han", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION), - /* Component functions */ - .mca_open_component = han_open, - .mca_close_component = han_close, - .mca_register_component_params = han_register, - }, - .collm_data = { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE}, + /* Component functions */ + .mca_open_component = han_open, + .mca_close_component = han_close, + .mca_register_component_params = han_register, + }, + .collm_data = { + /* The component is not checkpoint ready */ + MCA_BASE_METADATA_PARAM_NONE}, - /* Initialization / querying functions */ + /* Initialization / querying functions */ - .collm_init_query = mca_coll_han_init_query, - .collm_comm_query = mca_coll_han_comm_query, - }, + .collm_init_query = mca_coll_han_init_query, + .collm_comm_query = mca_coll_han_comm_query, + }, /* han-component specifc information */ @@ -87,27 +96,9 @@ mca_coll_han_component_t mca_coll_han_component = { */ static int han_open(void) { - int param; - mca_coll_han_component_t *cs = &mca_coll_han_component; - if (cs->han_auto_tune) { - cs->han_auto_tuned = - (selection *) malloc(2 * cs->han_auto_tune_n * cs->han_auto_tune_c * - cs->han_auto_tune_m * sizeof(selection)); - char *filename = "/home/dycz0fx/results/auto/auto_tuned.bin"; - FILE *file = fopen(filename, "r"); - fread(cs->han_auto_tuned, sizeof(selection), - 2 * cs->han_auto_tune_n * cs->han_auto_tune_c * cs->han_auto_tune_m, file); - fclose(file); - } + /* Get the global coll verbosity: it will be ours */ + mca_coll_han_component.han_output = ompi_coll_base_framework.framework_output; - /* - * Get the global coll verbosity: it will be ours - */ - cs->han_output = ompi_coll_base_framework.framework_output; - opal_output_verbose(1, 
cs->han_output, - "coll:han:component_open: done!"); - - cs->topo_level = GLOBAL_COMMUNICATOR; return mca_coll_han_init_dynamic_rules(); } @@ -117,11 +108,6 @@ static int han_open(void) */ static int han_close(void) { - mca_coll_han_component_t *cs = &mca_coll_han_component; - if (cs->han_auto_tune && cs->han_auto_tuned != NULL) { - free(cs->han_auto_tuned); - cs->han_auto_tuned = NULL; - } mca_coll_han_free_dynamic_rules(); return OMPI_SUCCESS; } @@ -154,57 +140,7 @@ const char* mca_coll_han_topo_lvl_to_str(TOPO_LVL_T topo_lvl) return "invalid topologic level"; } } -const char* mca_coll_han_colltype_to_str(COLLTYPE_T coll) -{ - switch(coll) { - case ALLGATHER: - return "allgather"; - case ALLGATHERV: - return "allgatherv"; - case ALLREDUCE: - return "allreduce"; - case ALLTOALL: - return "alltoall"; - case ALLTOALLV: - return "alltoallv"; - case ALLTOALLW: - return "alltoallw"; - case BARRIER: - return "barrier"; - case BCAST: - return "bcast"; - case EXSCAN: - return "exscan"; - case GATHER: - return "gather"; - case GATHERV: - return "gatherv"; - case REDUCE: - return "reduce"; - case REDUCESCATTER: - return "reduce_scatter"; - case REDUCESCATTERBLOCK: - return "reduce_scatter_block"; - case SCAN: - return "scan"; - case SCATTER: - return "scatter"; - case SCATTERV: - return "scatterv"; - case NEIGHBOR_ALLGATHER: - return "neighbor_allgather"; - case NEIGHBOR_ALLGATHERV: - return "neighbor_allgatherv"; - case NEIGHBOR_ALLTOALL: - return "neighbor_alltoall"; - case NEIGHBOR_ALLTOALLV: - return "neighbor_alltoallv"; - case NEIGHBOR_ALLTOALLW: - return "neighbor_alltoallw"; - default: - return ""; - } -} + /* * Register MCA params @@ -215,15 +151,14 @@ static int han_register(void) mca_coll_han_component_t *cs = &mca_coll_han_component; /* Generated parameters name and description */ - char param_name[100] = ""; - char param_desc[300] = ""; + char param_name[128], param_desc[256]; int param_desc_size; COLLTYPE_T coll; TOPO_LVL_T topo_lvl; COMPONENT_T component; 
cs->han_priority = 0; - (void) mca_base_component_var_register(c, "priority", "Priority of the han coll component", + (void) mca_base_component_var_register(c, "priority", "Priority of the HAN coll component", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_priority); @@ -261,16 +196,14 @@ static int han_register(void) "up level module for allreduce, 0 libnbc, 1 adapt", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_reduce_up_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_up_module); cs->han_reduce_low_module = 0; (void) mca_base_component_var_register(c, "reduce_low_module", "low level module for allreduce, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_reduce_low_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_low_module); cs->han_allreduce_segsize = 524288; (void) mca_base_component_var_register(c, "allreduce_segsize", "segment size for allreduce", @@ -283,32 +216,28 @@ static int han_register(void) "up level module for allreduce, 0 libnbc, 1 adapt", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allreduce_up_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_up_module); cs->han_allreduce_low_module = 0; (void) mca_base_component_var_register(c, "allreduce_low_module", "low level module for allreduce, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allreduce_low_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_low_module); cs->han_allgather_up_module = 0; (void) mca_base_component_var_register(c, "allgather_up_module", "up level module for allgather, 0 libnbc, 1 adapt", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allgather_up_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_up_module); 
cs->han_allgather_low_module = 0; (void) mca_base_component_var_register(c, "allgather_low_module", "low level module for allgather, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_allgather_low_module); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_low_module); cs->han_gather_up_module = 0; (void) mca_base_component_var_register(c, "gather_up_module", @@ -336,15 +265,7 @@ static int han_register(void) "low level module for scatter, 0 sm, 1 shared", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_scatter_low_module); - - cs->han_auto_tune = 0; - (void) mca_base_component_var_register(c, "auto_tune", - "whether enable auto tune, 0 disable, 1 enable, default 0", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune); + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_scatter_low_module); cs->han_reproducible = 0; (void) mca_base_component_var_register(c, "reproducible", @@ -353,17 +274,15 @@ static int han_register(void) "0 disable 1 enable, default 0", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_reproducible); - + MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reproducible); /* Simple algorithms MCA parameters */ for(coll = 0 ; coll < COLLCOUNT ; coll++) { cs->use_simple_algorithm[coll] = false; if(is_simple_implemented(coll)) { - snprintf(param_name, 100, "use_simple_%s", - mca_coll_han_colltype_to_str(coll)); - snprintf(param_desc, 300, "whether to enable simple algo for %s", - mca_coll_han_colltype_to_str(coll)); + snprintf(param_name, sizeof(param_name), "use_simple_%s", + mca_coll_base_colltype_to_str(coll)); + snprintf(param_desc, sizeof(param_desc), "whether to enable simple algo for %s", + mca_coll_base_colltype_to_str(coll)); mca_base_component_var_register(c, param_name, param_desc, MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, @@ -374,31 +293,28 @@ static int 
han_register(void) } /* Dynamic rules MCA parameters */ - /* TODO: Find a way to avoid unused entried */ memset(cs->mca_rules, 0, COLLCOUNT * (GLOBAL_COMMUNICATOR+1) * sizeof(COMPONENT_T)); - for(coll = 0 ; coll < COLLCOUNT ; coll++) { + for(coll = 0; coll < COLLCOUNT; coll++) { if(!mca_coll_han_is_coll_dynamic_implemented(coll)) { continue; } /* * Default values - * Do not avoid to set correct default parameters */ cs->mca_rules[coll][INTRA_NODE] = TUNED; cs->mca_rules[coll][INTER_NODE] = BASIC; cs->mca_rules[coll][GLOBAL_COMMUNICATOR] = HAN; - for(topo_lvl = 0 ; topo_lvl < NB_TOPO_LVL ; topo_lvl++) { + for(topo_lvl = 0; topo_lvl < NB_TOPO_LVL; topo_lvl++) { - snprintf(param_name, 100, "%s_dynamic_%s_module", - mca_coll_han_colltype_to_str(coll), + snprintf(param_name, sizeof(param_name), "%s_dynamic_%s_module", + mca_coll_base_colltype_to_str(coll), mca_coll_han_topo_lvl_to_str(topo_lvl)); - param_desc_size = snprintf(param_desc, 300, - "Collective module to use for " - "collective %s on %s topological level: ", - mca_coll_han_colltype_to_str(coll), + param_desc_size = snprintf(param_desc, sizeof(param_desc), + "Collective module to use for %s on %s topological level: ", + mca_coll_base_colltype_to_str(coll), mca_coll_han_topo_lvl_to_str(topo_lvl)); /* * Exhaustive description: @@ -410,10 +326,10 @@ static int han_register(void) /* Han can only be used on the global communicator */ continue; } - param_desc_size += snprintf(param_desc+param_desc_size, 300, + param_desc_size += snprintf(param_desc+param_desc_size, sizeof(param_desc) - param_desc_size, "%d = %s; ", component, - components_name[component]); + available_components[component].component_name); } mca_base_component_var_register(c, param_name, param_desc, @@ -424,45 +340,11 @@ static int han_register(void) } } - /* - * TODO: remove the following lines when auto-tune is added back to the code - */ - cs->han_auto_tune = 0; - - cs->han_auto_tune_n = 5; - cs->han_auto_tune_c = 3; - cs->han_auto_tune_m = 21; 
-#if 0 - cs->han_auto_tune_n = 5; - (void) mca_base_component_var_register(c, "auto_tune_n", - "auto tune n", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_n); - - cs->han_auto_tune_c = 3; - (void) mca_base_component_var_register(c, "auto_tune_c", - "auto tune c", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &cs->han_auto_tune_c); - - cs->han_auto_tune_m = 21; - (void) mca_base_component_var_register(c, "auto_tune_m", - "auto tune n", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &cs->han_auto_tune_m); -#endif - /* Dynamic rules */ cs->use_dynamic_file_rules = false; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "use_dynamic_file_rules", - "Switch used to decide if we use " - "dynamic module choice rules " - "defines by file", + "Enable the dynamic selection provided via the dynamic_rules_filename MCA", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, @@ -471,8 +353,7 @@ static int han_register(void) cs->dynamic_rules_filename = NULL; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "dynamic_rules_filename", - "Filename of configuration file that " - "contains the dynamic module choice rules", + "Configuration file containing the dynamic selection rules", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, @@ -481,9 +362,7 @@ static int han_register(void) cs->dump_dynamic_rules = false; (void) mca_base_component_var_register(&mca_coll_han_component.super.collm_version, "dump_dynamic_rules", - "Switch used to decide if we dump " - "dynamic rules provided by " - "configuration file", + "Switch used to decide if we dump dynamic rules provided by configuration file", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, @@ -492,11 +371,8 @@ static int 
han_register(void) if((cs->dump_dynamic_rules || NULL != cs->dynamic_rules_filename) && !cs->use_dynamic_file_rules) { opal_output_verbose(0, cs->han_output, - "coll:han:han_register " - "you asked for dynamic rules " - "but they are not activated. " - "Check coll_han_use_dynamic_file_rules " - "MCA parameter"); + "HAN: dynamic rules for collectives are not activated. " + "Check coll_han_use_dynamic_file_rules MCA parameter"); } cs->max_dynamic_errors = 10; diff --git a/ompi/mca/coll/han/coll_han_dynamic.c b/ompi/mca/coll/han/coll_han_dynamic.c index 2cda40e34bf..d32b12fbcd7 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.c +++ b/ompi/mca/coll/han/coll_han_dynamic.c @@ -22,31 +22,29 @@ */ bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id) { - switch (coll_id){ - case ALLGATHER: - case ALLGATHERV: - case ALLREDUCE: - case BCAST: - case GATHER: - case REDUCE: - case SCATTER: - return true; - default: - return false; + switch (coll_id) { + case ALLGATHER: + case ALLGATHERV: + case ALLREDUCE: + case BCAST: + case GATHER: + case REDUCE: + case SCATTER: + return true; + default: + return false; } } -static COMPONENT_T -component_name_to_id(const char* name) +COMPONENT_T +mca_coll_han_component_name_to_id(const char* name) { - int i; - if(NULL == name) { return -1; } - for(i=SELF ; itopologic_level; mca_coll_base_module_t *han_base_module = (mca_coll_base_module_t *) han_module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + int nb_modules = 0; + mca_coll_base_avail_coll_t *item; + /* If the modules are get yet, return success */ if(han_module->storage_initialized) { return OMPI_SUCCESS; @@ -76,7 +75,7 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, mca_coll_base_avail_coll_t) { mca_coll_base_module_t *module = item->ac_module; const char *name = item->ac_component_name; - int id = component_name_to_id(name); + int id = mca_coll_han_component_name_to_id(name); if(id >= 0 && NULL != module && module != han_base_module) { /* @@ 
-85,16 +84,10 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, */ han_module->modules_storage.modules[id].module_handler = module; opal_output_verbose(80, mca_coll_han_component.han_output, - "coll:han:get_all_coll_modules " - "Han found module %s with id %d " - "for topological level %d (%s) " - "for communicator (%d/%s)\n", - name, - id, - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + "coll:han:get_all_coll_modules HAN found module %s with id %d " + "for topological level %d (%s) for communicator (%d/%s)\n", + name, id, topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); nb_modules++; } } @@ -109,16 +102,11 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, } opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_all_coll_modules " - "Han sub-communicator modules storage " - "for topological level %d (%s) " - "gets %d modules " + "coll:han:get_all_coll_modules HAN sub-communicator modules storage " + "for topological level %d (%s) gets %d modules " "for communicator (%d/%s)\n", - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - nb_modules, - comm->c_contextid, - comm->c_name); + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + nb_modules, comm->c_contextid, comm->c_name); assert(0 != nb_modules); @@ -133,15 +121,13 @@ mca_coll_han_get_all_coll_modules(struct ompi_communicator_t *comm, */ static const msg_size_rule_t* get_dynamic_rule(COLLTYPE_T collective, - int msg_size, - struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module) + size_t msg_size, + struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) { /* Indexes of the rule */ - int coll_idx; - int topo_idx; - int conf_idx; - int msg_size_idx; + int coll_idx, topo_idx; + int conf_idx, msg_size_idx; /* Aliases */ const mca_coll_han_dynamic_rules_t *dynamic_rules = NULL; @@ -157,107 +143,78 @@ get_dynamic_rule(COLLTYPE_T collective, 
/* Find the collective rule */ dynamic_rules = &(mca_coll_han_component.dynamic_rules); - for(coll_idx = dynamic_rules->nb_collectives-1 ; - coll_idx >= 0 ; coll_idx--) { + for(coll_idx = dynamic_rules->nb_collectives-1; + coll_idx >= 0; coll_idx--) { if(dynamic_rules->collective_rules[coll_idx].collective_id == collective) { coll_rule = &(dynamic_rules->collective_rules[coll_idx]); break; } } - if(coll_idx < 0) { - /* - * No dynamic rules for this collective - */ + if(coll_idx < 0) { /* No dynamic rules for this collective */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched for collective %d (%s) " + "coll:han:get_dynamic_rule HAN searched for collective %d (%s) " "but did not find any rule for this collective\n", - collective, - mca_coll_han_colltype_to_str(collective)); + collective, mca_coll_base_colltype_to_str(collective)); return NULL; } /* Find the topologic level rule */ - for(topo_idx = coll_rule->nb_topologic_levels-1 ; - topo_idx >= 0 ; topo_idx--) { + for(topo_idx = coll_rule->nb_topologic_levels-1; + topo_idx >= 0; topo_idx--) { if(coll_rule->topologic_rules[topo_idx].topologic_level == topo_lvl) { topo_rule = &(coll_rule->topologic_rules[topo_idx]); break; } } - if(topo_idx < 0) { - /* - * No topologic level rules for this collective - */ + if(topo_idx < 0) { /* No topologic level rules for this collective */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched for topologic level %d (%s) rule " + "coll:han:get_dynamic_rule HAN searched for topologic level %d (%s) rule " "for collective %d (%s) but did not find any rule\n", - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - collective, - mca_coll_han_colltype_to_str(collective)); + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + collective, mca_coll_base_colltype_to_str(collective)); return NULL; } /* Find the configuration rule */ - for(conf_idx = topo_rule->nb_rules-1 ; - conf_idx 
>= 0 ; conf_idx--) { + for(conf_idx = topo_rule->nb_rules-1; + conf_idx >= 0; conf_idx--) { if(topo_rule->configuration_rules[conf_idx].configuration_size <= comm_size) { conf_rule = &(topo_rule->configuration_rules[conf_idx]); break; } } if(conf_idx < 0) { - /* - * No corresponding configuration - * Should not happen with a correct file - */ - + /* No corresponding configuration. Should not have happen with a correct file */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " "but did not manage to find anything. " "This is the result of an invalid configuration file: " "the first configuration size of each collective must be 1\n", - collective, - mca_coll_han_colltype_to_str(collective), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm_size); + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), comm_size); return NULL; } /* Find the message size rule */ - for(msg_size_idx = conf_rule->nb_msg_size-1 ; - msg_size_idx >= 0 ; msg_size_idx--) { + for(msg_size_idx = conf_rule->nb_msg_size-1; + msg_size_idx >= 0; msg_size_idx--) { if(conf_rule->msg_size_rules[msg_size_idx].msg_size <= msg_size) { msg_size_rule = &(conf_rule->msg_size_rules[msg_size_idx]); break; } } if(msg_size_idx < 0) { - /* - * No corresponding message size - * Should not happen with a correct file - */ + /* No corresponding message size. Should not happen with a correct file */ opal_output_verbose(60, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " - "for a %d sized message " - "but did not manage to find anything. 
" + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message but did not manage to find anything. " "This is the result of an invalid configuration file: " "the first message size of each configuration must be 0\n", - collective, - mca_coll_han_colltype_to_str(collective), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm_size, - msg_size); + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, msg_size); return NULL; } @@ -268,29 +225,19 @@ get_dynamic_rule(COLLTYPE_T collective, * Module correctness is checked outside */ opal_output_verbose(80, mca_coll_han_component.han_output, - "coll:han:get_dynamic_rule " - "Han searched a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " - "for a %d sized message. " - "Found a rule for collective %d (%s) " - "on topological level %d (%s) " - "for a %d configuration size " - "for a %d sized message : component %d (%s)\n", - collective, - mca_coll_han_colltype_to_str(collective), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm_size, - msg_size, - msg_size_rule->collective_id, - mca_coll_han_colltype_to_str(msg_size_rule->collective_id), + "coll:han:get_dynamic_rule HAN searched a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message. 
Found a rule for collective %d (%s) " + "on topological level %d (%s) for a %d configuration size " + "for a %" PRIsize_t " sized message : component %d (%s)\n", + collective, mca_coll_base_colltype_to_str(collective), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm_size, msg_size, msg_size_rule->collective_id, + mca_coll_base_colltype_to_str(msg_size_rule->collective_id), msg_size_rule->topologic_level, mca_coll_han_topo_lvl_to_str(msg_size_rule->topologic_level), msg_size_rule->configuration_size, - msg_size_rule->msg_size, - component, - components_name[component]); + msg_size_rule->msg_size, component, available_components[component].component_name); return msg_size_rule; } @@ -300,14 +247,13 @@ get_dynamic_rule(COLLTYPE_T collective, * for a msg_size sized message on the comm communicator * following the dynamic rules */ -mca_coll_base_module_t * +static mca_coll_base_module_t* get_module(COLLTYPE_T coll_id, - int msg_size, + size_t msg_size, struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module) { const msg_size_rule_t *dynamic_rule; - mca_coll_base_module_t *sub_module = NULL; TOPO_LVL_T topo_lvl; COMPONENT_T mca_rule_component; @@ -323,37 +269,26 @@ get_module(COLLTYPE_T coll_id, han_module); if(NULL != dynamic_rule) { /* Use dynamic rule from file */ - sub_module = han_module->modules_storage - .modules[dynamic_rule->component] - .module_handler; - } else { + return han_module->modules_storage.modules[dynamic_rule->component].module_handler; + } + /* + * No dynamic rule from file + * Use rule from mca parameter + */ + if(mca_rule_component < 0 || mca_rule_component >= COMPONENTS_COUNT) { /* - * No dynamic rule from file - * Use rule from mca parameter + * Invalid MCA parameter value + * Warn the user and return NULL */ - if(mca_rule_component < 0 || mca_rule_component >= COMPONENTS_COUNT) { - /* - * Invalid MCA parameter value - * Warn the user and return NULL - */ - opal_output_verbose(0, mca_coll_han_component.han_output, - 
"coll:han:get_module " - "Invalid MCA parameter value %d " - "for collective %d (%s) " - "on topologic level %d (%s)\n", - mca_rule_component, - coll_id, - mca_coll_han_colltype_to_str(coll_id), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl)); - return NULL; - } - sub_module = han_module->modules_storage - .modules[mca_rule_component] - .module_handler; + opal_output_verbose(0, mca_coll_han_component.han_output, + "coll:han:get_module Invalid MCA parameter value %d " + "for collective %d (%s) on topologic level %d (%s)\n", + mca_rule_component, coll_id, + mca_coll_base_colltype_to_str(coll_id), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl)); + return NULL; } - - return sub_module; + return han_module->modules_storage.modules[mca_rule_component].module_handler; } @@ -365,38 +300,35 @@ get_module(COLLTYPE_T coll_id, */ int mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allgather_fn_t allgather; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ - ompi_datatype_type_size(sdtype, &dtype_size); - msg_size = dtype_size * scount; - + if( MPI_IN_PLACE != sbuf ) { + ompi_datatype_type_size(sdtype, &dtype_size); + dtype_size = dtype_size * scount; + } else { + ompi_datatype_type_size(rdtype, &dtype_size); + dtype_size = dtype_size * rcount; + } sub_module = 
get_module(ALLGATHER, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { /* only rank 0's first errors keep verbosity 0 */ verbosity = 30; } @@ -408,26 +340,17 @@ mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgather_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - ALLGATHER, - mca_coll_han_colltype_to_str(ALLGATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHER, mca_coll_base_colltype_to_str(ALLGATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLGATHER: No module found for the sub-" - "communicator. " + "HAN/ALLGATHER: No module found for the sub-communicator. 
" "Falling back to another component\n")); - return han_module->previous_allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - han_module - ->previous_allgather_module); + allgather = han_module->previous_allgather; + sub_module = han_module->previous_allgather_module; } else if (NULL == sub_module->coll_allgather) { /* * No valid collective from dynamic rules @@ -435,62 +358,43 @@ mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, */ han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_allgather_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "coll:han:mca_coll_han_allgather_intra_dynamic HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - ALLGATHER, - mca_coll_han_colltype_to_str(ALLGATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHER, mca_coll_base_colltype_to_str(ALLGATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLGATHER: the module found for the sub-" - "communicator cannot handle the ALLGATHER operation. " - "Falling back to another component\n")); - return han_module->previous_allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - han_module - ->previous_allgather_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + "HAN/ALLGATHER: the module found for the sub-communicator" + " cannot handle the ALLGATHER operation. 
Falling back to another component\n")); + allgather = han_module->previous_allgather; + sub_module = han_module->previous_allgather_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_allgather is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_allgather_fn_t allgather; if(mca_coll_han_component.use_simple_algorithm[ALLGATHER]) { allgather = mca_coll_han_allgather_intra_simple; } else { allgather = mca_coll_han_allgather_intra; } - - return allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - sub_module); + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + allgather = sub_module->coll_allgather; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_allgather is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_allgather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - comm, - sub_module); + return allgather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + comm, + sub_module); } @@ -503,30 +407,25 @@ mca_coll_han_allgather_intra_dynamic(const void *sbuf, int scount, */ int mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, const int *rcounts, - const int *displs, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, const int *rcounts, + const int *displs, + struct ompi_datatype_t *rdtype, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size, msg_size; - int rank; - int 
verbosity; - int comm_size; - int i; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allgatherv_fn_t allgatherv; + int rank, verbosity = 0, comm_size, i; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size, msg_size = 0; /* Compute configuration information for dynamic rules */ comm_size = ompi_comm_size(comm); ompi_datatype_type_size(rdtype, &dtype_size); - msg_size = 0; - for(i = 0 ; i < comm_size ; i++) { + for(i = 0; i < comm_size; i++) { if(dtype_size * rcounts[i] > msg_size) { msg_size = dtype_size * rcounts[i]; } @@ -539,11 +438,7 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { /* only rank 0's first errors keep verbosity 0 */ verbosity = 30; } @@ -555,26 +450,17 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgatherv_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). 
" "Please check dynamic file/mca parameters\n", - ALLGATHERV, - mca_coll_han_colltype_to_str(ALLGATHERV), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLGATHERV: No module found for the sub-" - "communicator. " + "HAN/ALLGATHERV: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - han_module - ->previous_allgatherv_module); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; } else if (NULL == sub_module->coll_allgatherv) { /* * No valid collective from dynamic rules @@ -583,31 +469,24 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgatherv_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. 
" "Please check dynamic file/mca parameters\n", - ALLGATHERV, - mca_coll_han_colltype_to_str(ALLGATHERV), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/ALLGATHERV: the module found for the sub-" "communicator cannot handle the ALLGATHERV operation. " "Falling back to another component\n")); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; return han_module->previous_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - han_module - ->previous_allgatherv_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + rbuf, rcounts, displs, + rdtype, comm, + han_module->previous_allgatherv_module); + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid @@ -616,36 +495,28 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, */ opal_output_verbose(30, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allgatherv_intra_dynamic " - "Han used for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " + "HAN used for collective %d (%s) with topological level %d (%s) " + "on communicator (%d/%s) but this module cannot handle " "this collective on this topologic level\n", - ALLGATHERV, - mca_coll_han_colltype_to_str(ALLGATHERV), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); - return han_module->previous_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - han_module - ->previous_allgatherv_module); + ALLGATHERV, mca_coll_base_colltype_to_str(ALLGATHERV), + topo_lvl, 
mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); + allgatherv = han_module->previous_allgatherv; + sub_module = han_module->previous_allgatherv_module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_allgatherv is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + allgatherv = sub_module->coll_allgatherv; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_allgatherv is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_allgatherv(sbuf, scount, sdtype, - rbuf, rcounts, displs, - rdtype, comm, - sub_module); + return allgatherv(sbuf, scount, sdtype, + rbuf, rcounts, displs, + rdtype, comm, + sub_module); } @@ -657,39 +528,32 @@ mca_coll_han_allgatherv_intra_dynamic(const void *sbuf, int scount, */ int mca_coll_han_allreduce_intra_dynamic(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_allreduce_fn_t allreduce; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(dtype, &dtype_size); - msg_size = dtype_size * count; + dtype_size = dtype_size * count; sub_module = get_module(ALLREDUCE, - msg_size, + dtype_size, comm, han_module); /* First 
errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { /* only rank 0's first errors keep verbosity 0 */ verbosity = 30; } @@ -701,25 +565,17 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allreduce_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - ALLREDUCE, - mca_coll_han_colltype_to_str(ALLREDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLREDUCE, mca_coll_base_colltype_to_str(ALLREDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/ALLREDUCE: No module found for the sub-" - "communicator. " + "HAN/ALLREDUCE: No module found for the sub-communicator. 
" "Falling back to another component\n")); - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, - op, comm, - han_module - ->previous_allreduce_module); + allreduce = han_module->previous_allreduce; + sub_module = han_module->previous_allreduce_module; } else if (NULL == sub_module->coll_allreduce) { /* * No valid collective from dynamic rules @@ -728,60 +584,49 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_allreduce_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - ALLREDUCE, - mca_coll_han_colltype_to_str(ALLREDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + ALLREDUCE, mca_coll_base_colltype_to_str(ALLREDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/ALLREDUCE: the module found for the sub-" "communicator cannot handle the ALLREDUCE operation. 
" "Falling back to another component\n")); - return han_module->previous_allreduce(sbuf, rbuf, count, dtype, - op, comm, - han_module - ->previous_allreduce_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + allreduce = han_module->previous_allreduce; + sub_module = han_module->previous_allreduce_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* Reproducibility: fallback on reproducible algo */ if (mca_coll_han_component.han_reproducible) { - return mca_coll_han_allreduce_reproducible(sbuf, rbuf, count, dtype, op, - comm, module); + allreduce = mca_coll_han_allreduce_reproducible; + } else { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_allreduce is valid and point to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { + allreduce = mca_coll_han_allreduce_intra_simple; + } else { + allreduce = mca_coll_han_allreduce_intra; + } } + sub_module = module; + } else { /* - * No fallback mechanism activated for this configuration + * If we get here: * sub_module is valid - * sub_module->coll_allreduce is valid and point to this function - * Call han topological collective algorithm + * sub_module->coll_allreduce is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective */ - mca_coll_base_module_allreduce_fn_t allreduce; - if(mca_coll_han_component.use_simple_algorithm[ALLREDUCE]) { - allreduce = mca_coll_han_allreduce_intra_simple; - } else { - allreduce = mca_coll_han_allreduce_intra; - } - return allreduce(sbuf, rbuf, count, dtype, - op, comm, module); + allreduce = sub_module->coll_allreduce; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_allreduce is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the 
collective - */ - return sub_module->coll_allreduce(sbuf, rbuf, count, dtype, - op, comm, sub_module); + return allreduce(sbuf, rbuf, count, dtype, + op, comm, sub_module); } @@ -793,38 +638,31 @@ mca_coll_han_allreduce_intra_dynamic(const void *sbuf, */ int mca_coll_han_bcast_intra_dynamic(void *buff, - int count, - struct ompi_datatype_t *dtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + int count, + struct ompi_datatype_t *dtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_bcast_fn_t bcast; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(dtype, &dtype_size); - msg_size = dtype_size * count; + dtype_size = dtype_size * count; sub_module = get_module(BCAST, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 != rank) || (han_module->dynamic_errors >= mca_coll_han_component.max_dynamic_errors) ) { /* only rank 0's first errors keep verbosity 0 */ verbosity = 30; } @@ -836,23 +674,17 @@ mca_coll_han_bcast_intra_dynamic(void *buff, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_bcast_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). 
" "Please check dynamic file/mca parameters\n", - BCAST, - mca_coll_han_colltype_to_str(BCAST), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + BCAST, mca_coll_base_colltype_to_str(BCAST), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/BCAST: No module found for the sub-" - "communicator. " + "HAN/BCAST: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, comm, - han_module->previous_bcast_module); + bcast = han_module->previous_bcast; + sub_module = han_module->previous_bcast_module; } else if (NULL == sub_module->coll_bcast) { /* * No valid collective from dynamic rules @@ -861,61 +693,44 @@ mca_coll_han_bcast_intra_dynamic(void *buff, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_bcast_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - BCAST, - mca_coll_han_colltype_to_str(BCAST), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + BCAST, mca_coll_base_colltype_to_str(BCAST), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/BCAST: the module found for the sub-" "communicator cannot handle the BCAST operation. 
" "Falling back to another component\n")); - return han_module->previous_bcast(buff, count, dtype, root, comm, - han_module->previous_bcast_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + bcast = han_module->previous_bcast; + sub_module = han_module->previous_bcast_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_bcast is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_bcast_fn_t bcast; if(mca_coll_han_component.use_simple_algorithm[BCAST]) { bcast = mca_coll_han_bcast_intra_simple; } else { bcast = mca_coll_han_bcast_intra; } - return bcast(buff, - count, - dtype, - root, - comm, - module); + sub_module = module; + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_bcast is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + bcast = sub_module->coll_bcast; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_bcast is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_bcast(buff, - count, - dtype, - root, - comm, - sub_module); + return bcast(buff, count, dtype, + root, comm, sub_module); } @@ -927,39 +742,37 @@ mca_coll_han_bcast_intra_dynamic(void *buff, */ int mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int 
rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_gather_fn_t gather; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ - ompi_datatype_type_size(sdtype, &dtype_size); - msg_size = dtype_size * scount; + if( MPI_IN_PLACE != sbuf ) { + ompi_datatype_type_size(sdtype, &dtype_size); + dtype_size = dtype_size * scount; + } else { + ompi_datatype_type_size(rdtype, &dtype_size); + dtype_size = dtype_size * rcount; + } sub_module = get_module(GATHER, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -971,26 +784,17 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_gather_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). 
" "Please check dynamic file/mca parameters\n", - GATHER, - mca_coll_han_colltype_to_str(GATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + GATHER, mca_coll_base_colltype_to_str(GATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/GATHER: No module found for the sub-" - "communicator. " + "HAN/GATHER: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_gather_module); + gather = han_module->previous_gather; + sub_module = han_module->previous_gather_module; } else if (NULL == sub_module->coll_gather) { /* * No valid collective from dynamic rules @@ -999,62 +803,45 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_gather_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - GATHER, - mca_coll_han_colltype_to_str(GATHER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + GATHER, mca_coll_base_colltype_to_str(GATHER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/GATHER: the module found for the sub-" "communicator cannot handle the GATHER operation. 
" "Falling back to another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_gather_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + gather = han_module->previous_gather; + sub_module = han_module->previous_gather_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_gather is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_gather_fn_t gather; if(mca_coll_han_component.use_simple_algorithm[GATHER]) { gather = mca_coll_han_gather_intra_simple; } else { gather = mca_coll_han_gather_intra; } - - - return gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + } else { + /* + * If we get here: + * sub_module is valid + * sub_module->coll_gather is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective + */ + gather = sub_module->coll_gather; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_gather is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_gather(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + return gather(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); } @@ -1066,40 +853,33 @@ mca_coll_han_gather_intra_dynamic(const void *sbuf, int scount, */ int mca_coll_han_reduce_intra_dynamic(const void *sbuf, - void *rbuf, - int count, - struct ompi_datatype_t *dtype, - struct ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void *rbuf, + int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + int root, + struct 
ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_reduce_fn_t reduce; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(dtype, &dtype_size); - msg_size = dtype_size * count; + dtype_size = dtype_size * count; sub_module = get_module(REDUCE, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -1111,25 +891,17 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_reduce_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). " "Please check dynamic file/mca parameters\n", - REDUCE, - mca_coll_han_colltype_to_str(REDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + REDUCE, mca_coll_base_colltype_to_str(REDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/REDUCE: No module found for the sub-" - "communicator. " + "HAN/REDUCE: No module found for the sub-communicator. 
" "Falling back to another component\n")); - return han_module->previous_reduce(sbuf, rbuf, count, dtype, - op, root, comm, - han_module - ->previous_reduce_module); + reduce = han_module->previous_reduce; + sub_module = han_module->previous_reduce_module; } else if (NULL == sub_module->coll_reduce) { /* * No valid collective from dynamic rules @@ -1138,60 +910,51 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_reduce_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - REDUCE, - mca_coll_han_colltype_to_str(REDUCE), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + REDUCE, mca_coll_base_colltype_to_str(REDUCE), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/REDUCE: the module found for the sub-" "communicator cannot handle the REDUCE operation. 
" "Falling back to another component\n")); - return han_module->previous_reduce(sbuf, rbuf, count, dtype, - op, root, comm, - han_module - ->previous_reduce_module); + reduce = han_module->previous_reduce; + sub_module = han_module->previous_reduce_module; } if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* Reproducibility: fallback on reproducible algo */ if (mca_coll_han_component.han_reproducible) { - return mca_coll_han_reduce_reproducible(sbuf, rbuf, count, dtype, op, - root, comm, module); + reduce = mca_coll_han_reduce_reproducible; + } else { + /* + * No fallback mechanism activated for this configuration + * sub_module is valid + * sub_module->coll_reduce is valid and point to this function + * Call han topological collective algorithm + */ + if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { + reduce = mca_coll_han_reduce_intra_simple; + } else { + reduce = mca_coll_han_reduce_intra; + } } + sub_module = module; + } else { /* - * No fallback mechanism activated for this configuration + * If we get here: * sub_module is valid - * sub_module->coll_reduce is valid and point to this function - * Call han topological collective algorithm + * sub_module->coll_reduce is valid + * They points to the collective to use, according to the dynamic rules + * Selector's job is done, call the collective */ - mca_coll_base_module_reduce_fn_t reduce; - if(mca_coll_han_component.use_simple_algorithm[REDUCE]) { - reduce = mca_coll_han_reduce_intra_simple; - } else { - reduce = mca_coll_han_reduce_intra; - } - return reduce(sbuf, rbuf, count, dtype, - op, root, comm, module); + reduce = sub_module->coll_reduce; } - - /* - * If we get here: - * sub_module is valid - * sub_module->coll_reduce is valid - * They points to the collective to use, according to the dynamic rules - * Selector's job is done, call the collective - */ - return sub_module->coll_reduce(sbuf, rbuf, count, dtype, - op, root, comm, sub_module); + return reduce(sbuf, rbuf, count, dtype, 
+ op, root, comm, sub_module); } @@ -1203,39 +966,32 @@ mca_coll_han_reduce_intra_dynamic(const void *sbuf, */ int mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - size_t dtype_size; - int msg_size; - int rank; - int verbosity; mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module; + TOPO_LVL_T topo_lvl = han_module->topologic_level; + mca_coll_base_module_scatter_fn_t scatter; mca_coll_base_module_t *sub_module; - TOPO_LVL_T topo_lvl; - - topo_lvl = han_module->topologic_level; + size_t dtype_size; + int rank, verbosity = 0; /* Compute configuration information for dynamic rules */ ompi_datatype_type_size(rdtype, &dtype_size); - msg_size = dtype_size * rcount; + dtype_size = dtype_size * rcount; sub_module = get_module(SCATTER, - msg_size, + dtype_size, comm, han_module); /* First errors are always printed by rank 0 */ rank = ompi_comm_rank(comm); - if(0 == rank - && han_module->dynamic_errors - < mca_coll_han_component.max_dynamic_errors) { - verbosity = 0; - } else { + if( (0 == rank) && (han_module->dynamic_errors < mca_coll_han_component.max_dynamic_errors) ) { verbosity = 30; } @@ -1247,26 +1003,17 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_scatter_intra_dynamic " - "Han did not find any valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s). " + "HAN did not find any valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s). 
" "Please check dynamic file/mca parameters\n", - SCATTER, - mca_coll_han_colltype_to_str(SCATTER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + SCATTER, mca_coll_base_colltype_to_str(SCATTER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "HAN/SCATTER: No module found for the sub-" - "communicator. " + "HAN/SCATTER: No module found for the sub-communicator. " "Falling back to another component\n")); - return han_module->previous_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_scatter_module); + scatter = han_module->previous_scatter; + sub_module = han_module->previous_scatter_module; } else if (NULL == sub_module->coll_scatter) { /* * No valid collective from dynamic rules @@ -1275,38 +1022,26 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, han_module->dynamic_errors++; opal_output_verbose(verbosity, mca_coll_han_component.han_output, "coll:han:mca_coll_han_scatter_intra_dynamic " - "Han found valid module for " - "collective %d (%s) " - "with topological level %d (%s) " - "on communicator (%d/%s) " - "but this module cannot handle " - "this collective. " + "HAN found valid module for collective %d (%s) " + "with topological level %d (%s) on communicator (%d/%s) " + "but this module cannot handle this collective. " "Please check dynamic file/mca parameters\n", - SCATTER, - mca_coll_han_colltype_to_str(SCATTER), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - comm->c_contextid, - comm->c_name); + SCATTER, mca_coll_base_colltype_to_str(SCATTER), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), + comm->c_contextid, comm->c_name); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/SCATTER: the module found for the sub-" "communicator cannot handle the SCATTER operation. 
" "Falling back to another component\n")); - return han_module->previous_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module - ->previous_scatter_module); - } - - if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { + scatter = han_module->previous_scatter; + sub_module = han_module->previous_scatter_module; + } else if (GLOBAL_COMMUNICATOR == topo_lvl && sub_module == module) { /* * No fallback mechanism activated for this configuration * sub_module is valid * sub_module->coll_scatter is valid and point to this function * Call han topological collective algorithm */ - mca_coll_base_module_scatter_fn_t scatter; scatter = mca_coll_han_scatter_intra; /* * TODO: Uncomment when scatter simple is merged @@ -1316,10 +1051,8 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, * scatter = mca_coll_han_scatter_intra; * } */ - return scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + } else { + scatter = sub_module->coll_scatter; } /* @@ -1329,10 +1062,8 @@ mca_coll_han_scatter_intra_dynamic(const void *sbuf, int scount, * They points to the collective to use, according to the dynamic rules * Selector's job is done, call the collective */ - return sub_module->coll_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - sub_module); + return scatter(sbuf, scount, sdtype, + rbuf, rcount, rdtype, + root, comm, + sub_module); } - - diff --git a/ompi/mca/coll/han/coll_han_dynamic.h b/ompi/mca/coll/han/coll_han_dynamic.h index 979b292ba0f..0ccecb63ba3 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.h +++ b/ompi/mca/coll/han/coll_han_dynamic.h @@ -1,5 +1,8 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved. 
* * $COPYRIGHT$ @@ -27,9 +30,9 @@ * ################################################# * * Han dynamic rules allow the user to define the collective - * module to call depending the topological configuration of the + * module to call depending on the topological configuration of the * sub-communicators and the collective parameters. This mechanism - * can also be used to fallback the main collective on another module. + * can also be used to fallback to the main collective on another module. * The interface is described in coll_han_dynamic_file.h. * * ############################# @@ -39,7 +42,7 @@ * directly accesses the module on the communicator. This information is * stored in the collective structure of the communicator during the collective * module choice at the communicator initialization. When han needs this - * information for the first time, it identifies the modles by their name and + * information for the first time, it identifies the modules by their name and * stores them in its module structure. * Then, the modules are identified by their identifier. * @@ -69,7 +72,7 @@ * adds an indirection on the collective call: dynamic choice functions. These * functions do not implement any collective. First, they try to find a dynamic * rule from file for the given collective. If there is not any rule for the - * fiven configuration, MCA parameter defined rules are used. Once the module + * given configuration, MCA parameter defined rules are used. Once the module * to use is found, the correct collective implementation is called. * * This indirection is also used on the global communicator. 
This allows han @@ -92,11 +95,9 @@ * by increasing value, some of them will not be considered */ -BEGIN_C_DECLS - /* Dynamic rules support */ typedef enum COMPONENTS { - SELF=0, + SELF = 0, BASIC, LIBNBC, TUNED, @@ -107,18 +108,17 @@ typedef enum COMPONENTS { COMPONENTS_COUNT } COMPONENT_T; -static const char *components_name[]={"self", - "basic", - "libnbc", - "tuned", - "sm", - "shared", - "adapt", - "han"}; +typedef struct { + COMPONENT_T id; + char* component_name; + mca_coll_base_component_t* component; +} ompi_coll_han_components; + +extern ompi_coll_han_components available_components[COMPONENTS_COUNT]; /* Topologic levels */ typedef enum TOPO_LVL { - INTRA_NODE=0, + INTRA_NODE = 0, INTER_NODE, /* Identifies the global communicator as a topologic level */ GLOBAL_COMMUNICATOR, @@ -135,7 +135,7 @@ typedef struct msg_size_rule_s { int configuration_size; /* Message size of the rule */ - int msg_size; + size_t msg_size; /* Component to use on this specific configuration * and message size */ @@ -209,6 +209,6 @@ typedef struct mca_coll_han_collective_modules_storage_s { /* Tests if a dynamic collective is implemented */ bool mca_coll_han_is_coll_dynamic_implemented(COLLTYPE_T coll_id); +COMPONENT_T mca_coll_han_component_name_to_id(const char* name); -END_C_DECLS #endif diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c index d163071edc2..fc1fbbaa767 100644 --- a/ompi/mca/coll/han/coll_han_dynamic_file.c +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -26,11 +26,14 @@ #include "ompi/mca/coll/base/coll_base_util.h" +#define getnext_long(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) +#define getnext_string(fptr, pval) ompi_coll_base_file_getnext_string(fptr, &fileline, pval) +#define getnext_size_t(fptr, pval) ompi_coll_base_file_getnext_size_t(fptr, &fileline, pval) + static void check_dynamic_rules(void); /* Current file line for verbose message */ static int fileline = 1; -#define 
getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline) int mca_coll_han_init_dynamic_rules(void) @@ -38,31 +41,31 @@ mca_coll_han_init_dynamic_rules(void) /* File management */ const char *fname; FILE *fptr = NULL; - int nb_entries = 0; + int nb_entries = 0, rc; /* Loop counters */ int i, j, k, l; /* Collective informations */ - int nb_coll; - COLLTYPE_T coll_id; + long nb_coll, coll_id; + char * coll_name = NULL; collective_rule_t *coll_rules; /* Topo informations */ - int nb_topo; - TOPO_LVL_T topo_lvl; + long nb_topo, topo_lvl; topologic_rule_t *topo_rules; /* Configuration informations */ - int nb_rules, conf_size; + long nb_rules, conf_size; configuration_rule_t *conf_rules; /* Message size informations */ - int nb_msg_size, msg_size; + long nb_msg_size; + size_t msg_size; msg_size_rule_t *msg_size_rules; /* Component informations */ - COMPONENT_T component; + long component; /* If the dynamic rules are not used, do not even read the file */ if(!mca_coll_han_component.use_dynamic_file_rules) { @@ -70,47 +73,31 @@ mca_coll_han_init_dynamic_rules(void) return OMPI_SUCCESS; } - fname = mca_coll_han_component.dynamic_rules_filename; - - if(NULL == fname) { + if( NULL == (fname = mca_coll_han_component.dynamic_rules_filename) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "coll_han_use_dynamic_file_rules is true but " - "coll_han_dynamic_rules_filename is not set: " - "coll han will use dynamic rules from mca " - "parameters and their default value\n"); + "coll:han:mca_coll_han_init_dynamic_rules coll_han_use_dynamic_file_rules is set but " + "coll_han_dynamic_rules_filename is not Rules from MCA parameters will be used instead\n"); mca_coll_han_component.dynamic_rules.nb_collectives = 0; return OMPI_SUCCESS; } - fptr = fopen(fname, "r"); - - if(NULL == fptr) { + if( NULL == (fptr = fopen(fname, "r")) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - 
"coll:han:mca_coll_han_init_dynamic_rules " - "cannot open dynamic file provided by " - "coll_han_dynamic_rules_filename=%s " - "please provide it with full path and " - "check file permissions. Rules from " - "MCA parameters will be used instead\n", + "coll:han:mca_coll_han_init_dynamic_rules cannot open dynamic file provided by " + "coll_han_dynamic_rules_filename=%s. Make sure it provides the full path and " + "check file permissions. Rules from MCA parameters will be used instead\n", fname); mca_coll_han_component.dynamic_rules.nb_collectives = 0; return OMPI_SUCCESS; } /* The first information of the file is the collective count */ - nb_coll = getnext(fptr); - - if(nb_coll <= 0) { + if( (getnext_long(fptr, &nb_coll) < 0) || (nb_coll <= 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for collective count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for collective count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_coll); + fname, fileline, nb_coll); mca_coll_han_component.dynamic_rules.nb_collectives = 0; goto file_reading_error; } @@ -126,69 +113,69 @@ mca_coll_han_init_dynamic_rules(void) } /* Iterates on collective rules */ - for(i=0 ; i= COLLCOUNT) { + if( getnext_string(fptr, &coll_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "invalid collective id %d at line %d: the collective " - "must be at least %d and less than %d\n", - coll_id, - fileline, - ALLGATHER, - COLLCOUNT); - coll_rules[i].nb_topologic_levels = 0; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules invalid collective at line %d." 
+ "The rest of the input file will be ignored.\n", + fileline); goto file_reading_error; } + coll_id = mca_coll_base_name_to_colltype(coll_name); + if( (coll_id < ALLGATHER) || (coll_id >= COLLCOUNT)) { + /* maybe the file was in the old format and we read the collective index instead of the name. */ + char* endp; + coll_id = strtol(coll_name, &endp, 10); + if( '\0' != *endp ) { /* there is garbage in the input */ + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules invalid collective %s " + "at line %d: the collective must be at least %d and less than %d. " + "The rest of the input file will be ignored.\n", + coll_name, fileline, ALLGATHER, COLLCOUNT); + goto file_reading_error; + } + free(coll_name); + coll_name = NULL; + const char *tmp_name = mca_coll_base_colltype_to_str(coll_id); + if (NULL != tmp_name) { + coll_name = strdup(tmp_name); + } + } if(!mca_coll_han_is_coll_dynamic_implemented(coll_id)) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "read collective id %d at line %d " - "but this collective is not implemented yet. " - "This is not an error but this set of rules " - "will not be used\n", - fname, - coll_id, - fileline); + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "read collective id %ld at line %d but this collective is not implemented yet. 
" + "This is not an error but this set of rules will not be used\n", + fname, coll_id, fileline); } /* * The first information of a collective rule * is the number of topologic rules */ - nb_topo = getnext(fptr); - if(nb_topo < 0) { + if( (getnext_long(fptr, &nb_topo) < 0) || (nb_topo < 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for topo level count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for topo level count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_topo); - coll_rules[i].nb_topologic_levels = 0; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, nb_topo); goto file_reading_error; } /* Store the collective rule informations */ - coll_rules[i].collective_id = coll_id; coll_rules[i].nb_topologic_levels = nb_topo; + coll_rules[i].collective_id = (COLLTYPE_T)coll_id; if(0 == nb_topo) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for topo level count\n", - fname, - fileline, - nb_topo); + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for topo level count\n", + fname, fileline, nb_topo); continue; } @@ -197,30 +184,21 @@ mca_coll_han_init_dynamic_rules(void) coll_rules[i].topologic_rules = topo_rules; if(NULL == topo_rules) { coll_rules[i].nb_topologic_levels = 0; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; goto cannot_allocate; } /* Iterates on topologic rules */ - for(j=0 ; j= NB_TOPO_LVL) { + if( (getnext_long(fptr, &topo_lvl) < 0) || (topo_lvl < INTRA_NODE) || (topo_lvl >= NB_TOPO_LVL) ) { opal_output_verbose(5, 
mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid topo level %d is given " - "or the reader encountered an unexpected EOF. " - "Topologic level must be at least %d and " - "less than %d\n", - fname, - fileline, - topo_lvl, - INTRA_NODE, - NB_TOPO_LVL); - topo_rules[j].nb_rules = 0; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid topo level %ld is given or the reader encountered an unexpected EOF. " + "Topologic level must be at least %d and less than %d\n", + fname, fileline, topo_lvl, INTRA_NODE, NB_TOPO_LVL); goto file_reading_error; } @@ -228,38 +206,26 @@ mca_coll_han_init_dynamic_rules(void) * The first information of a topologic rule * is the number of configurations */ - nb_rules = getnext(fptr); - - if(nb_rules < 0) { + nb_rules = -1; + if( (getnext_long(fptr, &nb_rules) < 0) || (nb_rules < 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d " - "is given for rules count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for rules count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_rules); - topo_rules[j].nb_rules = 0; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, nb_rules); goto file_reading_error; } /* Store the topologic rule informations */ topo_rules[j].collective_id = coll_id; - topo_rules[j].topologic_level = topo_lvl; + topo_rules[j].topologic_level = (TOPO_LVL_T)topo_lvl; topo_rules[j].nb_rules = nb_rules; if(0 == nb_rules) { opal_output_verbose(5, 
mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for configuration rules count\n", - fname, - fileline, - nb_rules); + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for configuration rules count\n", + fname, fileline, nb_rules); continue; } @@ -268,32 +234,21 @@ mca_coll_han_init_dynamic_rules(void) topo_rules[j].configuration_rules = conf_rules; if(NULL == conf_rules) { topo_rules[j].nb_rules = 0; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; goto cannot_allocate; } /* Iterate on configuration rules */ - for(k=0 ; k 1)) { + /* Get the configuration size */ + if( (getnext_long(fptr, &conf_size) < 0) || (conf_size < 1) || (0 == k && conf_size > 1) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "invalid configuration size %d at line %d " - "or the reader encountered an unexpected EOF " - "the configuration size must be at least %d " - "and the first configuration size " - "of a topologic level must be %d\n", - conf_size, - fileline, - 1, - 1); - conf_rules[k].nb_msg_size = 0; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules invalid configuration size %ld at line %d " + "or the reader encountered an unexpected EOF the configuration size must be at least %d " + "and the first configuration size of a topologic level must be %d\n", + conf_size, fileline, 1, 1); goto file_reading_error; } @@ -301,21 +256,12 @@ mca_coll_han_init_dynamic_rules(void) * The first information of a configuration rule * is the number of message size rules */ - nb_msg_size = getnext(fptr); - if(nb_msg_size < 0) { + if( (getnext_long(fptr, &nb_msg_size) < 0) 
|| (nb_msg_size < 0) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d " - "is given for message size rules count " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %ld is given for message size rules count " "or the reader encountered an unexpected EOF\n", - fname, - fileline, - nb_msg_size); - conf_rules[k].nb_msg_size = 0; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, nb_msg_size); goto file_reading_error; } @@ -327,13 +273,9 @@ mca_coll_han_init_dynamic_rules(void) if(0 == nb_msg_size) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on dynamic rules file %s " - "at line %d: an invalid value %d is given " - "for message size rules count\n", - fname, - fileline, - nb_msg_size); + "coll:han:mca_coll_han_init_dynamic_rules Warning on dynamic rules file %s " + "at line %d: an invalid value %ld is given for message size rules count\n", + fname, fileline, nb_msg_size); continue; } @@ -342,88 +284,99 @@ mca_coll_han_init_dynamic_rules(void) conf_rules[k].msg_size_rules = msg_size_rules; if(NULL == msg_size_rules) { conf_rules[k].nb_msg_size = 0; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; goto cannot_allocate; } /* Iterate on message size rules */ - for(l=0 ; l 1)) { + rc = getnext_size_t(fptr, &msg_size); + if( (rc < 0) || + (0 == l && msg_size > 1)) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid value %d " - "is given for message size " - "or the reader encountered " - 
"an unexpected EOF. " - "The first message size rule of " - "a configuration must be 0\n", - fname, - fileline, - msg_size); - conf_rules[k].nb_msg_size = l+1; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid value %" PRIsize_t " is given for message size " + "or the reader encountered an unexpected EOF. " + "The first message size rule of a configuration must be 0\n", + fname, fileline, msg_size); goto file_reading_error; } /* Get the component identifier for this message size rule */ - component = getnext(fptr); - if(component < SELF || component >= COMPONENTS_COUNT) { + if( getnext_string(fptr, &target_comp_name) < 0 ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: cannot read the name of a collective component\n", + fname, fileline); + goto file_reading_error; + } + component = mca_coll_han_component_name_to_id(target_comp_name); + if( (component < SELF) || (component >= COMPONENTS_COUNT) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "found an error on dynamic rules file %s " - "at line %d: an invalid collective " - "component id %d is given or the " - "reader encountered an unexpected EOF. " - "Collective component id must be at " + "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " + "at line %d: an invalid collective component name %s was given or the " + "reader encountered an unexpected EOF. 
Collective component id must be at " "least %d and less than %d\n", - fname, - fileline, - component, - SELF, - COMPONENTS_COUNT); - conf_rules[k].nb_msg_size = l+1; - topo_rules[j].nb_rules = k+1; - coll_rules[i].nb_topologic_levels = j+1; - mca_coll_han_component.dynamic_rules.nb_collectives = i+1; + fname, fileline, target_comp_name, SELF, COMPONENTS_COUNT); + free(target_comp_name); goto file_reading_error; } - /* Store message size rule informations */ + /* Store message size rule information */ msg_size_rules[l].collective_id = coll_id; msg_size_rules[l].topologic_level = topo_lvl; msg_size_rules[l].configuration_size = conf_size; msg_size_rules[l].msg_size = msg_size; - msg_size_rules[l].component = component; + msg_size_rules[l].component = (COMPONENT_T)component; nb_entries++; + /* do we have the optional segment length */ + if( 1 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, '[') ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules found optional pipelining segment lengths\n"); + long seglength; + if( 0 != topo_lvl ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "file %s line %d found segment lengths for topological collective at level != 0 " + "for collective %s component %s. 
These values will be ignored.\n", + fname, fileline, coll_name, target_comp_name); + } + while( 0 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, ']') ) { + if( getnext_long(fptr, &seglength) ) { + opal_output_verbose(5, mca_coll_han_component.han_output, + "coll:han:mca_coll_han_init_dynamic_rules " + "file %s line %d found end of file while reading the optional list " + "of segment lengths for collective %s component %s\n", + fname, fileline, coll_name, target_comp_name); + free(target_comp_name); + goto file_reading_error; + } + } + } + free(target_comp_name); } } } + if( NULL != coll_name ) { + free(coll_name); + coll_name = NULL; + } } - if(MYEOF != getnext(fptr)) { + if( getnext_long(fptr, &nb_coll) > 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "Warning on file %s at line %d: " - "rule reading is over but reader does not seem " - "to have reached the end of the file\n", - fname, - fileline); + "coll:han:mca_coll_han_init_dynamic_rules. Warning on file %s at line %d: " + "rule reading is over but reader does not seem to have reached the end of the file\n", + fname, fileline); } opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:mca_coll_han_init_dynamic_rules " - "read %d rules from %s\n", - nb_entries, - fname); + "coll:han:mca_coll_han_init_dynamic_rules read %d rules from %s\n", + nb_entries, fname); if(mca_coll_han_component.dump_dynamic_rules) { mca_coll_han_dump_dynamic_rules(); @@ -447,6 +400,9 @@ mca_coll_han_init_dynamic_rules(void) return OMPI_ERROR; file_reading_error: + if( NULL != coll_name ) { + free(coll_name); + } opal_output_verbose(0, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules " "could not fully read dynamic rules file. 
" @@ -531,7 +487,8 @@ static void check_dynamic_rules(void) configuration_rule_t *conf_rules; /* Message size informations */ - int nb_msg_size, msg_size; + int nb_msg_size; + size_t msg_size; msg_size_rule_t *msg_size_rules; /* Component informations */ @@ -540,73 +497,49 @@ static void check_dynamic_rules(void) nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; - for(i=0 ; i=1 && conf_rules[k-1].configuration_size > conf_size) { + if( k >= 1 && conf_rules[k-1].configuration_size > conf_size) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:check_dynamic_rules " - "Han found an issue on dynamic rules " - "for collective %d " - "on topological level %d: " - "configuration sizes %d and %d are " - "not sorted by increasing value\n", - coll_id, - topo_lvl, - conf_rules[k-1].configuration_size, - conf_size); + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d: " + "configuration sizes %d and %d are not sorted by increasing value\n", + coll_id, topo_lvl, conf_rules[k-1].configuration_size, conf_size); } - for(l=0 ; l=1 && msg_size_rules[l-1].msg_size > msg_size) { + if( l >= 1 && msg_size_rules[l-1].msg_size > msg_size) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:check_dynamic_rules " - "Han found an issue on dynamic rules " - "for collective %d " - "on topological level %d " - "with configuration size %d: " - "message sizes %d and %d are " + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d with configuration size %d: " + "message sizes %" PRIsize_t " and %" PRIsize_t " are " "not sorted by increasing value\n", - coll_id, - topo_lvl, - conf_size, - msg_size_rules[l-1].msg_size, - msg_size); + coll_id, topo_lvl, conf_size, msg_size_rules[l-1].msg_size, msg_size); } - if(HAN == component - && GLOBAL_COMMUNICATOR != 
topo_lvl) { + if( (HAN == component) && (GLOBAL_COMMUNICATOR != topo_lvl) ) { opal_output_verbose(5, mca_coll_han_component.han_output, - "coll:han:check_dynamic_rules " - "Han found an issue on dynamic rules " - "for collective %d " - "on topological level %d " - "with configuration size %d " - "for message size %d: " - "han collective component %d " - "can only be activated for " - "topology level %d\n", - coll_id, - topo_lvl, - conf_size, - msg_size, - HAN, - GLOBAL_COMMUNICATOR); + "coll:han:check_dynamic_rules HAN found an issue on dynamic rules " + "for collective %d on topological level %d with configuration size %d " + "for message size %" PRIsize_t ": han collective component %d " + "can only be activated for topology level %d\n", + coll_id, topo_lvl, conf_size, msg_size, HAN, GLOBAL_COMMUNICATOR); } } } @@ -618,9 +551,6 @@ void mca_coll_han_dump_dynamic_rules(void) { int nb_entries = 0; - /* Loop counters */ - int i, j, k, l; - /* Collective informations */ int nb_coll; COLLTYPE_T coll_id; @@ -645,42 +575,32 @@ void mca_coll_han_dump_dynamic_rules(void) nb_coll = mca_coll_han_component.dynamic_rules.nb_collectives; coll_rules = mca_coll_han_component.dynamic_rules.collective_rules; - for(i=0 ; i collective component %d (%s)\n", - nb_entries, - coll_id, - mca_coll_han_colltype_to_str(coll_id), - topo_lvl, - mca_coll_han_topo_lvl_to_str(topo_lvl), - conf_size, - msg_size, - component, - components_name[component]); + "coll:han:dump_dynamic_rules %d collective %d (%s) " + "topology level %d (%s) configuration size %d " + "mesage size %d -> collective component %d (%s)\n", + nb_entries, coll_id, mca_coll_base_colltype_to_str(coll_id), + topo_lvl, mca_coll_han_topo_lvl_to_str(topo_lvl), conf_size, + msg_size, component, available_components[component].component_name); nb_entries++; } diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.h b/ompi/mca/coll/han/coll_han_dynamic_file.h index 846b9b74cc7..b61ba0c5d8d 100644 --- 
a/ompi/mca/coll/han/coll_han_dynamic_file.h +++ b/ompi/mca/coll/han/coll_han_dynamic_file.h @@ -1,5 +1,8 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* + * Copyright (c) 2020 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved. * * $COPYRIGHT$ @@ -60,10 +63,9 @@ * communicator and the corresponding level for sub-communicators * created by han. * - Configuration size: - * The configuration size is the number of elements of the actual - * topology level in the upper topology level. For example, if - * topology levels are intra-node and inter-node, it can be the - * number of MPI ranks per node or the number of nodes in the global + * The configuration size is the number of elements in a topology level. + * For example, if topology levels are intra-node and inter-node, it can + * be the number of MPI ranks per node or the number of nodes in the global * communicator. For the GLOBAL_COMMUNICATOR topologic level, * the configuration size is the communicator size. * - Message_size Component: @@ -101,11 +103,8 @@ * the reader. 
*/ -BEGIN_C_DECLS - int mca_coll_han_init_dynamic_rules(void); void mca_coll_han_free_dynamic_rules(void); void mca_coll_han_dump_dynamic_rules(void); -END_C_DECLS #endif diff --git a/ompi/mca/coll/han/coll_han_gather.c b/ompi/mca/coll/han/coll_han_gather.c index 2cbd6d976ce..14b87bde926 100644 --- a/ompi/mca/coll/han/coll_han_gather.c +++ b/ompi/mca/coll/han/coll_han_gather.c @@ -16,40 +16,45 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" +static int mca_coll_han_gather_lg_task(void *task_args); +static int mca_coll_han_gather_ug_task(void *task_args); + /* only work with regular situation (each node has equal number of processes) */ -void mac_coll_han_set_gather_argu(mca_gather_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req) +static inline void +mca_coll_han_set_gather_args(mca_coll_han_gather_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, bool is_mapbycore, ompi_request_t * req) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->sbuf_inter_free = sbuf_inter_free; - argu->scount = scount; - argu->sdtype = sdtype; - argu->rbuf = rbuf; - argu->rcount = rcount; - argu->rdtype = rdtype; - argu->root = root; - argu->root_up_rank = root_up_rank; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->w_rank = w_rank; - argu->noop = noop; - argu->req = req; + args->cur_task = 
cur_task; + args->sbuf = sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root = root; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->is_mapbycore = is_mapbycore; + args->req = req; } int @@ -61,50 +66,56 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - int i; + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; int w_rank, w_size; /* information about the global communicator */ int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ char *reorder_buf = NULL, *reorder_rbuf = NULL; - ptrdiff_t rsize, rgap = 0, rextent; - int *vranks, low_rank, low_size; - int * topo; - + int i, err, *vranks, low_rank, low_size, *topo; ompi_request_t *temp_request = NULL; - w_rank = ompi_comm_rank(comm); - w_size = ompi_comm_size(comm); /* Create the subcommunicators */ - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; + err = mca_coll_han_comm_create(comm, han_module); + if( OMPI_SUCCESS != err ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. 
Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); + } + /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ topo = mca_coll_han_topo_init(comm, han_module, 2); - - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle gather with this communicator. It need to fall back on another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, root, - comm, han_module->previous_gather_module); + "han cannot handle gather with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); } + w_rank = ompi_comm_rank(comm); + w_size = ompi_comm_size(comm); + /* Set up request */ temp_request = OBJ_NEW(ompi_request_t); - OMPI_REQUEST_INIT(temp_request, false); temp_request->req_state = OMPI_REQUEST_ACTIVE; - temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = han_request_free; - temp_request->req_status.MPI_SOURCE = 0; - temp_request->req_status.MPI_TAG = 0; - temp_request->req_status.MPI_ERROR = 0; - temp_request->req_status._cancelled = 0; - temp_request->req_status._ucount = 0; + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; /* create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; + han_module->cached_low_comms[mca_coll_han_component.han_gather_low_module]; ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; + han_module->cached_up_comms[mca_coll_han_component.han_gather_up_module]; /* Get the 'virtual ranks' mapping correspondong to the communicators */ vranks = han_module->cached_vranks; @@ -115,10 +126,9 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", - w_rank, root, root_low_rank, root_up_rank)); + "[%d]: Han Gather root %d root_low_rank %d root_up_rank %d\n", + w_rank, root, root_low_rank, root_up_rank)); - ompi_datatype_type_extent(rdtype, &rextent); /* Allocate reorder buffers */ if (w_rank == root) { @@ -127,17 +137,30 @@ mca_coll_han_gather_intra(const 
void *sbuf, int scount, * in a increasing order for both patterns */ if (han_module->is_mapbycore) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Han Gather is_bycore: ", w_rank)); + "[%d]: Han Gather is_bycore: ", w_rank)); reorder_rbuf = (char *)rbuf; } else { /* Need a buffer to store unordered final result */ + ptrdiff_t rsize, rgap; rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * w_size, &rgap); reorder_buf = (char *)malloc(rsize); //TODO:free /* rgap is the size of unused space at the start of the datatype */ reorder_rbuf = reorder_buf - rgap; + + if (MPI_IN_PLACE == sbuf) { + ptrdiff_t rextent; + ompi_datatype_type_extent(rdtype, &rextent); + ptrdiff_t block_size = rextent * (ptrdiff_t)rcount; + ptrdiff_t src_shift = block_size * w_rank; + ptrdiff_t dest_shift = block_size * w_rank; + ompi_datatype_copy_content_same_ddt(rdtype, + (ptrdiff_t)rcount, + (char *)rbuf + dest_shift, + reorder_rbuf + src_shift); + } } } @@ -145,12 +168,12 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, /* Create lg task */ mca_coll_task_t *lg = OBJ_NEW(mca_coll_task_t); /* Setup lg task arguments */ - mca_gather_argu_t *lg_argu = malloc(sizeof(mca_gather_argu_t)); - mac_coll_han_set_gather_argu(lg_argu, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf, + mca_coll_han_gather_args_t *lg_args = malloc(sizeof(mca_coll_han_gather_args_t)); + mca_coll_han_set_gather_args(lg_args, lg, (char *) sbuf, NULL, scount, sdtype, reorder_rbuf, rcount, rdtype, root, root_up_rank, root_low_rank, up_comm, - low_comm, w_rank, low_rank != root_low_rank, temp_request); + low_comm, w_rank, low_rank != root_low_rank, han_module->is_mapbycore, temp_request); /* Init lg task */ - init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_argu)); + init_task(lg, mca_coll_han_gather_lg_task, (void *) (lg_args)); /* Issure lg task */ issue_task(lg); @@ -166,19 +189,21 @@ mca_coll_han_gather_intra(const void *sbuf, int scount, */ /* reorder rbuf based on rank */ 
if (w_rank == root && !han_module->is_mapbycore) { + ptrdiff_t rextent; + ompi_datatype_type_extent(rdtype, &rextent); for (i=0; iw_rank)); - OBJ_RELEASE(t->cur_task); + ompi_datatype_t *dtype; + size_t count; + if (t->w_rank == t->root) { + dtype = t->rdtype; + count = t->rcount; + } else { + dtype = t->sdtype; + count = t->scount; + } /* If the process is one of the node leader */ char *tmp_buf = NULL; @@ -201,33 +234,45 @@ int mca_coll_han_gather_lg_task(void *task_argu) /* if the process is one of the node leader, allocate the intermediary * buffer to gather on the low sub communicator */ int low_size = ompi_comm_size(t->low_comm); + int low_rank = ompi_comm_rank(t->low_comm); ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&t->rdtype->super, - (int64_t)t->rcount * low_size, - &rgap); + rsize = opal_datatype_span(&dtype->super, + count * low_size, + &rgap); tmp_buf = (char *) malloc(rsize); tmp_rbuf = tmp_buf - rgap; + if (t->w_rank == t->root) { + if (MPI_IN_PLACE == t->sbuf) { + ptrdiff_t rextent; + ompi_datatype_type_extent(dtype, &rextent); + ptrdiff_t block_size = rextent * (ptrdiff_t)count; + ptrdiff_t src_shift = block_size * t->w_rank; + ptrdiff_t dest_shift = block_size * low_rank; + ompi_datatype_copy_content_same_ddt(dtype, + (ptrdiff_t)count, + tmp_rbuf + dest_shift, + (char *)t->rbuf + src_shift); + } + } } - /* shared memory node gather */ + /* Low level (usually intra-node or shared memory) node gather */ t->low_comm->c_coll->coll_gather((char *)t->sbuf, - t->scount, - t->sdtype, - tmp_rbuf, - t->rcount, - t->rdtype, - t->root_low_rank, - t->low_comm, - t->low_comm->c_coll->coll_gather_module); + count, + dtype, + tmp_rbuf, + count, + dtype, + t->root_low_rank, + t->low_comm, + t->low_comm->c_coll->coll_gather_module); /* Prepare up comm gather */ t->sbuf = tmp_rbuf; t->sbuf_inter_free = tmp_buf; /* Create ug (upper level all-gather) task */ - mca_coll_task_t *ug = OBJ_NEW(mca_coll_task_t); - /* Setup ug task arguments */ - t->cur_task = 
ug; + mca_coll_task_t *ug = t->cur_task; /* Init ug task */ init_task(ug, mca_coll_han_gather_ug_task, (void *) t); /* Issure ug task */ @@ -237,26 +282,37 @@ int mca_coll_han_gather_lg_task(void *task_argu) } /* ug: upper level (intra-node) gather task */ -int mca_coll_han_gather_ug_task(void *task_argu) +int mca_coll_han_gather_ug_task(void *task_args) { - mca_gather_argu_t *t = (mca_gather_argu_t *) task_argu; + mca_coll_han_gather_args_t *t = (mca_coll_han_gather_args_t *) task_args; OBJ_RELEASE(t->cur_task); if (t->noop) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Gather: ug noop\n", t->w_rank)); } else { + ompi_datatype_t *dtype; + size_t count; + if (t->w_rank == t->root) { + dtype = t->rdtype; + count = t->rcount; + } else { + dtype = t->sdtype; + count = t->scount; + } + + int low_size = ompi_comm_size(t->low_comm); /* inter node gather */ t->up_comm->c_coll->coll_gather((char *)t->sbuf, - t->scount*low_size, - t->sdtype, - (char *)t->rbuf, - t->rcount*low_size, - t->rdtype, - t->root_up_rank, - t->up_comm, - t->up_comm->c_coll->coll_gather_module); + count*low_size, + dtype, + (char *)t->rbuf, + count*low_size, + dtype, + t->root_up_rank, + t->up_comm, + t->up_comm->c_coll->coll_gather_module); if (t->sbuf_inter_free != NULL) { free(t->sbuf_inter_free); @@ -274,36 +330,56 @@ int mca_coll_han_gather_ug_task(void *task_argu) /* only work with regular situation (each node has equal number of processes) */ int mca_coll_han_gather_intra_simple(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { - int w_rank = ompi_comm_rank(comm); + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + int 
*topo, w_rank = ompi_comm_rank(comm); int w_size = ompi_comm_size(comm); - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle gather with this communicator. Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); + } + /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ - int *topo = mca_coll_han_topo_init(comm, han_module, 2); - - /* Here root needs to reach all nodes on up_comm. - * But in case of unbalance some up_comms are smaller, - * as the comm_split is made on the base of low_rank */ + topo = mca_coll_han_topo_init(comm, han_module, 2); if (han_module->are_ppn_imbalanced){ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle gather with this communicator. It need to fall back on another component\n")); - return han_module->previous_gather(sbuf, scount, sdtype, rbuf, - rcount, rdtype, root, - comm, han_module->previous_gather_module); + "han cannot handle gather with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, gather); + return comm->c_coll->coll_gather(sbuf, scount, sdtype, rbuf, + rcount, rdtype, root, + comm, comm->c_coll->coll_gather_module); } - /* create the subcommunicators */ - mca_coll_han_comm_create_new(comm, han_module); ompi_communicator_t *low_comm = han_module->sub_comm[INTRA_NODE]; ompi_communicator_t *up_comm = han_module->sub_comm[INTER_NODE]; + ompi_datatype_t *dtype; + size_t count; + + if (w_rank == root) { + dtype = rdtype; + count = rcount; + } else { + dtype = sdtype; + count = scount; + } + /* Get the 'virtual ranks' mapping corresponding to the communicators */ int *vranks = han_module->cached_vranks; @@ -325,11 +401,11 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, reorder_buf_start = (char *)rbuf; } else { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d]: Future Gather needs reordering: ", w_rank)); + "[%d]: Future Gather needs reordering: ", w_rank)); ptrdiff_t rgap = 0; ptrdiff_t rsize = opal_datatype_span(&rdtype->super, - (int64_t)rcount * w_size, - &rgap); + (int64_t)rcount * w_size, + &rgap); reorder_buf = (char *)malloc(rsize); /* rgap is the size of unused space at the start of the datatype */ reorder_buf_start = reorder_buf - rgap; @@ -338,40 +414,40 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, } /* allocate the intermediary buffer - * * to gather on leaders on the low sub communicator */ + * to gather on leaders on the low sub communicator */ char *tmp_buf = NULL; // allocated memory char *tmp_buf_start = NULL; // start of the data if (low_rank == root_low_rank) { - ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&rdtype->super, - (int64_t)rcount * low_size, - &rgap); + ptrdiff_t rsize, rgap = 0; + rsize = opal_datatype_span(&dtype->super, + count * low_size, + &rgap); tmp_buf = (char *) malloc(rsize); tmp_buf_start = tmp_buf - rgap; } /* 1. 
low gather on nodes leaders */ low_comm->c_coll->coll_gather((char *)sbuf, - scount, - sdtype, - tmp_buf_start, - rcount, - rdtype, - root_low_rank, - low_comm, - low_comm->c_coll->coll_gather_module); + count, + dtype, + tmp_buf_start, + count, + dtype, + root_low_rank, + low_comm, + low_comm->c_coll->coll_gather_module); /* 2. upper gather (inter-node) between node leaders */ if (low_rank == root_low_rank) { up_comm->c_coll->coll_gather((char *)tmp_buf_start, - scount*low_size, - sdtype, - (char *)reorder_buf_start, - rcount*low_size, - rdtype, - root_up_rank, - up_comm, - up_comm->c_coll->coll_gather_module); + count*low_size, + dtype, + (char *)reorder_buf_start, + count*low_size, + dtype, + root_up_rank, + up_comm, + up_comm->c_coll->coll_gather_module); if (tmp_buf != NULL) { free(tmp_buf); @@ -379,7 +455,7 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, tmp_buf_start = NULL; } OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] Future Gather: ug gather finish\n", t->w_rank)); + "[%d] Future Gather: ug gather finish\n", w_rank)); } /* 3. 
reorder data on root into rbuf @@ -388,8 +464,8 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, */ if (w_rank == root && !han_module->is_mapbycore) { ompi_coll_han_reorder_gather(reorder_buf_start, - rbuf, rcount, rdtype, - comm, topo); + rbuf, rcount, rdtype, + comm, topo); free(reorder_buf); } @@ -408,28 +484,28 @@ mca_coll_han_gather_intra_simple(const void *sbuf, int scount, */ void ompi_coll_han_reorder_gather(const void *sbuf, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - struct ompi_communicator_t *comm, - int * topo) { - int i; - int topolevel = 2; // always 2 levels in topo - int w_rank = ompi_comm_rank(comm); - int w_size = ompi_comm_size(comm); - ptrdiff_t rextent; - ompi_datatype_type_extent(rdtype, &rextent); - for (i=0; ifallback.NAME.NAME = NULL; \ + (HANDLE)->fallback.NAME.module = NULL; \ + } while (0) + /* * Module constructor */ static void han_module_clear(mca_coll_han_module_t *han_module) { - int i; + CLEAN_PREV_COLL(han_module, allgather); + CLEAN_PREV_COLL(han_module, allgatherv); + CLEAN_PREV_COLL(han_module, allreduce); + CLEAN_PREV_COLL(han_module, bcast); + CLEAN_PREV_COLL(han_module, reduce); + CLEAN_PREV_COLL(han_module, gather); + CLEAN_PREV_COLL(han_module, scatter); - for (i = 0; i < COLLCOUNT; i++) { - /* - * Since the previous routines function pointers are declared as - * a union, initializing the dummy routineis enough - */ - han_module->previous_routines[i].previous_routine.dummy = NULL; - han_module->previous_routines[i].previous_module = NULL; - } han_module->reproducible_reduce = NULL; han_module->reproducible_reduce_module = NULL; han_module->reproducible_allreduce = NULL; @@ -50,19 +54,18 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module) { int i; - module->enabled = false; + module->enabled = true; module->super.coll_module_disable = mca_coll_han_module_disable; - module->cached_comm = NULL; module->cached_low_comms = NULL; module->cached_up_comms = NULL; 
module->cached_vranks = NULL; module->cached_topo = NULL; module->is_mapbycore = false; module->storage_initialized = false; - for (i = 0 ; i < NB_TOPO_LVL ; i++) { + for( i = 0; i < NB_TOPO_LVL; i++ ) { module->sub_comm[i] = NULL; } - for (i=SELF ; imodules_storage.modules[i].module_handler = NULL; } @@ -72,16 +75,18 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module) } -#define OBJ_RELEASE_IF_NOT_NULL(obj) do { \ - if (NULL != (obj)) { \ - OBJ_RELEASE(obj); \ - } \ -} while (0) +#define OBJ_RELEASE_IF_NOT_NULL(obj) \ + do { \ + if (NULL != (obj)) { \ + OBJ_RELEASE(obj); \ + } \ + } while (0) /* * Module destructor */ -static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) +static void +mca_coll_han_module_destruct(mca_coll_han_module_t * module) { int i; @@ -126,7 +131,6 @@ static void mca_coll_han_module_destruct(mca_coll_han_module_t * module) han_module_clear(module); } - OBJ_CLASS_INSTANCE(mca_coll_han_module_t, mca_coll_base_module_t, mca_coll_han_module_construct, @@ -155,6 +159,8 @@ int mca_coll_han_init_query(bool enable_progress_threads, mca_coll_base_module_t * mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) { + int flag; + char info_val[OPAL_MAX_INFO_VAL+1]; mca_coll_han_module_t *han_module; /* @@ -172,7 +178,13 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) comm->c_contextid, comm->c_name); return NULL; } - + if( !ompi_group_have_remote_peers(comm->c_local_group) ) { + /* The group only contains local processes. Disable HAN for now */ + opal_output_verbose(10, ompi_coll_base_framework.framework_output, + "coll:han:comm_query (%d/%s): comm has only local processes; disqualifying myself", + comm->c_contextid, comm->c_name); + return NULL; + } /* Get the priority level attached to this module. If priority is less * than or equal to 0, then the module is unavailable. 
*/ *priority = mca_coll_han_component.han_priority; @@ -189,52 +201,46 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) } /* All is good -- return a module */ - han_module->topologic_level = mca_coll_han_component.topo_level; + han_module->topologic_level = GLOBAL_COMMUNICATOR; + + if (NULL != comm->super.s_info) { + /* Get the info value disaqualifying coll components */ + opal_info_get(comm->super.s_info, "ompi_comm_coll_han_topo_level", + sizeof(info_val), info_val, &flag); + + if (flag) { + if (0 == strcmp(info_val, "INTER_NODE")) { + han_module->topologic_level = INTER_NODE; + } else { + han_module->topologic_level = INTRA_NODE; + } + } + } + + han_module->super.coll_module_enable = han_module_enable; + han_module->super.ft_event = NULL; + han_module->super.coll_alltoall = NULL; + han_module->super.coll_alltoallv = NULL; + han_module->super.coll_alltoallw = NULL; + han_module->super.coll_barrier = NULL; + han_module->super.coll_exscan = NULL; + han_module->super.coll_gatherv = NULL; + han_module->super.coll_reduce_scatter = NULL; + han_module->super.coll_scan = NULL; + han_module->super.coll_scatterv = NULL; + han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; + han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; + han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; + han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; + han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; + han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; - /* - * TODO: When the selector is fully implemented, - * this if will be meaningless - */ if (GLOBAL_COMMUNICATOR == han_module->topologic_level) { /* We are on the global communicator, return topological algorithms */ - han_module->super.coll_module_enable = han_module_enable; - han_module->super.ft_event = NULL; - han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; 
han_module->super.coll_allgatherv = NULL; - han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; - han_module->super.coll_alltoall = NULL; - han_module->super.coll_alltoallv = NULL; - han_module->super.coll_alltoallw = NULL; - han_module->super.coll_barrier = NULL; - han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; - han_module->super.coll_exscan = NULL; - han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; - han_module->super.coll_gatherv = NULL; - han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; - han_module->super.coll_reduce_scatter = NULL; - han_module->super.coll_scan = NULL; - han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; - han_module->super.coll_scatterv = NULL; } else { /* We are on a topologic sub-communicator, return only the selector */ - han_module->super.coll_module_enable = han_module_enable; - han_module->super.ft_event = NULL; - han_module->super.coll_allgather = mca_coll_han_allgather_intra_dynamic; han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic; - han_module->super.coll_allreduce = mca_coll_han_allreduce_intra_dynamic; - han_module->super.coll_alltoall = NULL; - han_module->super.coll_alltoallv = NULL; - han_module->super.coll_alltoallw = NULL; - han_module->super.coll_barrier = NULL; - han_module->super.coll_bcast = mca_coll_han_bcast_intra_dynamic; - han_module->super.coll_exscan = NULL; - han_module->super.coll_gather = mca_coll_han_gather_intra_dynamic; - han_module->super.coll_gatherv = NULL; - han_module->super.coll_reduce = mca_coll_han_reduce_intra_dynamic; - han_module->super.coll_reduce_scatter = NULL; - han_module->super.coll_scan = NULL; - han_module->super.coll_scatter = mca_coll_han_scatter_intra_dynamic; - han_module->super.coll_scatterv = NULL; } opal_output_verbose(10, ompi_coll_base_framework.framework_output, @@ -247,28 +253,28 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) /* * In 
this macro, the following variables are supposed to have been declared * in the caller: - * . ompi_communicator_t *comm + * . ompi_communicator_t *comm * . mca_coll_han_module_t *han_module - */ -#define HAN_SAVE_PREV_COLL_API(__api) do { \ - han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ - han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module;\ - if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ - opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ - "(%d/%s): no underlying " # __api"; disqualifying myself", \ - comm->c_contextid, comm->c_name); \ - return OMPI_ERROR; \ - } \ - /* TODO add a OBJ_RELEASE at module disabling */ \ - /* + FIXME find why releasing generates memory corruption */ \ - OBJ_RETAIN(han_module->previous_ ## __api ## _module); \ + */ +#define HAN_SAVE_PREV_COLL_API(__api) \ + do { \ + if (!comm->c_coll->coll_ ## __api || !comm->c_coll->coll_ ## __api ## _module) { \ + opal_output_verbose(1, ompi_coll_base_framework.framework_output, \ + "(%d/%s): no underlying " # __api"; disqualifying myself", \ + comm->c_contextid, comm->c_name); \ + goto handle_error; \ + } \ + han_module->previous_ ## __api = comm->c_coll->coll_ ## __api; \ + han_module->previous_ ## __api ## _module = comm->c_coll->coll_ ## __api ## _module; \ + OBJ_RETAIN(han_module->previous_ ## __api ## _module); \ } while(0) /* * Init module on the communicator */ -static int han_module_enable(mca_coll_base_module_t * module, - struct ompi_communicator_t *comm) +static int +han_module_enable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) { mca_coll_han_module_t * han_module = (mca_coll_han_module_t*) module; @@ -285,13 +291,25 @@ static int han_module_enable(mca_coll_base_module_t * module, mca_coll_han_allreduce_reproducible_decision(comm, module); return OMPI_SUCCESS; + +handle_error: + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgather_module); + 
OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allgatherv_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_allreduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_bcast_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_gather_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_reduce_module); + OBJ_RELEASE_IF_NOT_NULL(han_module->previous_scatter_module); + + return OMPI_ERROR; } /* * Module disable */ -static int mca_coll_han_module_disable(mca_coll_base_module_t * module, - struct ompi_communicator_t *comm) +static int +mca_coll_han_module_disable(mca_coll_base_module_t * module, + struct ompi_communicator_t *comm) { mca_coll_han_module_t * han_module = (mca_coll_han_module_t *) module; diff --git a/ompi/mca/coll/han/coll_han_reduce.c b/ompi/mca/coll/han/coll_han_reduce.c index d0dc337ce8b..03968b6f475 100644 --- a/ompi/mca/coll/han/coll_han_reduce.c +++ b/ompi/mca/coll/han/coll_han_reduce.c @@ -15,33 +15,38 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" -void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, - int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, - int root_up_rank, int root_low_rank, - struct ompi_communicator_t *up_comm, - struct ompi_communicator_t *low_comm, - int num_segments, int cur_seg, int w_rank, int last_seg_count, - bool noop) +static int mca_coll_han_reduce_t0_task(void *task_args); +static int mca_coll_han_reduce_t1_task(void *task_args); + +static inline void +mca_coll_han_set_reduce_args(mca_coll_han_reduce_args_t * args, mca_coll_task_t * cur_task, void *sbuf, void *rbuf, + int seg_count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, + int root_up_rank, int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int num_segments, int cur_seg, int w_rank, int last_seg_count, + bool noop, bool is_tmp_rbuf) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->rbuf = 
rbuf; - argu->seg_count = seg_count; - argu->dtype = dtype; - argu->op = op; - argu->root_low_rank = root_low_rank; - argu->root_up_rank = root_up_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->num_segments = num_segments; - argu->cur_seg = cur_seg; - argu->w_rank = w_rank; - argu->last_seg_count = last_seg_count; - argu->noop = noop; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->rbuf = rbuf; + args->seg_count = seg_count; + args->dtype = dtype; + args->op = op; + args->root_low_rank = root_low_rank; + args->root_up_rank = root_up_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->num_segments = num_segments; + args->cur_seg = cur_seg; + args->w_rank = w_rank; + args->last_seg_count = last_seg_count; + args->noop = noop; + args->is_tmp_rbuf = is_tmp_rbuf; } -/* - * Each segment of the messsage needs to go though 2 steps to perform MPI_Reduce: +/* + * Each segment of the messsage needs to go though 2 steps to perform MPI_Reduce: * lb: low level (shared-memory or intra-node) reduce. * ub: upper level (inter-node) reduce * Hence, in each iteration, there is a combination of collective operations which is called a task. 
@@ -53,49 +58,62 @@ void mac_coll_han_set_reduce_argu(mca_reduce_argu_t * argu, mca_coll_task_t * cu * iter 4 | | | | ur | task: t1, contains ur */ int -mca_coll_han_reduce_intra(const void *sbuf, +mca_coll_han_reduce_intra(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, ompi_op_t* op, int root, - struct ompi_communicator_t *comm, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { + mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; ptrdiff_t extent, lb; - ompi_datatype_get_extent(dtype, &lb, &extent); - int w_rank; - w_rank = ompi_comm_rank(comm); - int seg_count = count; - size_t typelng; - ompi_datatype_type_size(dtype, &typelng); + int seg_count = count, w_rank; + size_t dtype_size; - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - /* Do not initialize topology if the operation cannot commute */ - if(!ompi_op_is_commute(op)){ + /* No support for non-commutative operations */ + if(!ompi_op_is_commute(op)) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this operation. It needs to fall back on another component\n")); + "han cannot handle reduce with this operation. Fall back on another component\n")); goto prev_reduce_intra; } + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. 
Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all modules */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); + } + /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this communicator. It needs to fall back on another component\n")); - goto prev_reduce_intra; + "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); } - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); + ompi_datatype_get_extent(dtype, &lb, &extent); + w_rank = ompi_comm_rank(comm); + ompi_datatype_type_size(dtype, &dtype_size); + ompi_communicator_t *low_comm; ompi_communicator_t *up_comm; /* use MCA parameters for now */ low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; up_comm = han_module->cached_up_comms[mca_coll_han_component.han_reduce_up_module]; - COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, typelng, + COLL_BASE_COMPUTED_SEGCOUNT(mca_coll_han_component.han_reduce_segsize, dtype_size, seg_count); int num_segments = (count + seg_count - 1) / seg_count; @@ -106,6 +124,7 @@ mca_coll_han_reduce_intra(const void *sbuf, int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); + int up_rank = ompi_comm_rank(up_comm); int root_low_rank; int root_up_rank; @@ -114,14 +133,22 @@ mca_coll_han_reduce_intra(const void *sbuf, "[%d]: root_low_rank %d root_up_rank %d\n", w_rank, root_low_rank, root_up_rank)); + void *tmp_rbuf = rbuf; + void *tmp_rbuf_to_free = NULL; + if (low_rank == root_low_rank && root_up_rank != up_rank) { + /* allocate 2 segments on node leaders that are not the global root */ + tmp_rbuf = malloc(2*extent*seg_count); + tmp_rbuf_to_free = tmp_rbuf; + } + /* Create t0 tasks for the first segment */ mca_coll_task_t *t0 = OBJ_NEW(mca_coll_task_t); /* Setup up t0 task arguments */ - mca_reduce_argu_t *t = malloc(sizeof(mca_reduce_argu_t)); - mac_coll_han_set_reduce_argu(t, t0, (char *) sbuf, (char *) rbuf, seg_count, dtype, + mca_coll_han_reduce_args_t *t = malloc(sizeof(mca_coll_han_reduce_args_t)); + mca_coll_han_set_reduce_args(t, t0, (char *) sbuf, (char *) tmp_rbuf, 
seg_count, dtype, op, root_up_rank, root_low_rank, up_comm, low_comm, num_segments, 0, w_rank, count - (num_segments - 1) * seg_count, - low_rank != root_low_rank); + low_rank != root_low_rank, (NULL != tmp_rbuf_to_free)); /* Init the first task */ init_task(t0, mca_coll_han_reduce_t0_task, (void *) t); issue_task(t0); @@ -140,7 +167,9 @@ mca_coll_han_reduce_intra(const void *sbuf, /* Setup up t1 task arguments */ t->cur_task = t1; t->sbuf = (char *) t->sbuf + extent * t->seg_count; - t->rbuf = (char *) t->rbuf + extent * t->seg_count; + if (up_rank == root_up_rank) { + t->rbuf = (char *) t->rbuf + extent * t->seg_count; + } t->cur_seg = t->cur_seg + 1; /* Init the t1 task */ init_task(t1, mca_coll_han_reduce_t1_task, (void *) t); @@ -148,19 +177,20 @@ mca_coll_han_reduce_intra(const void *sbuf, } free(t); + free(tmp_rbuf_to_free); return OMPI_SUCCESS; -prev_reduce_intra: + prev_reduce_intra: return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, comm, han_module->previous_reduce_module); } /* t0 task: issue and wait for the low level reduce of segment 0 */ -int mca_coll_han_reduce_t0_task(void *task_argu) +int mca_coll_han_reduce_t0_task(void *task_args) { - mca_reduce_argu_t *t = (mca_reduce_argu_t *) task_argu; + mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t0 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); @@ -173,41 +203,55 @@ int mca_coll_han_reduce_t0_task(void *task_argu) } /* t1 task */ -int mca_coll_han_reduce_t1_task(void *task_argu) { - mca_reduce_argu_t *t = (mca_reduce_argu_t *) task_argu; +int mca_coll_han_reduce_t1_task(void *task_args) { + mca_coll_han_reduce_args_t *t = (mca_coll_han_reduce_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: in t1 %d\n", t->w_rank, t->cur_seg)); OBJ_RELEASE(t->cur_task); ptrdiff_t extent, lb; + int cur_seg = t->cur_seg; ompi_datatype_get_extent(t->dtype, 
&lb, &extent); ompi_request_t *ireduce_req = NULL; - int tmp_count = t->seg_count; if (!t->noop) { + int tmp_count = t->seg_count; + if (cur_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) { + tmp_count = t->last_seg_count; + } int up_rank = ompi_comm_rank(t->up_comm); /* ur of cur_seg */ if (up_rank == t->root_up_rank) { - t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, t->seg_count, t->dtype, + t->up_comm->c_coll->coll_ireduce(MPI_IN_PLACE, (char *) t->rbuf, tmp_count, t->dtype, t->op, t->root_up_rank, t->up_comm, &ireduce_req, t->up_comm->c_coll->coll_ireduce_module); } else { - t->up_comm->c_coll->coll_ireduce((char *) t->rbuf, (char *) t->rbuf, t->seg_count, + /* this is a node leader that is not root so alternate between the two allocated segments */ + char *tmp_sbuf = (char*)t->rbuf + (cur_seg % 2)*(extent * t->seg_count); + t->up_comm->c_coll->coll_ireduce(tmp_sbuf, NULL, tmp_count, t->dtype, t->op, t->root_up_rank, t->up_comm, &ireduce_req, t->up_comm->c_coll->coll_ireduce_module); } } /* lr of cur_seg+1 */ - if (t->cur_seg <= t->num_segments - 2) { - if (t->cur_seg == t->num_segments - 2 && t->last_seg_count != t->seg_count) { + int next_seg = cur_seg + 1; + if (next_seg <= t->num_segments - 1) { + int tmp_count = t->seg_count; + char *tmp_rbuf = NULL; + if (next_seg == t->num_segments - 1 && t->last_seg_count != t->seg_count) { tmp_count = t->last_seg_count; } + if (t->is_tmp_rbuf) { + tmp_rbuf = (char*)t->rbuf + (next_seg % 2)*(extent * t->seg_count); + } else if (NULL != t->rbuf) { + tmp_rbuf = (char*)t->rbuf + extent * t->seg_count; + } t->low_comm->c_coll->coll_reduce((char *) t->sbuf + extent * t->seg_count, - (char *) t->rbuf + extent * t->seg_count, tmp_count, + (char *) tmp_rbuf, tmp_count, t->dtype, t->op, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_reduce_module); } if (!t->noop && ireduce_req) { - ompi_request_wait(&ireduce_req, MPI_STATUSES_IGNORE); + ompi_request_wait(&ireduce_req, 
MPI_STATUS_IGNORE); } return OMPI_SUCCESS; @@ -217,13 +261,13 @@ int mca_coll_han_reduce_t1_task(void *task_argu) { * a fallback is made on the next component that provides a reduce in priority order */ int mca_coll_han_reduce_intra_simple(const void *sbuf, - void* rbuf, - int count, - struct ompi_datatype_t *dtype, - ompi_op_t *op, - int root, - struct ompi_communicator_t *comm, - mca_coll_base_module_t *module) + void* rbuf, + int count, + struct ompi_datatype_t *dtype, + ompi_op_t *op, + int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { int w_rank; /* information about the global communicator */ int root_low_rank, root_up_rank; /* root ranks for both sub-communicators */ @@ -234,23 +278,37 @@ mca_coll_han_reduce_intra_simple(const void *sbuf, mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module; - /* Do not initialize topology if the operation cannot commute */ + /* No support for non-commutative operations */ if(!ompi_op_is_commute(op)){ OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this operation. It needs to fall back on another component\n")); - goto prev_reduce_intra_simple; + "han cannot handle reduce with this operation. Fall back on another component\n")); + goto prev_reduce_intra; + } + + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle reduce with this communicator. 
Drop HAN support in this communicator and fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); } /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ mca_coll_han_topo_init(comm, han_module, 2); - if (han_module->are_ppn_imbalanced){ + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle reduce with this communicator. It needs to fall back on another component\n")); - goto prev_reduce_intra_simple; + "han cannot handle reduce with this communicator (imbalanced). Drop HAN support in this communicator and fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. + */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, reduce); + return comm->c_coll->coll_reduce(sbuf, rbuf, count, dtype, op, root, + comm, comm->c_coll->coll_reduce_module); } - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = han_module->cached_low_comms[mca_coll_han_component.han_reduce_low_module]; ompi_communicator_t *up_comm = @@ -289,7 +347,7 @@ mca_coll_han_reduce_intra_simple(const void *sbuf, OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "HAN/REDUCE: low comm reduce failed. 
" "Falling back to another component\n")); - goto prev_reduce_intra_simple; + goto prev_reduce_intra; } /* Up_comm reduce */ @@ -315,10 +373,9 @@ mca_coll_han_reduce_intra_simple(const void *sbuf, } return OMPI_SUCCESS; -prev_reduce_intra_simple: + prev_reduce_intra: return han_module->previous_reduce(sbuf, rbuf, count, dtype, op, root, - comm, - han_module->previous_reduce_module); + comm, han_module->previous_reduce_module); } @@ -341,15 +398,14 @@ mca_coll_han_reduce_reproducible_decision(struct ompi_communicator_t *comm, int i; for (i=0; imodules_storage - .modules[fallback] - .module_handler; + mca_coll_base_module_t *fallback_module + = han_module->modules_storage.modules[fallback].module_handler; if (fallback_module != NULL && fallback_module->coll_reduce != NULL) { if (0 == w_rank) { opal_output_verbose(30, mca_coll_han_component.han_output, "coll:han:reduce_reproducible: " "fallback on %s\n", - components_name[fallback]); + available_components[fallback].component_name); } han_module->reproducible_reduce_module = fallback_module; han_module->reproducible_reduce = fallback_module->coll_reduce; diff --git a/ompi/mca/coll/han/coll_han_scatter.c b/ompi/mca/coll/han/coll_han_scatter.c index b2a87529384..c52cc1911ac 100644 --- a/ompi/mca/coll/han/coll_han_scatter.c +++ b/ompi/mca/coll/han/coll_han_scatter.c @@ -15,96 +15,105 @@ #include "ompi/mca/pml/pml.h" #include "coll_han_trigger.h" +static int mca_coll_han_scatter_us_task(void *task_args); +static int mca_coll_han_scatter_ls_task(void *task_args); + /* Only work with regular situation (each node has equal number of processes) */ -void mac_coll_han_set_scatter_argu(mca_scatter_argu_t * argu, - mca_coll_task_t * cur_task, - void *sbuf, - void *sbuf_inter_free, - void *sbuf_reorder_free, - int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, - int rcount, - struct ompi_datatype_t *rdtype, - int root, - int root_up_rank, - int root_low_rank, - struct ompi_communicator_t *up_comm, - struct 
ompi_communicator_t *low_comm, - int w_rank, bool noop, ompi_request_t * req) +static inline void +mca_coll_han_set_scatter_args(mca_coll_han_scatter_args_t * args, + mca_coll_task_t * cur_task, + void *sbuf, + void *sbuf_inter_free, + void *sbuf_reorder_free, + int scount, + struct ompi_datatype_t *sdtype, + void *rbuf, + int rcount, + struct ompi_datatype_t *rdtype, + int root, + int root_up_rank, + int root_low_rank, + struct ompi_communicator_t *up_comm, + struct ompi_communicator_t *low_comm, + int w_rank, bool noop, ompi_request_t * req) { - argu->cur_task = cur_task; - argu->sbuf = sbuf; - argu->sbuf_inter_free = sbuf_inter_free; - argu->sbuf_reorder_free = sbuf_reorder_free; - argu->scount = scount; - argu->sdtype = sdtype; - argu->rbuf = rbuf; - argu->rcount = rcount; - argu->rdtype = rdtype; - argu->root = root; - argu->root_up_rank = root_up_rank; - argu->root_low_rank = root_low_rank; - argu->up_comm = up_comm; - argu->low_comm = low_comm; - argu->w_rank = w_rank; - argu->noop = noop; - argu->req = req; + args->cur_task = cur_task; + args->sbuf = sbuf; + args->sbuf_inter_free = sbuf_inter_free; + args->sbuf_reorder_free = sbuf_reorder_free; + args->scount = scount; + args->sdtype = sdtype; + args->rbuf = rbuf; + args->rcount = rcount; + args->rdtype = rdtype; + args->root = root; + args->root_up_rank = root_up_rank; + args->root_low_rank = root_low_rank; + args->up_comm = up_comm; + args->low_comm = low_comm; + args->w_rank = w_rank; + args->noop = noop; + args->req = req; } int mca_coll_han_scatter_intra(const void *sbuf, int scount, - struct ompi_datatype_t *sdtype, - void *rbuf, int rcount, - struct ompi_datatype_t *rdtype, - int root, - struct ompi_communicator_t *comm, mca_coll_base_module_t * module) + struct ompi_datatype_t *sdtype, + void *rbuf, int rcount, + struct ompi_datatype_t *rdtype, + int root, + struct ompi_communicator_t *comm, mca_coll_base_module_t * module) { - int i, j; - int w_rank, w_size; + mca_coll_han_module_t *han_module = 
(mca_coll_han_module_t *) module; + int i, j, w_rank, w_size; w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); - mca_coll_han_module_t *han_module = (mca_coll_han_module_t *) module; - int *topo = mca_coll_han_topo_init(comm, han_module, 2); + /* Create the subcommunicators */ + if( OMPI_SUCCESS != mca_coll_han_comm_create(comm, han_module) ) { /* Let's hope the error is consistently returned across the entire communicator */ + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, + "han cannot handle scatter with this communicator. Fall back on another component\n")); + /* HAN cannot work with this communicator so fallback on all collectives */ + HAN_LOAD_FALLBACK_COLLECTIVES(han_module, comm); + return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, + comm, comm->c_coll->coll_scatter_module); + } + /* Topo must be initialized to know rank distribution which then is used to * determine if han can be used */ - mca_coll_han_topo_init(comm, han_module, 2); - if (han_module->are_ppn_imbalanced){ + int* topo = mca_coll_han_topo_init(comm, han_module, 2); + if (han_module->are_ppn_imbalanced) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "han cannot handle scatter with this communicator. It needs to fall back on another component\n")); - goto prev_scatter_intra; + "han cannot handle scatter with this communicator (imbalance). Fall back on another component\n")); + /* Put back the fallback collective support and call it once. All + * future calls will then be automatically redirected. 
+ */ + HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, scatter); + return comm->c_coll->coll_scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, + comm, comm->c_coll->coll_scatter_module); } - /* Create the subcommunicators */ - mca_coll_han_comm_create(comm, han_module); ompi_communicator_t *low_comm = - han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; + han_module->cached_low_comms[mca_coll_han_component.han_scatter_low_module]; ompi_communicator_t *up_comm = - han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; + han_module->cached_up_comms[mca_coll_han_component.han_scatter_up_module]; int *vranks = han_module->cached_vranks; int low_rank = ompi_comm_rank(low_comm); int low_size = ompi_comm_size(low_comm); int up_size = ompi_comm_size(up_comm); - ompi_request_t *temp_request = NULL; /* Set up request */ - temp_request = OBJ_NEW(ompi_request_t); - OMPI_REQUEST_INIT(temp_request, false); + ompi_request_t *temp_request = OBJ_NEW(ompi_request_t); temp_request->req_state = OMPI_REQUEST_ACTIVE; - temp_request->req_type = 0; + temp_request->req_type = OMPI_REQUEST_COLL; temp_request->req_free = han_request_free; - temp_request->req_status.MPI_SOURCE = 0; - temp_request->req_status.MPI_TAG = 0; - temp_request->req_status.MPI_ERROR = 0; - temp_request->req_status._cancelled = 0; - temp_request->req_status._ucount = 0; + temp_request->req_status = (ompi_status_public_t){0}; + temp_request->req_complete = REQUEST_PENDING; int root_low_rank; int root_up_rank; - mca_coll_han_get_ranks(vranks, root, low_size, &root_low_rank, &root_up_rank); OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter root %d root_low_rank %d root_up_rank %d\n", w_rank, @@ -149,42 +158,55 @@ mca_coll_han_scatter_intra(const void *sbuf, int scount, } } + + void *dest_buf = rbuf; + int dest_count = rcount; + ompi_datatype_t *dest_dtype = rdtype; + if (MPI_IN_PLACE == rbuf) { + dest_buf = (void*)sbuf; + dest_count = 
scount; + dest_dtype = sdtype; + } + /* Create us task */ mca_coll_task_t *us = OBJ_NEW(mca_coll_task_t); /* Setup us task arguments */ - mca_scatter_argu_t *us_argu = malloc(sizeof(mca_scatter_argu_t)); - mac_coll_han_set_scatter_argu(us_argu, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype, - (char *) rbuf, rcount, rdtype, root, root_up_rank, root_low_rank, + mca_coll_han_scatter_args_t *us_args = malloc(sizeof(mca_coll_han_scatter_args_t)); + mca_coll_han_set_scatter_args(us_args, us, reorder_sbuf, NULL, reorder_buf, scount, sdtype, + (char *) dest_buf, dest_count, dest_dtype, root, root_up_rank, root_low_rank, up_comm, low_comm, w_rank, low_rank != root_low_rank, temp_request); /* Init us task */ - init_task(us, mca_coll_han_scatter_us_task, (void *) (us_argu)); + init_task(us, mca_coll_han_scatter_us_task, (void *) (us_args)); /* Issure us task */ issue_task(us); ompi_request_wait(&temp_request, MPI_STATUS_IGNORE); return OMPI_SUCCESS; -prev_scatter_intra: - return han_module->previous_scatter(sbuf, scount, sdtype, - rbuf, rcount, rdtype, - root, comm, - han_module->previous_scatter_module); } /* us: upper level (intra-node) scatter task */ -int mca_coll_han_scatter_us_task(void *task_argu) +int mca_coll_han_scatter_us_task(void *task_args) { - mca_scatter_argu_t *t = (mca_scatter_argu_t *) task_argu; - OBJ_RELEASE(t->cur_task); + mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args; if (t->noop) { OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: us noop\n", t->w_rank)); } else { + size_t count; + ompi_datatype_t *dtype; + if (t->w_rank == t->root) { + dtype = t->sdtype; + count = t->scount; + } else { + dtype = t->rdtype; + count = t->rcount; + } int low_size = ompi_comm_size(t->low_comm); ptrdiff_t rsize, rgap = 0; - rsize = opal_datatype_span(&t->rdtype->super, (int64_t) t->rcount * low_size, &rgap); + rsize = opal_datatype_span(&dtype->super, (int64_t) count * low_size, &rgap); char *tmp_buf = (char 
*) malloc(rsize); char *tmp_rbuf = tmp_buf - rgap; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, @@ -202,9 +224,7 @@ int mca_coll_han_scatter_us_task(void *task_argu) t->sbuf_reorder_free = NULL; } /* Create ls tasks for the current union segment */ - mca_coll_task_t *ls = OBJ_NEW(mca_coll_task_t); - /* Setup up ls task arguments */ - t->cur_task = ls; + mca_coll_task_t *ls = t->cur_task; /* Init ls task */ init_task(ls, mca_coll_han_scatter_ls_task, (void *) t); /* Issure ls task */ @@ -213,14 +233,14 @@ int mca_coll_han_scatter_us_task(void *task_argu) return OMPI_SUCCESS; } -/* ls: lower level (shared memory) scatter task */ -int mca_coll_han_scatter_ls_task(void *task_argu) +/* ls: lower level (shared memory or intra-node) scatter task */ +int mca_coll_han_scatter_ls_task(void *task_args) { - mca_scatter_argu_t *t = (mca_scatter_argu_t *) task_argu; + mca_coll_han_scatter_args_t *t = (mca_coll_han_scatter_args_t *) task_args; OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d] Han Scatter: ls\n", t->w_rank)); OBJ_RELEASE(t->cur_task); - /* Shared memory scatter */ + t->low_comm->c_coll->coll_scatter((char *) t->sbuf, t->scount, t->sdtype, (char *) t->rbuf, t->rcount, t->rdtype, t->root_low_rank, t->low_comm, t->low_comm->c_coll->coll_scatter_module); diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c index e99f3e614b8..bf5b4df523b 100644 --- a/ompi/mca/coll/han/coll_han_subcomms.c +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -26,157 +26,100 @@ #include "coll_han.h" #include "coll_han_dynamic.h" - -/* - * Local functions - */ -static void create_intranode_comm_new(ompi_communicator_t *, - ompi_communicator_t **); -static void create_internode_comm_new(ompi_communicator_t *, - int, int, - ompi_communicator_t **); -static void create_intranode_comm(ompi_communicator_t *, - const char *, - int, - ompi_communicator_t **); -static void create_internode_comm(ompi_communicator_t *, - const char *, - 
int, int, - ompi_communicator_t **); - -/** - * Create a sub-communicator containing the ranks that share my node. - * - * @param comm (IN) original communicator for the collective - * target module priority - * @param sub_comm (OUT) created sub-communicator - */ -static void create_intranode_comm_new(ompi_communicator_t *comm, - ompi_communicator_t **sub_comm) -{ - ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, - (opal_info_t *)(&ompi_mpi_info_null), sub_comm); - return; -} - -/** - * Create a sub-communicator containing one rank per node. - * - * @param comm (IN) original communicator for the collective - * @param my_rank (IN) my rank in comm - * @param intra_rank (IN) local rank in the intra-node sub-communicator - * @param sub_comm (OUT) created sub-communicator - */ -static void create_internode_comm_new(ompi_communicator_t *comm, - int my_rank, - int intra_rank, - ompi_communicator_t **sub_comm) -{ - ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false); - return; -} +#define HAN_SUBCOM_SAVE_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \ + do { \ + (FALLBACKS).COLL.COLL = (COMM)->c_coll->coll_ ## COLL; \ + (FALLBACKS).COLL.module = (COMM)->c_coll->coll_ ## COLL ## _module; \ + (COMM)->c_coll->coll_ ## COLL = (HANM)->fallback.COLL.COLL; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (HANM)->fallback.COLL.module; \ + } while(0) + +#define HAN_SUBCOM_LOAD_COLLECTIVE(FALLBACKS, COMM, HANM, COLL) \ + do { \ + (COMM)->c_coll->coll_ ## COLL = (FALLBACKS).COLL.COLL; \ + (COMM)->c_coll->coll_ ## COLL ## _module = (FALLBACKS).COLL.module; \ + } while(0) /* * Routine that creates the local hierarchical sub-communicators * Called each time a collective is called. 
* comm: input communicator of the collective */ -void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, +int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, mca_coll_han_module_t *han_module) { - int low_rank, low_size; - int up_rank; - int w_rank; - int w_size; + int low_rank, low_size, up_rank, w_rank, w_size; ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]); ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]); - const int *origin_priority; - int han_var_id; - int tmp_han_priority; + mca_coll_han_collectives_fallback_t fallbacks; int vrank, *vranks; - - mca_coll_base_module_allreduce_fn_t old_allreduce; - mca_coll_base_module_t *old_allreduce_module; - - mca_coll_base_module_allgather_fn_t old_allgather; - mca_coll_base_module_t *old_allgather_module; - - mca_coll_base_module_bcast_fn_t old_bcast; - mca_coll_base_module_t *old_bcast_module; - - mca_coll_base_module_gather_fn_t old_gather; - mca_coll_base_module_t *old_gather_module; - - mca_coll_base_module_reduce_fn_t old_reduce; - mca_coll_base_module_t *old_reduce_module; + opal_info_t comm_info; /* The sub communicators have already been created */ - if (NULL != han_module->sub_comm[INTRA_NODE] + if (han_module->enabled && NULL != han_module->sub_comm[INTRA_NODE] && NULL != han_module->sub_comm[INTER_NODE] && NULL != han_module->cached_vranks) { - return; + return OMPI_SUCCESS; } /* - * We cannot use han allreduce and allgather without sub-communicators - * Temporary set previous ones + * We cannot use han allreduce and allgather without sub-communicators, + * but we are in the creation of the data structures for the HAN, and + * temporarily need to save back the old collective. 
* * Allgather is used to compute vranks * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new * Reduce + Bcast may be called by the allreduce implementation * Gather + Bcast may be called by the allgather implementation */ - old_allreduce = comm->c_coll->coll_allreduce; - old_allreduce_module = comm->c_coll->coll_allreduce_module; - - old_allgather = comm->c_coll->coll_allgather; - old_allgather_module = comm->c_coll->coll_allgather_module; - - old_reduce = comm->c_coll->coll_reduce; - old_reduce_module = comm->c_coll->coll_reduce_module; - - old_bcast = comm->c_coll->coll_bcast; - old_bcast_module = comm->c_coll->coll_bcast_module; - - old_gather = comm->c_coll->coll_gather; - old_gather_module = comm->c_coll->coll_gather_module; - - comm->c_coll->coll_allreduce = han_module->previous_allreduce; - comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; - - comm->c_coll->coll_allgather = han_module->previous_allgather; - comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; - - comm->c_coll->coll_reduce = han_module->previous_reduce; - comm->c_coll->coll_reduce_module = han_module->previous_reduce_module; - - comm->c_coll->coll_bcast = han_module->previous_bcast; - comm->c_coll->coll_bcast_module = han_module->previous_bcast_module; + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter); + + /** + * HAN is not yet optimized for a single process per node case, we should + * avoid selecting it for collective communication support in such cases. 
+ * However, in order to decide if this is true, we need to know how many + * local processes are on each node, a condition that cannot be verified + * outside the MPI support (with PRRTE the info will be eventually available, + * but we don't want to delay anything until then). We can achieve the same + * goal by using a reduction over the maximum number of peers per node among + * all participants. + */ + int local_procs = ompi_group_count_local_peers(comm->c_local_group); + comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT, + MPI_MAX, comm, + comm->c_coll->coll_allreduce_module); + if( local_procs == 1 ) { + /* restore saved collectives */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + han_module->enabled = false; /* entire module set to pass-through from now on */ + return OMPI_ERR_NOT_SUPPORTED; + } - comm->c_coll->coll_gather = han_module->previous_gather; - comm->c_coll->coll_gather_module = han_module->previous_gather_module; + OBJ_CONSTRUCT(&comm_info, opal_info_t); /* Create topological sub-communicators */ w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); - origin_priority = NULL; - mca_base_var_find_by_name("coll_han_priority", &han_var_id); - mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); - - /* - * Maximum priority for selector on sub-communicators - */ - tmp_han_priority = 100; - mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - /* * This sub-communicator
contains the ranks that share my node. */ - mca_coll_han_component.topo_level = INTRA_NODE; - create_intranode_comm_new(comm, low_comm); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "han"); + opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTRA_NODE"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, low_comm); /* * Get my local rank and the local size @@ -188,8 +131,8 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, * This sub-communicator contains one process per node: processes with the * same intra-node rank id share such a sub-communicator */ - mca_coll_han_component.topo_level = INTER_NODE; - create_internode_comm_new(comm, w_rank, low_rank, up_comm); + opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTER_NODE"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, up_comm, false); up_rank = ompi_comm_rank(*up_comm); @@ -208,216 +151,116 @@ void mca_coll_han_comm_create_new(struct ompi_communicator_t *comm, * vrank */ comm->c_coll->coll_allgather(&vrank, - 1, - MPI_INT, - vranks, - 1, - MPI_INT, - comm, - comm->c_coll->coll_allgather_module); + 1, + MPI_INT, + vranks, + 1, + MPI_INT, + comm, + comm->c_coll->coll_allgather_module); /* * Set the cached info */ han_module->cached_vranks = vranks; - /* - * Come back to the original han module priority - */ - mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - /* Put allreduce, allgather, reduce and bcast back */ - comm->c_coll->coll_allreduce = old_allreduce; - comm->c_coll->coll_allreduce_module = old_allreduce_module; - - comm->c_coll->coll_allgather = old_allgather; - comm->c_coll->coll_allgather_module = old_allgather_module; - - comm->c_coll->coll_reduce = old_reduce; - comm->c_coll->coll_reduce_module = old_reduce_module; - - comm->c_coll->coll_bcast = old_bcast; - comm->c_coll->coll_bcast_module = old_bcast_module; - - comm->c_coll->coll_gather = old_gather; - 
comm->c_coll->coll_gather_module = old_gather_module; - - mca_coll_han_component.topo_level = GLOBAL_COMMUNICATOR; + /* Reset the saved collectives to point back to HAN */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + + OBJ_DESTRUCT(&comm_info); + return OMPI_SUCCESS; } -/** - * Create a sub-communicator containing the ranks that share my node. - * Associate this sub-communicator a given collective module. - * module can be one of: - * . sm - * . shared - * - * @param comm (IN) original communicator for the collective - * @param prio_string (IN) string containing the mca variable associated to - * target module priority - * @param my_rank (IN) my rank in comm - * @param sub_comm (OUT) created sub-communicator - */ -static void create_intranode_comm(ompi_communicator_t *comm, - const char *prio_string, - int my_rank, - ompi_communicator_t **sub_comm) -{ - int var_id; - const int *sav_priority; - int tmp_priority = 100; - - /* - * Upgrade the target module priority to make the resulting sub-communicator - * use that collective module - */ - mca_base_var_find_by_name(prio_string, &var_id); - mca_base_var_get_value(var_id, &sav_priority, NULL, NULL); - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] %s origin %d\n", - my_rank, prio_string, *sav_priority)); - - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - /* - * Create the sub-communicator - * Since the target priority has been set to the highest value, this 
- * sub-communicator will inherit it as a collective module. - */ - ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, - (opal_info_t *)(&ompi_mpi_info_null), sub_comm); - /* - * Come back to the target module's original priority - */ - mca_base_var_set_value(var_id, sav_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - return; -} - -/** - * Create a sub-communicator containing one rank per node. - * Associate this sub-communicator a given collective module. - * module can be one of: - * . libnbc - * . adapt - * - * @param comm (IN) original communicator for the collective - * @param prio_string (IN) string containing the mca variable associated to - * target module priority - * @param my_rank (IN) my rank in comm - * @param intra_rank (IN) local rank in the intra-node sub-communicator - * @param sub_comm (OUT) created sub-communicator - */ -static void create_internode_comm(ompi_communicator_t *comm, - const char *prio_string, - int my_rank, - int intra_rank, - ompi_communicator_t **sub_comm) -{ - int var_id; - const int *sav_priority; - int tmp_priority = 100; - - /* - * Upgrade the target module priority to make the resulting sub-communicator - * use that collective module - */ - mca_base_var_find_by_name(prio_string, &var_id); - mca_base_var_get_value(var_id, &sav_priority, NULL, NULL); - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, - "[%d] %s origin %d\n", my_rank, prio_string, - *sav_priority)); - mca_base_var_set_flag(var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(var_id, &tmp_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - /* - * Create the sub-communicator - * Since the target priority has been set to the highest value, this - * sub-communicator will inherit it as a collective module. 
- */ - ompi_comm_split(comm, intra_rank, my_rank, sub_comm, false); - mca_base_var_set_value(var_id, sav_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - return; -} - - /* * Routine that creates the local hierarchical sub-communicators * Called each time a collective is called. * comm: input communicator of the collective */ -void mca_coll_han_comm_create(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module) +int mca_coll_han_comm_create(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module) { - int low_rank, low_size; - int up_rank; - int w_rank; - int w_size; + int low_rank, low_size, up_rank, w_rank, w_size; + mca_coll_han_collectives_fallback_t fallbacks; ompi_communicator_t **low_comms; ompi_communicator_t **up_comms; - const int *origin_priority; - int han_var_id; - int tmp_han_priority; int vrank, *vranks; - - mca_coll_base_module_allreduce_fn_t old_allreduce; - mca_coll_base_module_t *old_allreduce_module; - mca_coll_base_module_allgather_fn_t old_allgather; - mca_coll_base_module_t *old_allgather_module; + opal_info_t comm_info; /* use cached communicators if possible */ - if (han_module->cached_comm == comm && - han_module->cached_low_comms != NULL && - han_module->cached_up_comms != NULL && - han_module->cached_vranks != NULL) { - return; + if (han_module->enabled && han_module->cached_low_comms != NULL && + han_module->cached_up_comms != NULL && + han_module->cached_vranks != NULL) { + return OMPI_SUCCESS; } - /* We cannot use han allreduce and allgather without sub-communicators - * Temporary set previous ones */ - old_allreduce = comm->c_coll->coll_allreduce; - old_allreduce_module = comm->c_coll->coll_allreduce_module; - - old_allgather = comm->c_coll->coll_allgather; - old_allgather_module = comm->c_coll->coll_allgather_module; - - comm->c_coll->coll_allreduce = han_module->previous_allreduce; - comm->c_coll->coll_allreduce_module = han_module->previous_allreduce_module; - - comm->c_coll->coll_allgather = 
han_module->previous_allgather; - comm->c_coll->coll_allgather_module = han_module->previous_allgather_module; + /* + * We cannot use han allreduce and allgather without sub-communicators, + * but we are in the creation of the data structures for the HAN, and + * temporarily need to save back the old collective. + * + * Allgather is used to compute vranks + * Allreduce is used by ompi_comm_split_type in create_intranode_comm_new + * Reduce + Bcast may be called by the allreduce implementation + * Gather + Bcast may be called by the allgather implementation + */ + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_SAVE_COLLECTIVE(fallbacks, comm, han_module, scatter); + + /** + * HAN is not yet optimized for a single process per node case, we should + * avoid selecting it for collective communication support in such cases. + * However, in order to decide if this is true, we need to know how many + * local processes are on each node, a condition that cannot be verified + * outside the MPI support (with PRRTE the info will be eventually available, + * but we don't want to delay anything until then). We can achieve the same + * goal by using a reduction over the maximum number of peers per node among + * all participants.
+ */ + int local_procs = ompi_group_count_local_peers(comm->c_local_group); + comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT, + MPI_MAX, comm, + comm->c_coll->coll_allreduce_module); + if( local_procs == 1 ) { + /* restore saved collectives */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + han_module->enabled = false; /* entire module set to pass-through from now on */ + return OMPI_ERR_NOT_SUPPORTED; + } /* create communicators if there is no cached communicator */ - w_rank = ompi_comm_rank(comm); w_size = ompi_comm_size(comm); low_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_LOW_MODULES * sizeof(struct ompi_communicator_t *)); up_comms = (struct ompi_communicator_t **)malloc(COLL_HAN_UP_MODULES * sizeof(struct ompi_communicator_t *)); - origin_priority = NULL; - mca_base_var_find_by_name("coll_han_priority", &han_var_id); - mca_base_var_get_value(han_var_id, &origin_priority, NULL, NULL); - /* - * Lower down our current priority - */ - tmp_han_priority = 0; - mca_base_var_set_flag(han_var_id, MCA_BASE_VAR_FLAG_SETTABLE, true); - mca_base_var_set_value(han_var_id, &tmp_han_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); + OBJ_CONSTRUCT(&comm_info, opal_info_t); /* * Upgrade sm module priority to set up low_comms[0] with sm module * This sub-communicator contains the ranks that share my node. 
*/ - create_intranode_comm(comm, "coll_sm_priority", w_rank, &(low_comms[0])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "sm,^han"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, &(low_comms[0])); /* * Get my local rank and the local size @@ -429,15 +272,17 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, * Upgrade shared module priority to set up low_comms[1] with shared module * This sub-communicator contains the ranks that share my node. */ - create_intranode_comm(comm, "coll_shared_priority", w_rank, &(low_comms[1])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "shared,^han"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, &(low_comms[1])); /* * Upgrade libnbc module priority to set up up_comms[0] with libnbc module * This sub-communicator contains one process per node: processes with the * same intra-node rank id share such a sub-communicator */ - create_internode_comm(comm, "coll_libnbc_priority", w_rank, low_rank, - &(up_comms[0])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "libnbc,^han"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[0]), false); up_rank = ompi_comm_rank(up_comms[0]); @@ -445,8 +290,8 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, * Upgrade adapt module priority to set up up_comms[0] with adapt module * This sub-communicator contains one process per node. */ - create_internode_comm(comm, "coll_adapt_priority", w_rank, low_rank, - &(up_comms[1])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "adapt,^han"); + ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[1]), false); /* * Set my virtual rank number. 
@@ -468,23 +313,21 @@ void mca_coll_han_comm_create(struct ompi_communicator_t *comm, /* * Set the cached info */ - han_module->cached_comm = comm; han_module->cached_low_comms = low_comms; han_module->cached_up_comms = up_comms; han_module->cached_vranks = vranks; - /* - * Come back to the original han module priority - */ - mca_base_var_set_value(han_var_id, origin_priority, sizeof(int), - MCA_BASE_VAR_SOURCE_SET, NULL); - - /* Put allreduce and allgather back */ - comm->c_coll->coll_allreduce = old_allreduce; - comm->c_coll->coll_allreduce_module = old_allreduce_module; - - comm->c_coll->coll_allgather = old_allgather; - comm->c_coll->coll_allgather_module = old_allgather_module; + /* Reset the saved collectives to point back to HAN */ + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allreduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, bcast); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, reduce); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, gather); + HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, scatter); + + OBJ_DESTRUCT(&comm_info); + return OMPI_SUCCESS; } diff --git a/ompi/mca/coll/han/coll_han_topo.c b/ompi/mca/coll/han/coll_han_topo.c index cbcfd698d05..e25e37207e2 100644 --- a/ompi/mca/coll/han/coll_han_topo.c +++ b/ompi/mca/coll/han/coll_han_topo.c @@ -35,313 +35,161 @@ #include "coll_han.h" -/* - * Local functions - */ - -static int mca_coll_han_hostname_to_number(char* hostname, int size); -static void mca_coll_han_topo_get(int *topo, - struct ompi_communicator_t* comm, - int num_topo_level); -static void mca_coll_han_topo_sort(int *topo, int start, int end, - int level, int num_topo_level); -static bool mca_coll_han_topo_is_mapbycore(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level); -static void mca_coll_han_topo_print(int *topo, - 
struct ompi_communicator_t *comm, - int num_topo_level); - - -/* - * takes the number part of a host: hhh2031 -->2031 - */ -static int mca_coll_han_hostname_to_number(char* hostname, int size) +#if OPAL_ENABLE_DEBUG +static void +mca_coll_han_topo_print(int *topo, + struct ompi_communicator_t *comm, + int num_topo_level) { - int i, j; - char *number_array = (char *)malloc(sizeof(char) * size); - int number = 0; + int rank = ompi_comm_rank(comm); + int size = ompi_comm_size(comm); - for (i = 0, j = 0; hostname[i] != '\0'; i++) { - if ('0' <= hostname[i] && '9' >= hostname[i]) { - number_array[j++] = hostname[i]; + if (rank == 0) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han topo: ", rank)); + for( int i = 0; i < size*num_topo_level; i++ ) { + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "%d ", topo[i])); } + OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "\n")); } - number_array[j] = '\0'; - number = atoi(number_array); - free(number_array); - return number; } +#endif /* OPAL_ENABLE_DEBUG */ -/* - * Set the virtual topo id. It is made of num_topo_level ints (2 today): - * . the integer part of the host id - * . the rank in the main communicator - * Gather the virtual topoid from each process so every process will know other - * processes virtual topids - */ -static void mca_coll_han_topo_get(int *topo, - struct ompi_communicator_t* comm, - int num_topo_level) -{ - int *self_topo = (int *)malloc(sizeof(int) * num_topo_level); - char hostname[1024]; - - gethostname(hostname, 1024); - self_topo[0] = mca_coll_han_hostname_to_number(hostname, 1024); - self_topo[1] = ompi_comm_rank(comm); - - ompi_coll_base_allgather_intra_bruck(self_topo, num_topo_level, MPI_INT, - topo, num_topo_level, MPI_INT, comm, - comm->c_coll->coll_allgather_module); - free(self_topo); - - return; -} - -/* - * Sort the topology array in order to have ranks sharing the same node - * contiguous in the topology array. 
- * Called from topo_init whenever the processes are not mapped by core. - * ex: 4 ranks executing on 2 nodes, mapped by node - * ranks 0 and 2 on hid0 - * ranks 1 and 3 on hid1 - * On entry the topo array looks like - * hid0 0 hid1 1 hid0 2 hid1 3 - * After the sort: - * hid0 0 hid0 2 hid1 1 hid1 3 - * This is to have the gather result in the right order - * - * @param topo (IN/OUT) topology description array (sorted in out) - * @param start (IN) where to begin the processing - * The index in topo will actually be: - * start * num_topo_level + level - * topo contains num_topo_level ids per rank. - * @param end (IN) where to stop the processing - * The index in topo will actually be: - * end * num_topo_level + level - * topo contains num_topo_level ids per rank. - * @param level (IN) level number we are currently processing - * @param num_topo_level (IN) number of topological levels +/** + * Topology initialization phase + * Called each time a collective that needs buffer reordering is called * + * @param num_topo_level (IN) Number of the topological levels */ -static void mca_coll_han_topo_sort(int *topo, int start, int end, - int level, int num_topo_level) +int* +mca_coll_han_topo_init(struct ompi_communicator_t *comm, + mca_coll_han_module_t *han_module, + int num_topo_level) { - int i, j; - int min, min_loc; - int last, new_start, new_end; - - if (level > num_topo_level-1 || start >= end) { - return; - } - - min = INT_MAX; - min_loc = -1; - for (i = start; i <= end; i++) { - int temp; - /* get the min value for current level and its location */ - for (j = i; j <= end; j++) { - /* topo contains num_topo_level ids per rank. 
*/ - if (topo[j * num_topo_level + level] < min) { - min = topo[j*num_topo_level+level]; - min_loc = j; - - } - } - /* - * swap i and min_loc - * We have num_topo_level ids to swap - */ - for (j = 0; j < num_topo_level; j++) { - temp = topo[i * num_topo_level + j]; - topo[i * num_topo_level + j] = topo[min_loc * num_topo_level + j]; - topo[min_loc * num_topo_level + j] = temp; - } - min = INT_MAX; - min_loc = -1; + if ( NULL != han_module->cached_topo ) { + return han_module->cached_topo; } - /* Process next level */ - last = 0; - new_start = 0; - new_end = 0; - for (i = start; i <= end; i++) { - if (i == start) { - last = topo[i * num_topo_level + level]; - new_start = start; - } else if (i == end) { - new_end = end; - mca_coll_han_topo_sort(topo, new_start, new_end, level + 1, - num_topo_level); - } else if (last != topo[i * num_topo_level + level]) { - new_end = i - 1; - mca_coll_han_topo_sort(topo, new_start, new_end, level + 1, - num_topo_level); - new_start = i; - last = topo[i * num_topo_level + level]; - } - } - return; -} + ompi_communicator_t *up_comm, *low_comm; + ompi_request_t *request = MPI_REQUEST_NULL; + int *my_low_rank_map = NULL; + int *ranks_map = NULL; -/* - * Check whether the ranks in the communicator given as input are mapped by core - * Mapped by core: each node is first filled with as many ranks as needed before - * moving to the next one - * This is checked as follows: - * . 2 contiguous ranks should be either on the same node or on node ids in - * ascending order - * The topology is actually an array of ints: - * +----------+-------+----------+-------+------+----------+-------+-----+ - * | host_id0 | rank0 | host_id1 | rank1 | .... | host_idX | rankX | ... 
| - * +----------+-------+----------+-------+------+----------+-------+-----+ - */ -static bool mca_coll_han_topo_is_mapbycore(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level) -{ - int i; int size = ompi_comm_size(comm); - for (i = 1; i < size; i++) { - /* - * The host id for a given rank should be < host id for the next rank - */ - if (topo[(i - 1) * num_topo_level] > topo[i * num_topo_level]) { - return false; - } - /* - * For the same host id, consecutive ranks should be sorted in - * ascending order. - */ - if (topo[(i - 1) * num_topo_level + 1] > topo[i * num_topo_level + 1]) { - return false; - } - } - return true; -} - -/* The topo is supposed sorted by host */ -static bool mca_coll_han_topo_are_ppn_imbalanced(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level){ - int i; - int size = ompi_comm_size(comm); - if (size < 2){ - return false; + if (NULL != han_module->cached_up_comms) { + up_comm = han_module->cached_up_comms[0]; + low_comm = han_module->cached_low_comms[0]; + } else { + up_comm = han_module->sub_comm[INTER_NODE]; + low_comm = han_module->sub_comm[INTRA_NODE]; } - int ppn; - int last_host = topo[0]; + assert(up_comm != NULL && low_comm != NULL); - /* Find the ppn for the first node */ - for (i = 1; i < size; i++) { - if (topo[i * num_topo_level] != last_host){ - break; - } - } - ppn = i; + int low_rank = ompi_comm_rank(low_comm); + int low_size = ompi_comm_size(low_comm); - /* All on one node */ - if ( size == ppn){ - return false; - } - /* Trivial case */ - if (size % ppn != 0){ - return true; - } + int *topo = (int *)malloc(sizeof(int) * size * num_topo_level); + int is_imbalanced = 1; + int ranks_consecutive = 1; - last_host = topo[ppn * num_topo_level]; - /* Check that the 2nd and next hosts also this ppn. 
Since the topo is sorted - * one just need to jump ppn ranks to check the supposed switch of host */ - for (i = 2 * ppn; i < size; i += ppn ){ - /* the list of ranks for the last known host have ended before */ - if (topo[(i-1) * num_topo_level] != last_host){ - return true; + /* node leaders translate the node-local ranks to global ranks and check whether they are placed consecutively */ + if (0 == low_rank) { + my_low_rank_map = malloc(sizeof(int)*low_size); + for (int i = 0; i < low_size; ++i) { + topo[i] = i; } - /* the list of ranks for the last known host are bigger than excpected */ - if (topo[(i-1) * num_topo_level] == topo[i*num_topo_level]){ - return true; + ompi_group_translate_ranks(low_comm->c_local_group, low_size, topo, + comm->c_local_group, my_low_rank_map); + /* check if ranks are consecutive */ + int rank = my_low_rank_map[0] + 1; + for (int i = 1; i < low_size; ++i, ++rank) { + if (my_low_rank_map[i] != rank) { + ranks_consecutive = 0; + break; + } } - last_host = topo[i * num_topo_level]; - } - /* Check the last host */ - if (topo[(size-1) * num_topo_level] != last_host){ - return true; - } - - return false; -} + int reduce_vals[] = {ranks_consecutive, -ranks_consecutive, low_size, -low_size}; -/** - * Topology initialization phase - * Called each time a collective that needs buffer reordering is called - * - * @param num_topo_level (IN) Number of the topological levels - */ -int *mca_coll_han_topo_init(struct ompi_communicator_t *comm, - mca_coll_han_module_t *han_module, - int num_topo_level) -{ - int size; - int *topo; + up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, &reduce_vals, 4, + MPI_INT, MPI_MAX, up_comm, + up_comm->c_coll->coll_allreduce_module); - size = ompi_comm_size(comm); + /* is the distribution of processes balanced per node? */ + is_imbalanced = (reduce_vals[2] == -reduce_vals[3]) ? 0 : 1; + ranks_consecutive = (reduce_vals[0] == -reduce_vals[1]) ? 
1 : 0; - if (!((han_module->cached_topo) && (han_module->cached_comm == comm))) { - if (han_module->cached_topo) { - free(han_module->cached_topo); - han_module->cached_topo = NULL; + if ( !ranks_consecutive && !is_imbalanced ) { + /* kick off up_comm allgather to collect non-consecutive rank information at node leaders */ + ranks_map = malloc(sizeof(int)*size); + up_comm->c_coll->coll_iallgather(my_low_rank_map, low_size, MPI_INT, + ranks_map, low_size, MPI_INT, up_comm, &request, + up_comm->c_coll->coll_iallgather_module); } + } - topo = (int *)malloc(sizeof(int) * size * num_topo_level); - - /* get topo infomation */ - mca_coll_han_topo_get(topo, comm, num_topo_level); - mca_coll_han_topo_print(topo, comm, num_topo_level); - /* - * All the ranks now have the topo information - */ + /* broadcast balanced and consecutive properties from node leaders to remaining ranks */ + int bcast_vals[] = {is_imbalanced, ranks_consecutive}; + low_comm->c_coll->coll_bcast(bcast_vals, 2, MPI_INT, 0, + low_comm, low_comm->c_coll->coll_bcast_module); + is_imbalanced = bcast_vals[0]; + ranks_consecutive = bcast_vals[1]; + + /* error out if the rank distribution is not balanced */ + if (is_imbalanced) { + assert(MPI_REQUEST_NULL == request); + han_module->are_ppn_imbalanced = true; + free(topo); + if( NULL != my_low_rank_map ) free(my_low_rank_map); + if( NULL != ranks_map ) free(ranks_map); + return NULL; + } - /* check if the processes are mapped by core */ - han_module->is_mapbycore = mca_coll_han_topo_is_mapbycore(topo, comm, num_topo_level); + han_module->are_ppn_imbalanced = false; + if (ranks_consecutive) { + /* fast-path: all ranks are consecutive and balanced so fill topology locally */ + for (int i = 0; i < size; ++i) { + topo[2*i] = (i/low_size); // node leader is node ID + topo[2*i+1] = i; + } + han_module->is_mapbycore = true; + } else { /* - * If not, sort the topo such that each group of ids is sorted by rank - * i.e. ids for rank i are contiguous to ids for rank i+1. 
- * This will be needed for the operations that are order sensitive - * (like gather) + * Slow path: gather global-to-node-local rank mappings at node leaders + * + * The topology will contain a mapping from global consecutive positions + * to ranks in the communicator. + * + * ex: 4 ranks executing on 2 nodes, mapped by node + * ranks 0 and 2 on hid0 + * ranks 1 and 3 on hid1 + * On entry the topo array looks like + * hid0 0 hid1 1 hid0 2 hid1 3 + * After the sort: + * hid0 0 hid0 2 hid1 1 hid1 3 */ - if (!han_module->is_mapbycore) { - mca_coll_han_topo_sort(topo, 0, size-1, 0, num_topo_level); + if (0 == low_rank) { + ompi_request_wait(&request, MPI_STATUS_IGNORE); + /* fill topology */ + for (int i = 0; i < size; ++i) { + topo[2*i] = ranks_map[(i/low_size)*low_size]; // node leader is node ID + topo[2*i+1] = ranks_map[i]; + } + free(ranks_map); } - han_module->are_ppn_imbalanced = mca_coll_han_topo_are_ppn_imbalanced(topo, comm , num_topo_level); - han_module->cached_topo = topo; - han_module->cached_comm = comm; - } else { - topo = han_module->cached_topo; } + /* broadcast topology from node leaders to remaining ranks */ + low_comm->c_coll->coll_bcast(topo, num_topo_level*size, MPI_INT, 0, + low_comm, low_comm->c_coll->coll_bcast_module); + free(my_low_rank_map); + han_module->cached_topo = topo; +#if OPAL_ENABLE_DEBUG mca_coll_han_topo_print(topo, comm, num_topo_level); - return topo; -} - -static void mca_coll_han_topo_print(int *topo, - struct ompi_communicator_t *comm, - int num_topo_level) -{ - int rank = ompi_comm_rank(comm); - int size = ompi_comm_size(comm); +#endif /* OPAL_ENABLE_DEBUG */ - if (rank == 0) { - int i; - OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output, "[%d]: Han Scatter topo: ", rank)); - for (i=0; ifunc_ptr = NULL; - t->func_argu = NULL; + t->func_args = NULL; } static void mca_coll_task_destructor(mca_coll_task_t * t) { t->func_ptr = NULL; - t->func_argu = NULL; + t->func_args = NULL; } OBJ_CLASS_INSTANCE(mca_coll_task_t, 
opal_object_t, mca_coll_task_constructor, mca_coll_task_destructor); - -/* Init task */ -int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_argu) -{ - t->func_ptr = func_ptr; - t->func_argu = func_argu; - return OMPI_SUCCESS; -} - -/* Issue the task */ -int issue_task(mca_coll_task_t * t) -{ - t->func_ptr(t->func_argu); - return OMPI_SUCCESS; -} diff --git a/ompi/mca/coll/han/coll_han_trigger.h b/ompi/mca/coll/han/coll_han_trigger.h index c7314d25fb8..413e393be61 100644 --- a/ompi/mca/coll/han/coll_han_trigger.h +++ b/ompi/mca/coll/han/coll_han_trigger.h @@ -12,25 +12,17 @@ #ifndef MCA_COLL_HAN_TRIGGER_EXPORT_H #define MCA_COLL_HAN_TRIGGER_EXPORT_H -#include "ompi_config.h" -#include "mpi.h" -#include "ompi/mca/mca.h" -#include "ompi/mca/coll/coll.h" #include "ompi/communicator/communicator.h" -#include "ompi/win/win.h" -#include "ompi/mca/coll/base/coll_base_functions.h" -#include "opal/util/info.h" #include "ompi/op/op.h" -#include "opal/runtime/opal_progress.h" -#include "ompi/mca/pml/pml.h" -#include "ompi/mca/coll/base/coll_tags.h" +#include "ompi/datatype/ompi_datatype.h" + typedef int (*task_func_ptr) (void *); struct mca_coll_task_s { opal_object_t super; task_func_ptr func_ptr; - void *func_argu; + void *func_args; }; typedef struct mca_coll_task_s mca_coll_task_t; @@ -38,9 +30,20 @@ typedef struct mca_coll_task_s mca_coll_task_t; OBJ_CLASS_DECLARATION(mca_coll_task_t); /* Init task */ -int init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_argu); +static inline int +init_task(mca_coll_task_t * t, task_func_ptr func_ptr, void *func_args) +{ + OBJ_CONSTRUCT(t, mca_coll_task_t); + t->func_ptr = func_ptr; + t->func_args = func_args; + return OMPI_SUCCESS; +} /* Issue the task */ -int issue_task(mca_coll_task_t * t); +static inline int +issue_task(mca_coll_task_t * t) +{ + return t->func_ptr(t->func_args); +} -#endif /* MCA_COLL_HAN_TRIGGER_EXPORT_H */ +#endif /* MCA_COLL_HAN_TRIGGER_EXPORT_H */ diff --git 
a/ompi/mca/coll/han/coll_han_utils.c b/ompi/mca/coll/han/coll_han_utils.c deleted file mode 100644 index 293777a256e..00000000000 --- a/ompi/mca/coll/han/coll_han_utils.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018-2020 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "coll_han.h" - -/* Get root's low_rank and up_rank from vranks array */ -void mca_coll_han_get_ranks(int *vranks, int root, int low_size, int *root_low_rank, - int *root_up_rank) -{ - *root_up_rank = vranks[root] / low_size; - *root_low_rank = vranks[root] % low_size; -} - -uint32_t han_auto_tuned_get_n(uint32_t n) -{ - uint32_t avail[5] = { 4, 8, 16, 32, 64 }; - uint32_t i; - for (i = 0; i < 5; i++) { - if (avail[i] >= n) { - return i; - } - } - return i - 1; -} - -uint32_t han_auto_tuned_get_c(uint32_t c) -{ - uint32_t avail[3] = { 4, 8, 12 }; - uint32_t i; - for (i = 0; i < 3; i++) { - if (avail[i] >= c) { - return i; - } - } - return i - 1; -} - -uint32_t han_auto_tuned_get_m(uint32_t m) -{ - uint32_t avail[21] = - { 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, -262144, 524288, 1048576, 2097152, 4194304 }; - uint32_t i; - for (i = 0; i < 21; i++) { - if (avail[i] >= m) { - return i; - } - } - return i - 1; -} diff --git a/ompi/mca/coll/sm/coll_sm_module.c b/ompi/mca/coll/sm/coll_sm_module.c index 781215251ea..25e9c779467 100644 --- a/ompi/mca/coll/sm/coll_sm_module.c +++ b/ompi/mca/coll/sm/coll_sm_module.c @@ -176,7 +176,7 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority) if (OMPI_COMM_IS_INTER(comm) || 1 == ompi_comm_size(comm) || ompi_group_have_remote_peers (comm->c_local_group)) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:comm_query (%d/%s): intercomm, comm is too small, or not all peers local; disqualifying myself", 
comm->c_contextid, comm->c_name); - return NULL; + return NULL; } /* Get the priority level attached to this module. If priority is less diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index cc73fcf835b..637122185e5 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -1446,7 +1446,8 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, int scount, communicator_size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - if (rank == root) { + /* Determine block size */ + if ( (rank == root) || (MPI_IN_PLACE == sbuf) ) { ompi_datatype_type_size(rdtype, &dsize); total_dsize = dsize * (ptrdiff_t)rcount; } else { diff --git a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c index 098a4fa9491..a259c789ac2 100644 --- a/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c +++ b/ompi/mca/coll/tuned/coll_tuned_dynamic_file.c @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University + * Copyright (c) 2004-2020 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -40,7 +40,7 @@ static int fileline=0; /* used for verbose error messages */ -#define getnext(fptr) ompi_coll_base_file_getnext(fptr, &fileline) +#define getnext(fptr, pval) ompi_coll_base_file_getnext_long(fptr, &fileline, pval) /* * Reads a rule file called fname @@ -56,9 +56,8 @@ static int fileline=0; /* used for verbose error messages */ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** rules, int n_collectives) { + long CI, NCS, CS, ALG, NMS, FANINOUT, X, MS, SS; FILE *fptr = (FILE*) NULL; - int X, CI, NCS, CS, ALG, NMS, FANINOUT; - long MS, SS; int x, ncs, nms; ompi_coll_alg_rule_t *alg_rules = (ompi_coll_alg_rule_t*) NULL; /* complete table of rules */ @@ -101,45 +100,42 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** goto on_file_error; } - X = (int)getnext(fptr); - if (X<0) { + if( (getnext(fptr, &X) < 0) || (X < 0) ) { OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of collectives in configuration file around line %d\n", fileline)); goto on_file_error; } if (X>n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %d is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Number of collectives in configuration file %ld is greater than number of MPI collectives possible %d ??? error around line %d\n", X, n_collectives, fileline)); goto on_file_error; } for (x=0;x=n_collectives) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %d is greater than MPI collectives possible %d. Error around line %d\n", CI, n_collectives, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Collective id in configuration file %ld is greater than MPI collectives possible %d. 
Error around line %d\n", CI, n_collectives, fileline)); goto on_file_error; } if (alg_rules[CI].alg_rule_id != CI) { - OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Internal error in handling collective ID %ld\n", CI)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Reading dynamic rule for collective ID %ld\n", CI)); alg_p = &alg_rules[CI]; alg_p->alg_rule_id = CI; alg_p->n_com_sizes = 0; alg_p->com_rules = (ompi_coll_com_rule_t *) NULL; - NCS = (int)getnext (fptr); - if (NCS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %d at around line %d\n", CI, fileline)); + if( (getnext (fptr, &NCS) < 0) || (NCS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read count of communicators for collective ID %ld at around line %d\n", CI, fileline)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %d for dynamic rule for collective ID %d\n", NCS, CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Read communicator count %ld for dynamic rule for collective ID %ld\n", NCS, CI)); alg_p->n_com_sizes = NCS; alg_p->com_rules = ompi_coll_tuned_mk_com_rules (NCS, CI); if (NULL == alg_p->com_rules) { @@ -151,20 +147,18 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** com_p = &(alg_p->com_rules[ncs]); - CS = (int)getnext (fptr); - if (CS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline)); + if( (getnext (fptr, &CS) < 0) || (CS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read communicator size for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); goto on_file_error; } com_p->mpi_comsize = CS; - NMS = (int)getnext (fptr); - if (NMS<0) { - 
OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %d com rule %d at around line %d\n", CI, ncs, fileline)); + if( (getnext (fptr, &NMS) < 0) || (NMS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read number of message sizes for collective ID %ld com rule %d at around line %d\n", CI, ncs, fileline)); goto on_file_error; } - OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %d for dynamic rule for collective ID %d and comm size %d\n", + OPAL_OUTPUT((ompi_coll_tuned_stream, "Read message count %ld for dynamic rule for collective ID %ld and comm size %ld\n", NMS, CI, CS)); com_p->n_msg_sizes = NMS; com_p->msg_rules = ompi_coll_tuned_mk_msg_rules (NMS, CI, ncs, CS); @@ -179,37 +173,33 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** msg_p = &(com_p->msg_rules[nms]); - MS = getnext (fptr); - if (MS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &MS) < 0) || (MS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read message size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->msg_size = (size_t)MS; - ALG = (int)getnext (fptr); - if (ALG<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &ALG) < 0) || (ALG < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target algorithm method for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_alg = ALG; - FANINOUT = (int)getnext (fptr); - if (FANINOUT<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %d com rule %d msg rule %d at around line 
%d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &FANINOUT) < 0) || (FANINOUT < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read fan in/out topo for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_topo_faninout = FANINOUT; - SS = getnext (fptr); - if (SS<0) { - OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %d com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); + if( (getnext (fptr, &SS) < 0) || (SS < 0) ) { + OPAL_OUTPUT((ompi_coll_tuned_stream,"Could not read target segment size for collective ID %ld com rule %d msg rule %d at around line %d\n", CI, ncs, nms, fileline)); goto on_file_error; } msg_p->result_segsize = SS; if (!nms && MS) { OPAL_OUTPUT((ompi_coll_tuned_stream,"All algorithms must specify a rule for message size of zero upwards always first!\n")); - OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %d com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); + OPAL_OUTPUT((ompi_coll_tuned_stream,"Message size was %lu for collective ID %ld com rule %d msg rule %d at around line %d\n", MS, CI, ncs, nms, fileline)); goto on_file_error; } @@ -222,7 +212,7 @@ int ompi_coll_tuned_read_rules_config_file (char *fname, ompi_coll_alg_rule_t** } /* comm size */ total_alg_count++; - OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %d\n", CI)); + OPAL_OUTPUT((ompi_coll_tuned_stream, "Done reading dynamic rule for collective ID %ld\n", CI)); } /* per collective */ diff --git a/ompi/request/request.c b/ompi/request/request.c index a8ddb68ad3a..abf33449d89 100644 --- a/ompi/request/request.c +++ b/ompi/request/request.c @@ -54,7 +54,7 @@ static void ompi_request_construct(ompi_request_t* req) /* don't call _INIT, we don't to set the request to _INACTIVE and there will * be no matching _FINI invocation */ req->req_state = 
OMPI_REQUEST_INVALID; - req->req_complete = false; + req->req_complete = REQUEST_COMPLETED; req->req_persistent = false; req->req_start = NULL; req->req_free = NULL; diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index efed62451ac..6f9fdce2774 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -380,7 +380,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, } complete_loop: assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - if( 0 != iov_len_local ) { + if( (pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA) && (0 != iov_len_local) ) { unsigned char* temp = conv_ptr; /* We have some partial data here. Let's copy it into the convertor * and keep it hot until the next round. @@ -391,7 +391,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, opal_unpack_partial_datatype( pConvertor, pElem, iov_ptr, 0, iov_len_local, &temp ); - + pConvertor->partial_length = iov_len_local; iov_len_local = 0; }