Skip to content

v3.0.x: pml/ob1: fixed OOS message handling. #4869

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 32 additions & 18 deletions ompi/mca/pml/ob1/pml_ob1.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2012 The University of Tennessee and The University
* Copyright (c) 2004-2018 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
Expand Down Expand Up @@ -223,8 +223,6 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm)
opal_list_remove_item (&mca_pml_ob1.non_existing_communicator_pending,
(opal_list_item_t *) frag);

add_fragment_to_unexpected:

/* We generate the MSG_ARRIVED event as soon as the PML is aware
* of a matching fragment arrival, independently of whether it is received
* in the correct order or not. This will allow the tools to
Expand All @@ -242,7 +240,9 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm)
*/
pml_proc = mca_pml_ob1_peer_lookup(comm, hdr->hdr_src);

if( ((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) {
if (((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) {

add_fragment_to_unexpected:
/* We're now expecting the next sequence number. */
pml_proc->expected_sequence++;
opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
Expand All @@ -254,19 +254,16 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm)
* situation as the cant_match is only checked when a new fragment is received from
* the network.
*/
for(frag = (mca_pml_ob1_recv_frag_t *)opal_list_get_first(&pml_proc->frags_cant_match);
frag != (mca_pml_ob1_recv_frag_t *)opal_list_get_end(&pml_proc->frags_cant_match);
frag = (mca_pml_ob1_recv_frag_t *)opal_list_get_next(frag)) {
hdr = &frag->hdr.hdr_match;
/* If the message has the next expected seq from that proc... */
if(hdr->hdr_seq != pml_proc->expected_sequence)
continue;

opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag);
goto add_fragment_to_unexpected;
}
if( NULL != pml_proc->frags_cant_match ) {
frag = check_cantmatch_for_match(pml_proc);
if( NULL != frag ) {
hdr = &frag->hdr.hdr_match;
goto add_fragment_to_unexpected;
}
}
} else {
opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag );
append_frag_to_ordered_list(&pml_proc->frags_cant_match, frag,
pml_proc->expected_sequence);
}
}
return OMPI_SUCCESS;
Expand Down Expand Up @@ -553,6 +550,23 @@ static void mca_pml_ob1_dump_frag_list(opal_list_t* queue, bool is_req)
}
}

/**
 * Dump the header of every fragment reachable from the out-of-sequence queue.
 *
 * Starting at @p queue, the function follows the opal_list_next links until
 * it arrives back at @p queue, dumping each fragment header along the way.
 * When a fragment has a non-NULL range, the fragments chained from that
 * range are dumped the same way (following opal_list_next until the walk
 * returns to the range head) before moving on to the next top-level fragment.
 *
 * @param queue  First fragment of the chain to dump. It is dereferenced
 *               unconditionally, so the caller must not pass NULL.
 */
void mca_pml_ob1_dump_cant_match(mca_pml_ob1_recv_frag_t* queue)
{
    mca_pml_ob1_recv_frag_t *cur = queue;

    for (;;) {
        mca_pml_ob1_dump_hdr(&cur->hdr);

        if (NULL != cur->range) {
            /* Walk the fragments hanging off this entry's range. */
            mca_pml_ob1_recv_frag_t *sub = cur->range;

            for (;;) {
                mca_pml_ob1_dump_hdr(&sub->hdr);
                sub = (mca_pml_ob1_recv_frag_t *) sub->super.super.opal_list_next;
                if (sub == cur->range) {
                    break;  /* back at the range head: inner walk is done */
                }
            }
        }

        cur = (mca_pml_ob1_recv_frag_t *) cur->super.super.opal_list_next;
        if (cur == queue) {
            break;  /* back at the starting fragment: whole chain visited */
        }
    }
}

int mca_pml_ob1_dump(struct ompi_communicator_t* comm, int verbose)
{
struct mca_pml_comm_t* pml_comm = comm->c_pml_comm;
Expand Down Expand Up @@ -588,9 +602,9 @@ int mca_pml_ob1_dump(struct ompi_communicator_t* comm, int verbose)
opal_output(0, "expected specific receives\n");
mca_pml_ob1_dump_frag_list(&proc->specific_receives, true);
}
if( opal_list_get_size(&proc->frags_cant_match) ) {
if( NULL != proc->frags_cant_match ) {
opal_output(0, "out of sequence\n");
mca_pml_ob1_dump_frag_list(&proc->frags_cant_match, false);
mca_pml_ob1_dump_cant_match(proc->frags_cant_match);
}
if( opal_list_get_size(&proc->unexpected_frags) ) {
opal_output(0, "unexpected frag\n");
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/pml/ob1/pml_ob1.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University
* Copyright (c) 2004-2018 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
Expand Down
6 changes: 3 additions & 3 deletions ompi/mca/pml/ob1/pml_ob1_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* Copyright (c) 2004-2018 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
Expand All @@ -29,15 +29,15 @@ static void mca_pml_ob1_comm_proc_construct(mca_pml_ob1_comm_proc_t* proc)
proc->ompi_proc = NULL;
proc->expected_sequence = 1;
proc->send_sequence = 0;
OBJ_CONSTRUCT(&proc->frags_cant_match, opal_list_t);
proc->frags_cant_match = NULL;
OBJ_CONSTRUCT(&proc->specific_receives, opal_list_t);
OBJ_CONSTRUCT(&proc->unexpected_frags, opal_list_t);
}


static void mca_pml_ob1_comm_proc_destruct(mca_pml_ob1_comm_proc_t* proc)
{
OBJ_DESTRUCT(&proc->frags_cant_match);
assert(NULL == proc->frags_cant_match);
OBJ_DESTRUCT(&proc->specific_receives);
OBJ_DESTRUCT(&proc->unexpected_frags);
if (proc->ompi_proc) {
Expand Down
4 changes: 2 additions & 2 deletions ompi/mca/pml/ob1/pml_ob1_comm.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University
* Copyright (c) 2004-2018 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
Expand Down Expand Up @@ -40,7 +40,7 @@ struct mca_pml_ob1_comm_proc_t {
#else
int32_t send_sequence; /**< send side sequence number */
#endif
opal_list_t frags_cant_match; /**< out-of-order fragment queues */
struct mca_pml_ob1_recv_frag_t* frags_cant_match; /**< out-of-order fragment queues */
opal_list_t specific_receives; /**< queues of unmatched specific receives */
opal_list_t unexpected_frags; /**< unexpected fragment queues */
};
Expand Down
Loading