@@ -142,7 +142,7 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv,
142142 }
143143 }
144144 }
145- * out_size = iov_idx ; /* we only reach this line after the for loop succesfully complete */
145+ * out_size = iov_idx ; /* we only reach this line after the for loop successfully complete */
146146 * max_data = pConv -> bConverted - initial_bytes_converted ;
147147 if ( pConv -> bConverted == pConv -> local_size ) pConv -> flags |= CONVERTOR_COMPLETED ;
148148 return !!(pConv -> flags & CONVERTOR_COMPLETED ); /* done or not */
@@ -161,60 +161,70 @@ opal_unpack_homogeneous_contig_function( opal_convertor_t* pConv,
161161 * of the exponent or mantissa).
162162 */
163163static inline void
164- opal_unpack_partial_datatype ( opal_convertor_t * pConvertor , dt_elem_desc_t * pElem ,
165- unsigned char * partial_data ,
166- ptrdiff_t start_position , size_t length ,
167- unsigned char * * user_buffer )
164+ opal_unpack_partial_predefined (opal_convertor_t * pConvertor , const dt_elem_desc_t * pElem ,
165+ size_t * COUNT , unsigned char * * packed ,
166+ unsigned char * * memory , size_t * SPACE )
168167{
169168 char unused_byte = 0x7F , saved_data [16 ];
170169 unsigned char temporary [16 ], * temporary_buffer = temporary ;
171- unsigned char * user_data = * user_buffer + pElem -> elem .disp ;
172- size_t count_desc = 1 ;
170+ unsigned char * user_data = * memory + pElem -> elem .disp ;
173171 size_t data_length = opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size ;
172+ unsigned char * partial_data = * packed ;
173+ ptrdiff_t start_position = pConvertor -> partial_length ;
174+ size_t length = data_length - start_position ;
175+ size_t count_desc = 1 ;
176+ dt_elem_desc_t single_elem = { .elem = { .common = pElem -> elem .common , .count = 1 , .blocklen = 1 ,
177+ .extent = data_length , /* advance by a full data element */
178+ .disp = 0 /* right where the pointer is */ } };
179+ if ( * SPACE < length ) {
180+ length = * SPACE ;
181+ }
174182
175183 DO_DEBUG ( opal_output ( 0 , "unpack partial data start %lu end %lu data_length %lu user %p\n"
176184 "\tbConverted %lu total_length %lu count %ld\n" ,
177- (unsigned long )start_position , (unsigned long )start_position + length , (unsigned long )data_length , (void * )* user_buffer ,
178- (unsigned long )pConvertor -> bConverted , (unsigned long )pConvertor -> local_size , pConvertor -> count ); );
179-
180- /* Find a byte that is not used in the partial buffer */
185+ (unsigned long )start_position , (unsigned long )start_position + length ,
186+ (unsigned long )data_length , (void * )* memory ,
187+ (unsigned long )pConvertor -> bConverted ,
188+ (unsigned long )pConvertor -> local_size , pConvertor -> count ); );
189+ COMPUTE_CSUM ( partial_data , length , pConvertor );
190+
191+ /* Find a byte value that is not used in the partial buffer. We use it as a marker
192+ * to identify what has not been modified by the unpack call. */
181193 find_unused_byte :
182- for (size_t i = 0 ; i < length ; i ++ ) {
194+ for (size_t i = 0 ; i < length ; i ++ ) {
183195 if ( unused_byte == partial_data [i ] ) {
184196 unused_byte -- ;
185197 goto find_unused_byte ;
186198 }
187199 }
188200
189- /* Copy and fill the rest of the buffer with the unused byte */
201+ /* Prepare an full element of the predefined type, by populating an entire type
202+ * with the unused byte and then put the partial data at the right position. */
190203 memset ( temporary , unused_byte , data_length );
191204 MEMCPY ( temporary + start_position , partial_data , length );
192205
206+ /* Save the original content of the user memory */
193207#if OPAL_CUDA_SUPPORT
194208 /* In the case where the data is being unpacked from device memory, need to
195- * use the special host to device memory copy. Note this code path was only
196- * seen on large receives of noncontiguous data via buffered sends. */
209+ * use the special host to device memory copy. */
197210 pConvertor -> cbmemcpy (saved_data , user_data , data_length , pConvertor );
198211#else
199- /* Save the content of the user memory */
200212 MEMCPY ( saved_data , user_data , data_length );
201213#endif
202214
203215 /* Then unpack the data into the user memory */
204- UNPACK_PREDEFINED_DATATYPE ( pConvertor , pElem , count_desc ,
205- temporary_buffer , * user_buffer , data_length );
216+ UNPACK_PREDEFINED_DATATYPE (pConvertor , & single_elem , count_desc , temporary_buffer , user_data ,
217+ data_length );
206218
207- /* reload the length as it is reset by the macro */
219+ /* reload the length and user buffer as they have been updated by the macro */
208220 data_length = opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size ;
221+ user_data = * memory + pElem -> elem .disp ;
209222
210- /* For every occurence of the unused byte move data from the saved
211- * buffer back into the user memory.
212- */
223+ /* Rebuild the data by pulling back the unmodified bytes from the original
224+ * content in the user memory. */
213225#if OPAL_CUDA_SUPPORT
214226 /* Need to copy the modified user_data again so we can see which
215- * bytes need to be converted back to their original values. Note
216- * this code path was only seen on large receives of noncontiguous
217- * data via buffered sends. */
227+ * bytes need to be converted back to their original values. */
218228 {
219229 char resaved_data [16 ];
220230 pConvertor -> cbmemcpy (resaved_data , user_data , data_length , pConvertor );
@@ -229,6 +239,16 @@ opal_unpack_partial_datatype( opal_convertor_t* pConvertor, dt_elem_desc_t* pEle
229239 user_data [i ] = saved_data [i ];
230240 }
231241#endif
242+ pConvertor -> partial_length = (pConvertor -> partial_length + length ) % data_length ;
243+ * SPACE -= length ;
244+ * packed += length ;
245+ if (0 == pConvertor -> partial_length ) {
246+ (* COUNT )-- ; /* we have enough to complete one full predefined type */
247+ * memory += data_length ;
248+ if (0 == (* COUNT % pElem -> elem .blocklen )) {
249+ * memory += pElem -> elem .extent - (pElem -> elem .blocklen * data_length );
250+ }
251+ }
232252}
233253
234254/* The pack/unpack functions need a cleanup. I have to create a proper interface to access
@@ -257,8 +277,8 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
257277 size_t iov_len_local ;
258278 uint32_t iov_count ;
259279
260- DO_DEBUG ( opal_output ( 0 , "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n" ,
261- (void * )pConvertor , (void * )iov [0 ].iov_base , (unsigned long )iov [0 ].iov_len , * out_size ); );
280+ DO_DEBUG ( opal_output ( 0 , "opal_convertor_generic_simple_unpack( %p, iov[%u] = {%p, %lu} )\n" ,
281+ (void * )pConvertor , * out_size , (void * )iov [0 ].iov_base , (unsigned long )iov [0 ].iov_len ); );
262282
263283 description = pConvertor -> use_desc -> desc ;
264284
@@ -283,28 +303,26 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
283303 iov_ptr = (unsigned char * ) iov [iov_count ].iov_base ;
284304 iov_len_local = iov [iov_count ].iov_len ;
285305
286- if ( 0 != pConvertor -> partial_length ) {
287- size_t element_length = opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size ;
288- size_t missing_length = element_length - pConvertor -> partial_length ;
289-
290- assert ( pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA );
291- COMPUTE_CSUM ( iov_ptr , missing_length , pConvertor );
292- opal_unpack_partial_datatype ( pConvertor , pElem ,
293- iov_ptr ,
294- pConvertor -> partial_length , (size_t )(element_length - pConvertor -> partial_length ),
295- & conv_ptr );
296- -- count_desc ;
297- if ( 0 == count_desc ) {
298- conv_ptr = pConvertor -> pBaseBuf + pStack -> disp ;
299- pos_desc ++ ; /* advance to the next data */
300- UPDATE_INTERNAL_COUNTERS ( description , pos_desc , pElem , count_desc );
306+ /* Deal with all types of partial predefined datatype unpacking, including when
307+ * unpacking a partial predefined element and when unpacking a part smaller than
308+ * the blocklen.
309+ */
310+ if (pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA ) {
311+ if (0 != pConvertor -> partial_length ) { /* partial predefined element */
312+ assert ( pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA );
313+ opal_unpack_partial_predefined ( pConvertor , pElem , & count_desc ,
314+ & iov_ptr , & conv_ptr , & iov_len_local );
315+ if (0 == count_desc ) { /* the end of the vector ? */
316+ assert ( 0 == pConvertor -> partial_length );
317+ conv_ptr = pConvertor -> pBaseBuf + pStack -> disp ;
318+ pos_desc ++ ; /* advance to the next data */
319+ UPDATE_INTERNAL_COUNTERS (description , pos_desc , pElem , count_desc );
320+ goto next_vector ;
321+ }
322+ if ( 0 == iov_len_local )
323+ goto complete_loop ;
301324 }
302- iov_ptr += missing_length ;
303- iov_len_local -= missing_length ;
304- pConvertor -> partial_length = 0 ; /* nothing more inside */
305- }
306- if ( pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA ) {
307- if ( ((size_t )pElem -> elem .count * pElem -> elem .blocklen ) != count_desc ) {
325+ if (((size_t ) pElem -> elem .count * pElem -> elem .blocklen ) != count_desc ) {
308326 /* we have a partial (less than blocklen) basic datatype */
309327 int rc = UNPACK_PARTIAL_BLOCKLEN ( pConvertor , pElem , count_desc ,
310328 iov_ptr , conv_ptr , iov_len_local );
@@ -318,8 +336,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
318336 }
319337 }
320338
321- while ( 1 ) {
322- while ( pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA ) {
339+ while (1 ) {
340+ next_vector :
341+ while (pElem -> elem .common .flags & OPAL_DATATYPE_FLAG_DATA ) {
323342 /* we have a basic datatype (working on full blocks) */
324343 UNPACK_PREDEFINED_DATATYPE ( pConvertor , pElem , count_desc ,
325344 iov_ptr , conv_ptr , iov_len_local );
@@ -386,14 +405,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor,
386405 * and keep it hot until the next round.
387406 */
388407 assert ( iov_len_local < opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size );
389- COMPUTE_CSUM ( iov_ptr , iov_len_local , pConvertor );
390-
391- opal_unpack_partial_datatype ( pConvertor , pElem ,
392- iov_ptr , 0 , iov_len_local ,
393- & temp );
394-
395- pConvertor -> partial_length = iov_len_local ;
396- iov_len_local = 0 ;
408+ opal_unpack_partial_predefined (pConvertor , pElem , & count_desc , & iov_ptr , & temp , & iov_len_local );
397409 }
398410
399411 iov [iov_count ].iov_len -= iov_len_local ; /* update the amount of valid data */
@@ -520,11 +532,6 @@ int32_t opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec
520532 unsigned char * conv_ptr , * iov_ptr ;
521533 uint32_t iov_count ;
522534 size_t iov_len_local ;
523- #if 0
524- const opal_convertor_master_t * master = pConvertor -> master ;
525- ptrdiff_t advance ; /* number of bytes that we should advance the buffer */
526- #endif
527- size_t rc ;
528535
529536 DO_DEBUG ( opal_output ( 0 , "opal_convertor_general_unpack( %p, {%p, %lu}, %d )\n" ,
530537 (void * )pConvertor , (void * )iov [0 ].iov_base , (unsigned long )iov [0 ].iov_len , * out_size ); );
@@ -595,14 +602,9 @@ int32_t opal_unpack_general_function(opal_convertor_t *pConvertor, struct iovec
595602 * and keep it hot until the next round.
596603 */
597604 assert (iov_len_local < opal_datatype_basicDatatypes [pElem -> elem .common .type ]-> size );
598- COMPUTE_CSUM (iov_ptr , iov_len_local , pConvertor );
599-
600- opal_unpack_partial_datatype ( pConvertor , pElem ,
601- iov_ptr , 0 , iov_len_local ,
602- & temp );
603-
604- pConvertor -> partial_length = iov_len_local ;
605- iov_len_local = 0 ;
605+ opal_unpack_partial_predefined (pConvertor , pElem , & count_desc , & iov_ptr ,
606+ & temp , & iov_len_local );
607+ assert ( 0 == iov_len_local );
606608 }
607609 goto complete_loop ;
608610 }
0 commit comments