Commit 4e9d64b

pp_reverse - chunk-at-a-time string reversal
The performance characteristics of string reversal in blead are very variable, depending upon the capabilities of the C compiler. Some compilers are able to vectorize some cases for better performance.

This commit introduces explicit reversal and swapping of whole registers at a time, which all builds seem to be able to benefit from. The `_swab_xx_` macros for doing this already exist in perl.h; using them for this purpose was inspired by https://dev.to/wunk/fast-array-reversal-with-simd-j3p

The bit shifting done by these macros should be portable and reasonably performant even if not optimised further, but it is likely that they will be optimised into bswap, rev, or movbe instructions.

Some performance comparisons:

1. Large string reversal, with different source & destination buffers

    my $x = "X"x(1024*1000*10);
    my $y;
    for (0..1_000) { $y = reverse $x }

  gcc blead:
    2,388.30 msec task-clock          # 0.993 CPUs utilized
    10,574,195,388 cycles             # 4.427 GHz
    61,520,672,268 instructions       # 5.82 insn per cycle
    10,255,049,869 branches           # 4.294 G/sec

  clang blead:
    688.37 msec task-clock            # 0.946 CPUs utilized
    3,161,754,439 cycles              # 4.593 GHz
    8,986,420,860 instructions        # 2.84 insn per cycle
    324,734,391 branches              # 471.745 M/sec

  gcc patched:
    408.39 msec task-clock            # 0.936 CPUs utilized
    1,617,273,653 cycles              # 3.960 GHz
    6,422,991,675 instructions        # 3.97 insn per cycle
    644,856,283 branches              # 1.579 G/sec

  clang patched:
    397.61 msec task-clock            # 0.924 CPUs utilized
    1,655,838,316 cycles              # 4.165 GHz
    5,782,487,237 instructions        # 3.49 insn per cycle
    324,586,437 branches              # 816.350 M/sec

2. Large string reversal, but reversing the buffer in-place

    my $x = "X"x(1024*1000*10);
    my $y;
    for (0..1_000) { $y = reverse "foo",$x }

  gcc blead:
    6,038.06 msec task-clock          # 0.996 CPUs utilized
    27,109,273,840 cycles             # 4.490 GHz
    41,987,097,139 instructions       # 1.55 insn per cycle
    5,211,350,347 branches            # 863.083 M/sec

  clang blead:
    5,815.86 msec task-clock          # 0.995 CPUs utilized
    26,962,768,616 cycles             # 4.636 GHz
    47,111,208,664 instructions       # 1.75 insn per cycle
    5,211,117,921 branches            # 896.018 M/sec

  gcc patched:
    1,003.49 msec task-clock          # 0.999 CPUs utilized
    4,298,242,624 cycles              # 4.283 GHz
    7,387,822,303 instructions        # 1.72 insn per cycle
    725,892,855 branches              # 723.367 M/sec

  clang patched:
    970.78 msec task-clock            # 0.973 CPUs utilized
    4,436,489,695 cycles              # 4.570 GHz
    8,028,374,567 instructions        # 1.81 insn per cycle
    725,867,979 branches              # 747.713 M/sec

3. Short string reversal, different source & destination (checking
   performance on smaller string reversals; note: this case is very
   variable due to noise)

    my $x = "1234567";
    my $y;
    for (0..10_000_000) { $y = reverse $x }

  gcc blead:
    401.20 msec task-clock            # 0.916 CPUs utilized
    1,672,263,966 cycles              # 4.168 GHz
    5,564,078,603 instructions        # 3.33 insn per cycle
    1,250,983,219 branches            # 3.118 G/sec

  clang blead:
    380.58 msec task-clock            # 0.998 CPUs utilized
    1,615,634,265 cycles              # 4.245 GHz
    5,583,854,366 instructions        # 3.46 insn per cycle
    1,300,935,443 branches            # 3.418 G/sec

  gcc patched:
    381.62 msec task-clock            # 0.999 CPUs utilized
    1,566,807,988 cycles              # 4.106 GHz
    5,474,069,670 instructions        # 3.49 insn per cycle
    1,240,983,221 branches            # 3.252 G/sec

  clang patched:
    346.21 msec task-clock            # 0.999 CPUs utilized
    1,600,780,787 cycles              # 4.624 GHz
    5,493,773,623 instructions        # 3.43 insn per cycle
    1,270,915,076 branches            # 3.671 G/sec
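To illustrate the technique outside the perl core, here is a minimal, self-contained C sketch of the copying (different source & destination) case. The names bswap64() and reverse_copy() are hypothetical, invented for this example; bswap64() imitates the portable shift-and-mask pattern that perl.h's _swab_64_ macro uses, and the loop mirrors the HAS_QUAD path of the patch (minus the intermediate 4- and 2-byte steps):

    /* Sketch only: bswap64() stands in for perl.h's _swab_64_ macro. */
    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    static uint64_t bswap64(uint64_t x) {
        /* Portable byte swap; compilers typically collapse this
         * pattern into a single bswap/rev/movbe instruction. */
        x = ((x & 0x00FF00FF00FF00FFULL) << 8) | ((x >> 8) & 0x00FF00FF00FF00FFULL);
        x = ((x & 0x0000FFFF0000FFFFULL) << 16) | ((x >> 16) & 0x0000FFFF0000FFFFULL);
        return (x << 32) | (x >> 32);
    }

    /* Reverse len bytes from src into dst (distinct buffers). */
    static void reverse_copy(char *dst, const char *src, size_t len) {
        size_t i = 0, j = len;
        uint64_t a, b;
        /* Take 8 bytes from each end, byte-swap both chunks, and
         * store each at the opposite end. Loop only while a full 16
         * bytes remain, so the two windows stay disjoint and the
         * unsigned indices never cross. */
        while (j - i >= 16) {
            memcpy(&a, src + j - 8, 8);
            memcpy(&b, src + i, 8);
            a = bswap64(a);
            b = bswap64(b);
            memcpy(dst + i, &a, 8);
            memcpy(dst + j - 8, &b, 8);
            i += 8;
            j -= 8;
        }
        /* Under 16 bytes left: swap byte by byte. The real patch
         * first tries 4- and 2-byte chunks here. */
        while (i < j) {
            dst[i] = src[j - 1];
            dst[j - 1] = src[i];
            i++;
            j--;
        }
    }

    int main(void) {
        const char src[] = "The quick brown fox jumps over the lazy dog";
        char dst[sizeof src];
        reverse_copy(dst, src, sizeof src - 1);
        dst[sizeof src - 1] = '\0';
        puts(dst); /* god yzal eht revo spmuj xof nworb kciuq ehT */
        return 0;
    }

Looping only while at least 16 bytes remain means the two 8-byte windows never overlap and the unsigned indices never cross; whatever remains falls through to the narrower chunk steps or the final byte loop.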
1 parent 407267b commit 4e9d64b

File tree

1 file changed (+133, -18 lines)
pp.c

Lines changed: 133 additions & 18 deletions
@@ -6464,7 +6464,6 @@ PP(pp_unshift)
     return NORMAL;
 }
 
-
 PP_wrapped(pp_reverse, 0, 1)
 {
     dSP; dMARK;
@@ -6490,15 +6489,17 @@ PP_wrapped(pp_reverse, 0, 1)
             SV *begin, *end;
 
             if (can_preserve) {
-                if (!av_exists(av, i)) {
-                    if (av_exists(av, j)) {
+                bool exists_i = av_exists(av, i);
+                bool exists_j = av_exists(av, j);
+                if (!exists_i) {
+                    if (exists_j) {
                         SV *sv = av_delete(av, j, 0);
                         begin = *av_fetch(av, i, TRUE);
                         sv_setsv_mg(begin, sv);
                     }
                     continue;
                 }
-                else if (!av_exists(av, j)) {
+                else if (!exists_j) {
                     SV *sv = av_delete(av, i, 0);
                     end = *av_fetch(av, j, TRUE);
                     sv_setsv_mg(end, sv);
@@ -6579,18 +6580,20 @@ PP_wrapped(pp_reverse, 0, 1)
          * in a single pass, rather than 2-3 passes. */
 
         const char * src = SvPV_const(src_sv, len);
+        char* tb;
 
         /* Prepare the TARG. */
+        SV_CHECK_THINKFIRST_COW_DROP(TARG); /* Drops any buffer */
         if (SvTYPE(TARG) < SVt_PV) {
             SvUPGRADE(TARG, SvTYPE(src_sv)); /* No buffer allocation here */
-        } else if(SvTHINKFIRST(TARG)) {
-            SV_CHECK_THINKFIRST_COW_DROP(TARG); /* Drops any buffer */
+        } else {
+            SvSETMAGIC(TARG);
         }
-        SvSETMAGIC(TARG);
-        SvGROW(TARG, len + 1);
+
+        tb = SvGROW(TARG, len + 1);
         SvCUR_set(TARG, len);
         SvPOK_only(TARG);
-        *SvEND(TARG) = '\0';
+        tb[len] = '\0';
         if (SvTAINTED(src_sv))
             SvTAINT(TARG);
 
@@ -6614,10 +6617,69 @@ PP_wrapped(pp_reverse, 0, 1)
                 }
             }
         } else {
+            STRLEN i = 0;
+            STRLEN j = len;
+            uint32_t u32_1, u32_2;
+            uint16_t u16_1, u16_2;
             char * outp= SvPVX(TARG);
-            const char *p = src + len;
-            while (p != src)
-                *outp++ = *--p;
+            /* Take a chunk of bytes from the front and from the
+             * back, reverse the bytes in each and swap the
+             * chunks over. This should have generally good
+             * performance but also is likely to be optimised
+             * into bswap instructions by the compiler.
+             */
+#ifdef HAS_QUAD
+            uint64_t u64_1, u64_2;
+            while (j - i >= 16) {
+                memcpy(&u64_1, src + j - 8, 8);
+                memcpy(&u64_2, src + i, 8);
+                u64_1 = _swab_64_(u64_1);
+                u64_2 = _swab_64_(u64_2);
+                memcpy(outp + j - 8, &u64_2, 8);
+                memcpy(outp + i, &u64_1, 8);
+                i += 8;
+                j -= 8;
+            }
+
+            if (j - i >= 8) {
+                memcpy(&u32_1, src + j - 4, 4);
+                memcpy(&u32_2, src + i, 4);
+                u32_1 = _swab_32_(u32_1);
+                u32_2 = _swab_32_(u32_2);
+                memcpy(outp + j - 4, &u32_2, 4);
+                memcpy(outp + i, &u32_1, 4);
+                i += 4;
+                j -= 4;
+            }
+#else
+            while (j - i >= 8) {
+                memcpy(&u32_1, src + j - 4, 4);
+                memcpy(&u32_2, src + i, 4);
+                u32_1 = _swab_32_(u32_1);
+                u32_2 = _swab_32_(u32_2);
+                memcpy(outp + j - 4, &u32_2, 4);
+                memcpy(outp + i, &u32_1, 4);
+                i += 4;
+                j -= 4;
+            }
+#endif
+            if (j - i >= 4) {
+                memcpy(&u16_1, src + j - 2, 2);
+                memcpy(&u16_2, src + i, 2);
+                u16_1 = _swab_16_(u16_1);
+                u16_2 = _swab_16_(u16_2);
+                memcpy(outp + j - 2, &u16_2, 2);
+                memcpy(outp + i, &u16_1, 2);
+                i += 2;
+                j -= 2;
+            }
+
+            /* Swap any remaining bytes one by one. */
+            while (i < j) {
+                outp[i] = src[j - 1];
+                outp[j - 1] = src[i];
+                i++; j--;
+            }
         }
         RETURN;
     }
@@ -6630,8 +6692,8 @@ PP_wrapped(pp_reverse, 0, 1)
 
     if (len > 1) {
         /* The traditional way, operate on the current byte buffer */
-        char *down;
         if (DO_UTF8(TARG)) { /* first reverse each character */
+            char *down;
             U8* s = (U8*)SvPVX(TARG);
             const U8* send = (U8*)(s + len);
             while (s < send) {
@@ -6656,11 +6718,64 @@ PP_wrapped(pp_reverse, 0, 1)
             }
             up = SvPVX(TARG);
         }
-        down = SvPVX(TARG) + len - 1;
-        while (down > up) {
-            const char tmp = *up;
-            *up++ = *down;
-            *down-- = tmp;
+        STRLEN i = 0;
+        STRLEN j = len;
+        uint32_t u32_1, u32_2;
+        uint16_t u16_1, u16_2;
+        /* Reverse the buffer in place, in chunks where possible */
+#ifdef HAS_QUAD
+        uint64_t u64_1, u64_2;
+        while (j - i >= 16) {
+            memcpy(&u64_1, up + j - 8, 8);
+            memcpy(&u64_2, up + i, 8);
+            u64_1 = _swab_64_(u64_1);
+            u64_2 = _swab_64_(u64_2);
+            memcpy(up + j - 8, &u64_2, 8);
+            memcpy(up + i, &u64_1, 8);
+            i += 8;
+            j -= 8;
+        }
+
+        if (j - i >= 8) {
+            memcpy(&u32_1, up + j - 4, 4);
+            memcpy(&u32_2, up + i, 4);
+            u32_1 = _swab_32_(u32_1);
+            u32_2 = _swab_32_(u32_2);
+            memcpy(up + j - 4, &u32_2, 4);
+            memcpy(up + i, &u32_1, 4);
+            i += 4;
+            j -= 4;
+        }
+#else
+        while (j - i >= 8) {
+            memcpy(&u32_1, up + j - 4, 4);
+            memcpy(&u32_2, up + i, 4);
+            u32_1 = _swab_32_(u32_1);
+            u32_2 = _swab_32_(u32_2);
+            memcpy(up + j - 4, &u32_2, 4);
+            memcpy(up + i, &u32_1, 4);
+            i += 4;
+            j -= 4;
+        }
+#endif
+        if (j - i >= 4) {
+            memcpy(&u16_1, up + j - 2, 2);
+            memcpy(&u16_2, up + i, 2);
+            u16_1 = _swab_16_(u16_1);
+            u16_2 = _swab_16_(u16_2);
+            memcpy(up + j - 2, &u16_2, 2);
+            memcpy(up + i, &u16_1, 2);
+            i += 2;
+            j -= 2;
+        }
+
+        /* Finally, swap any remaining bytes one-by-one. */
+        while (i < j) {
+            unsigned char tmp = up[i];
+            up[i] = up[j - 1];
+            up[j - 1] = tmp;
+            i++;
+            j--;
         }
     }
     (void)SvPOK_only_UTF8(TARG);
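
For completeness, here is a similar hypothetical sketch of the in-place cascade used in the hunk above, mirroring the non-HAS_QUAD branch of the patch (4-byte chunks, one 2-byte step, then single-byte swaps). bswap32() and bswap16() are stand-ins for perl.h's _swab_32_ and _swab_16_ macros, not real perl API:

    /* Sketch only: in-place reversal via progressively smaller chunks. */
    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    static uint32_t bswap32(uint32_t x) {
        x = ((x & 0x00FF00FFu) << 8) | ((x >> 8) & 0x00FF00FFu);
        return (x << 16) | (x >> 16);
    }
    static uint16_t bswap16(uint16_t x) {
        return (uint16_t)((x << 8) | (x >> 8));
    }

    static void reverse_inplace(char *buf, size_t len) {
        size_t i = 0, j = len;
        uint32_t a32, b32;
        uint16_t a16, b16;
        /* 4-byte chunks while at least 8 bytes remain, so the windows
         * at buf+i and buf+j-4 are disjoint and i and j never cross. */
        while (j - i >= 8) {
            memcpy(&a32, buf + j - 4, 4);
            memcpy(&b32, buf + i, 4);
            a32 = bswap32(a32);
            b32 = bswap32(b32);
            memcpy(buf + j - 4, &b32, 4);
            memcpy(buf + i, &a32, 4);
            i += 4;
            j -= 4;
        }
        /* One 2-byte step when 4..7 bytes remain. */
        if (j - i >= 4) {
            memcpy(&a16, buf + j - 2, 2);
            memcpy(&b16, buf + i, 2);
            a16 = bswap16(a16);
            b16 = bswap16(b16);
            memcpy(buf + j - 2, &b16, 2);
            memcpy(buf + i, &a16, 2);
            i += 2;
            j -= 2;
        }
        /* At most 3 bytes left: plain swaps. */
        while (i < j) {
            char tmp = buf[i];
            buf[i] = buf[j - 1];
            buf[j - 1] = tmp;
            i++;
            j--;
        }
    }

    int main(void) {
        char s[] = "abcdefghijk";
        reverse_inplace(s, sizeof s - 1);
        puts(s); /* kjihgfedcba */
        return 0;
    }

The chunk sizes step down in powers of two so that each stage leaves a remainder small enough for the next, and the final byte loop handles whatever is left.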
