x86/csum: Improve performance of csum_partial
1) Add special case for len == 40 as that is the hottest value (it is
   the length of a bare IPv6 header). This nets a ~8-9% latency
   improvement and a ~30% throughput improvement in the len == 40 case.

2) Use multiple accumulators in the 64-byte loop. This dramatically
   improves ILP and results in up to a 40% latency/throughput
   improvement (better for more iterations); see the C model after
   this list.
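As a rough illustration of point 2, here is an assumption-laden C model
(illustrative only, not the kernel code; the real loop in the diff below
does this with addq/adcq chains plus a third register that collects the
carry-outs of both chains). Splitting each 64-byte block across two
independent accumulators removes the serial carry dependency through all
eight loads:

#include <stdint.h>
#include <string.h>

/* Model of the 64-byte loop: two independent accumulator chains let the
 * CPU overlap the two halves of each block (better ILP) instead of
 * serializing one carry chain through all eight quadwords. */
static uint64_t csum64_model(const unsigned char *p, size_t nblocks,
			     uint64_t sum)
{
	unsigned __int128 acc0 = sum, acc1 = 0;

	for (size_t i = 0; i < nblocks; i++, p += 64) {
		uint64_t q[8];

		memcpy(q, p, sizeof(q));
		acc0 += (unsigned __int128)q[0] + q[1] + q[2] + q[3];
		acc1 += (unsigned __int128)q[4] + q[5] + q[6] + q[7];
	}

	/* Fold both chains and their accumulated carries back into 64
	 * bits with an end-around carry (ones'-complement addition). */
	unsigned __int128 t = acc0 + acc1;
	uint64_t lo = (uint64_t)t, hi = (uint64_t)(t >> 64);
	uint64_t r = lo + hi;

	if (r < lo)
		r++;	/* end-around carry */
	return r;
}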

Results from benchmarking on Icelake. Times measured with rdtsc(); the
r columns are the new/old ratios (lower is better).
 len   lat_new   lat_old      r    tput_new  tput_old      r
   8      3.58      3.47  1.032        3.58      3.51  1.021
  16      4.14      4.02  1.028        3.96      3.78  1.046
  24      4.99      5.03  0.992        4.23      4.03  1.050
  32      5.09      5.08  1.001        4.68      4.47  1.048
  40      5.57      6.08  0.916        3.05      4.43  0.690
  48      6.65      6.63  1.003        4.97      4.69  1.059
  56      7.74      7.72  1.003        5.22      4.95  1.055
  64      6.65      7.22  0.921        6.38      6.42  0.994
  96      9.43      9.96  0.946        7.46      7.54  0.990
 128      9.39     12.15  0.773        8.90      8.79  1.012
 200     12.65     18.08  0.699       11.63     11.60  1.002
 272     15.82     23.37  0.677       14.43     14.35  1.005
 440     24.12     36.43  0.662       21.57     22.69  0.951
 952     46.20     74.01  0.624       42.98     53.12  0.809
1024     47.12     78.24  0.602       46.36     58.83  0.788
1552     72.01    117.30  0.614       71.92     96.78  0.743
2048     93.07    153.25  0.607       93.28    137.20  0.680
2600    114.73    194.30  0.590      114.28    179.32  0.637
3608    156.34    268.41  0.582      154.97    254.02  0.610
4096    175.01    304.03  0.576      175.89    292.08  0.602

There is no such thing as a free lunch, however, and the special case
for len == 40 does add overhead to the len != 40 cases. This seems to
amount to ~5% in throughput and slightly less in latency.

Testing:
Part of this change is a new kunit test. The tests check all
alignment X length pairs in [0, 64) X [0, 512).
There are three cases (sketched in C below this list):
    1) Precomputed random inputs/seed. The expected results were
       generated using the generic implementation (which is assumed to
       be non-buggy).
    2) An input of all 1s. The goal of this test is to catch any case
       where a carry is missing.
    3) An input that never carries. The goal of this test is to catch
       any case of incorrectly carrying.
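For illustration, a minimal sketch of the shape of cases 2 and 3 under
KUnit (hypothetical names; this is not the actual lib/checksum_kunit.c,
whose contents are not rendered on this page):

#include <kunit/test.h>
#include <net/checksum.h>
#include <linux/string.h>

/* Case 2 shape: with every byte 0xff, every 16-bit word is 0xffff and
 * every addition generates a carry, so a single missed carry is
 * visible. The ones'-complement sum of N >= 1 words of 0xffff is
 * 0xffff, and csum_fold() complements it, so the folded result is 0. */
static void csum_all_carry_sketch(struct kunit *test)
{
	u8 buf[512];
	int len;

	memset(buf, 0xff, sizeof(buf));
	for (len = 2; len <= 512; len += 2)
		KUNIT_EXPECT_EQ(test,
				(__force u16)csum_fold(csum_partial(buf, len, 0)),
				0);
}

/* Case 3 shape: an all-zero buffer never generates a carry, so any
 * nonzero result can only come from an incorrectly injected carry. */
static void csum_no_carry_sketch(struct kunit *test)
{
	u8 buf[512] = {};
	int len;

	for (len = 0; len <= 512; len++)
		KUNIT_EXPECT_EQ(test,
				(__force u32)csum_partial(buf, len, 0), 0);
}

static struct kunit_case csum_sketch_cases[] = {
	KUNIT_CASE(csum_all_carry_sketch),
	KUNIT_CASE(csum_no_carry_sketch),
	{}
};

static struct kunit_suite csum_sketch_suite = {
	.name = "checksum-sketch",
	.test_cases = csum_sketch_cases,
};
kunit_test_suite(csum_sketch_suite);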

More exhaustive tests that test all alignment X length pairs in
[0, 8192) X [0, 8192] on random data are also available here:
https://github.com/goldsteinn/csum-reproduction

The repository also has the code for reproducing the above benchmark
numbers.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lore.kernel.org/all/20230511011002.935690-1-goldstein.w.n%40gmail.com
goldsteinn authored and hansendc committed May 25, 2023
1 parent b2ad431 commit 688eb81
Showing 4 changed files with 417 additions and 32 deletions.
97 changes: 65 additions & 32 deletions arch/x86/lib/csum-partial_64.c
@@ -5,22 +5,32 @@
  * This file contains network checksum routines that are better done
  * in an architecture-specific manner due to speed.
  */
 
 #include <linux/compiler.h>
 #include <linux/export.h>
 #include <asm/checksum.h>
 #include <asm/word-at-a-time.h>
 
 static inline unsigned short from32to16(unsigned a)
 {
 	unsigned short b = a >> 16;
 	asm("addw %w2,%w0\n\t"
 	    "adcw $0,%w0\n"
 	    : "=r" (b)
 	    : "0" (b), "r" (a));
 	return b;
 }
 
+static inline __wsum csum_tail(unsigned int result, u64 temp64, int odd)
+{
+	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
+	if (unlikely(odd)) {
+		result = from32to16(result);
+		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+	}
+	return (__force __wsum)result;
+}
+
 /*
  * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
@@ -47,21 +57,52 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 		buff++;
 	}
 
-	while (unlikely(len >= 64)) {
-		asm("addq 0*8(%[src]),%[res]\n\t"
-		    "adcq 1*8(%[src]),%[res]\n\t"
-		    "adcq 2*8(%[src]),%[res]\n\t"
-		    "adcq 3*8(%[src]),%[res]\n\t"
-		    "adcq 4*8(%[src]),%[res]\n\t"
-		    "adcq 5*8(%[src]),%[res]\n\t"
-		    "adcq 6*8(%[src]),%[res]\n\t"
-		    "adcq 7*8(%[src]),%[res]\n\t"
-		    "adcq $0,%[res]"
-		    : [res] "+r" (temp64)
-		    : [src] "r" (buff)
-		    : "memory");
-		buff += 64;
-		len -= 64;
-	}
+	/*
+	 * len == 40 is the hot case due to IPv6 headers, but annotating it likely()
+	 * has a noticeable negative effect on codegen for all other cases with
+	 * minimal performance benefit here.
+	 */
+	if (len == 40) {
+		asm("addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcq 4*8(%[src]),%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[40])buff));
+		return csum_tail(result, temp64, odd);
+	}
+	if (unlikely(len >= 64)) {
+		/*
+		 * Extra accumulators for better ILP in the loop.
+		 */
+		u64 tmp_accum, tmp_carries;
+
+		asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t"
+		    "xorl %k[tmp_carries],%k[tmp_carries]\n\t"
+		    "subl $64, %[len]\n\t"
+		    "1:\n\t"
+		    "addq 0*8(%[src]),%[res]\n\t"
+		    "adcq 1*8(%[src]),%[res]\n\t"
+		    "adcq 2*8(%[src]),%[res]\n\t"
+		    "adcq 3*8(%[src]),%[res]\n\t"
+		    "adcl $0,%k[tmp_carries]\n\t"
+		    "addq 4*8(%[src]),%[tmp_accum]\n\t"
+		    "adcq 5*8(%[src]),%[tmp_accum]\n\t"
+		    "adcq 6*8(%[src]),%[tmp_accum]\n\t"
+		    "adcq 7*8(%[src]),%[tmp_accum]\n\t"
+		    "adcl $0,%k[tmp_carries]\n\t"
+		    "addq $64, %[src]\n\t"
+		    "subl $64, %[len]\n\t"
+		    "jge 1b\n\t"
+		    "addq %[tmp_accum],%[res]\n\t"
+		    "adcq %[tmp_carries],%[res]\n\t"
+		    "adcq $0,%[res]"
+		    : [tmp_accum] "=&r"(tmp_accum),
+		      [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64),
+		      [len] "+r"(len), [src] "+r"(buff)
+		    : "m"(*(const char *)buff));
+	}
 
 	if (len & 32) {
@@ -70,45 +111,37 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 		    "adcq 2*8(%[src]),%[res]\n\t"
 		    "adcq 3*8(%[src]),%[res]\n\t"
 		    "adcq $0,%[res]"
-		    : [res] "+r" (temp64)
-		    : [src] "r" (buff)
-		    : "memory");
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[32])buff));
 		buff += 32;
 	}
 	if (len & 16) {
 		asm("addq 0*8(%[src]),%[res]\n\t"
 		    "adcq 1*8(%[src]),%[res]\n\t"
 		    "adcq $0,%[res]"
-		    : [res] "+r" (temp64)
-		    : [src] "r" (buff)
-		    : "memory");
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[16])buff));
 		buff += 16;
 	}
 	if (len & 8) {
 		asm("addq 0*8(%[src]),%[res]\n\t"
 		    "adcq $0,%[res]"
-		    : [res] "+r" (temp64)
-		    : [src] "r" (buff)
-		    : "memory");
+		    : [res] "+r"(temp64)
+		    : [src] "r"(buff), "m"(*(const char(*)[8])buff));
 		buff += 8;
 	}
 	if (len & 7) {
-		unsigned int shift = (8 - (len & 7)) * 8;
+		unsigned int shift = (-len << 3) & 63;
 		unsigned long trail;
 
 		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
 
 		asm("addq %[trail],%[res]\n\t"
 		    "adcq $0,%[res]"
 		    : [res] "+r"(temp64)
 		    : [trail] "r"(trail));
 	}
-	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
-	if (unlikely(odd)) {
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-	}
-	return (__force __wsum)result;
+	return csum_tail(result, temp64, odd);
 }
 EXPORT_SYMBOL(csum_partial);
 
@@ -118,6 +151,6 @@ EXPORT_SYMBOL(csum_partial);
  */
 __sum16 ip_compute_csum(const void *buff, int len)
 {
-	return csum_fold(csum_partial(buff,len,0));
+	return csum_fold(csum_partial(buff, len, 0));
 }
 EXPORT_SYMBOL(ip_compute_csum);
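One non-obvious change in the hunks above is the trailing-bytes shift:
"(8 - (len & 7)) * 8" became "(-len << 3) & 63". The two agree for every
length that reaches that block (len & 7 != 0); a standalone userspace
check of that identity (not part of the commit; note the unsigned len,
since left-shifting a negative int is undefined in standard C even
though the kernel build defines it):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int len;

	for (len = 1; len < 4096; len++) {
		if ((len & 7) == 0)
			continue;	/* the len & 7 block is skipped */

		unsigned int old_shift = (8 - (len & 7)) * 8;
		unsigned int new_shift = (-len << 3) & 63;

		assert(old_shift == new_shift);
	}
	puts("shift forms agree whenever len & 7 != 0");
	return 0;
}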
17 changes: 17 additions & 0 deletions lib/Kconfig.debug
@@ -2453,6 +2453,23 @@ config BITFIELD_KUNIT
 
 	  If unsure, say N.
 
+config CHECKSUM_KUNIT
+	tristate "KUnit test checksum functions at runtime" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  Enable this option to test the checksum functions at boot.
+
+	  KUnit tests run during boot and output the results to the debug log
+	  in TAP format (http://testanything.org/). Only useful for kernel devs
+	  running the KUnit test harness, and not intended for inclusion into a
+	  production build.
+
+	  For more information on KUnit and unit tests in general please refer
+	  to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+	  If unsure, say N.
+
 config HASH_KUNIT_TEST
 	tristate "KUnit Test for integer hash functions" if !KUNIT_ALL_TESTS
 	depends on KUNIT
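With CONFIG_CHECKSUM_KUNIT=y in a .kunitconfig, the standard KUnit
wrapper from the kernel tree (./tools/testing/kunit/kunit.py run, not
part of this commit) will build and boot a test kernel and report the
new suite's TAP results; the exact invocation depends on your
kunit_tool setup.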
1 change: 1 addition & 0 deletions lib/Makefile
@@ -377,6 +377,7 @@ obj-$(CONFIG_PLDMFW) += pldmfw/
 # KUnit tests
 CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN)
 obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o
+obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o
 obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o
 obj-$(CONFIG_HASHTABLE_KUNIT_TEST) += hashtable_test.o
 obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o
lib/checksum_kunit.c (new file; contents not rendered on this page)
