Commit c517ffb

Author: Hamlin Li (committed)
8339910: RISC-V: crc32 intrinsic with carry-less multiplication
Reviewed-by: rehn, luhenry
Parent: e0d6398
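Background note (not part of the commit): Zvbc's vclmul.vv and vclmulh.vv return, per 64-bit element, the low and the high half of the 128-bit carry-less product, and the folding code below is built on that pair. A minimal scalar C++ sketch of the two operations, with hypothetical helper names:

  #include <cstdint>

  // Carry-less (GF(2)) multiply of two 64-bit values.
  // clmul64 models vclmul.vv: the low 64 bits of the 128-bit product.
  static inline uint64_t clmul64(uint64_t a, uint64_t b) {
    uint64_t r = 0;
    for (int i = 0; i < 64; i++) {
      if ((b >> i) & 1) r ^= a << i;
    }
    return r;
  }

  // clmulh64 models vclmulh.vv: the high 64 bits of the 128-bit product.
  static inline uint64_t clmulh64(uint64_t a, uint64_t b) {
    uint64_t r = 0;
    for (int i = 1; i < 64; i++) {
      if ((b >> i) & 1) r ^= a >> (64 - i);
    }
    return r;
  }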

File tree: 7 files changed (+407, -3 lines)


src/hotspot/cpu/riscv/globals_riscv.hpp

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@ define_pd_global(intx, InlineSmallCode, 1000);
       "Use Zihintpause instructions")                                        \
   product(bool, UseZtso, false, EXPERIMENTAL, "Assume Ztso memory model")    \
   product(bool, UseZvbb, false, EXPERIMENTAL, "Use Zvbb instructions")       \
+  product(bool, UseZvbc, false, EXPERIMENTAL, "Use Zvbc instructions")       \
   product(bool, UseZvfh, false, DIAGNOSTIC, "Use Zvfh instructions")         \
   product(bool, UseZvkn, false, EXPERIMENTAL,                                \
       "Use Zvkn group extension, Zvkned, Zvknhb, Zvkb, Zvkt")                \

src/hotspot/cpu/riscv/macroAssembler_riscv.cpp

Lines changed: 363 additions & 2 deletions
@@ -1712,6 +1712,359 @@ void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register le
     addi(buf, buf, N*4);
   }
 }
+
+void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
+        VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
+        Register buf, Register tmp, const int STEP) {
+  assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
+  vclmul_vv(vtmp1, vx, vt);
+  vclmulh_vv(vtmp2, vx, vt);
+  vle64_v(vtmp4, buf); addi(buf, buf, STEP);
+  // low parts
+  vredxor_vs(vtmp3, vtmp1, vtmp4);
+  // high parts
+  vslidedown_vi(vx, vtmp4, 1);
+  vredxor_vs(vtmp1, vtmp2, vx);
+  // merge low and high back
+  vslideup_vi(vx, vtmp1, 1);
+  vmv_x_s(tmp, vtmp3);
+  vmv_s_x(vx, tmp);
+}
+
+void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
+        VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
+        Register tmp) {
+  assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
+  vclmul_vv(vtmp1, vx, vt);
+  vclmulh_vv(vtmp2, vx, vt);
+  // low parts
+  vredxor_vs(vtmp3, vtmp1, vy);
+  // high parts
+  vslidedown_vi(vtmp4, vy, 1);
+  vredxor_vs(vtmp1, vtmp2, vtmp4);
+  // merge low and high back
+  vslideup_vi(vx, vtmp1, 1);
+  vmv_x_s(tmp, vtmp3);
+  vmv_s_x(vx, tmp);
+}
+
+void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
+        VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
+        Register tmp) {
+  assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
+  vclmul_vv(vtmp1, vx, vt);
+  vclmulh_vv(vtmp2, vx, vt);
+  // low parts
+  vredxor_vs(vtmp3, vtmp1, vy);
+  // high parts
+  vslidedown_vi(vtmp4, vy, 1);
+  vredxor_vs(vtmp1, vtmp2, vtmp4);
+  // merge low and high back
+  vslideup_vi(vy, vtmp1, 1);
+  vmv_x_s(tmp, vtmp3);
+  vmv_s_x(vy, tmp);
+}
+
+void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
+        Register vclmul_table, Register tmp1, Register tmp2) {
+  assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
+  assert(MaxVectorSize == 16, "sanity");
+
+  const int TABLE_STEP = 16;
+  const int STEP = 16;
+  const int LOOP_STEP = 128;
+  const int N = 2;
+
+  Register loop_step = t1;
+
+  // ======== preparation ========
+
+  mv(loop_step, LOOP_STEP);
+  sub(len, len, loop_step);
+
+  vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
+  vle64_v(v0, buf); addi(buf, buf, STEP);
+  vle64_v(v1, buf); addi(buf, buf, STEP);
+  vle64_v(v2, buf); addi(buf, buf, STEP);
+  vle64_v(v3, buf); addi(buf, buf, STEP);
+  vle64_v(v4, buf); addi(buf, buf, STEP);
+  vle64_v(v5, buf); addi(buf, buf, STEP);
+  vle64_v(v6, buf); addi(buf, buf, STEP);
+  vle64_v(v7, buf); addi(buf, buf, STEP);
+
+  vmv_v_x(v31, zr);
+  vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
+  vmv_s_x(v31, crc);
+  vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
+  vxor_vv(v0, v0, v31);
+
+  // load table
+  vle64_v(v31, vclmul_table);
+
+  Label L_16_bytes_loop;
+  j(L_16_bytes_loop);
+
+  // ======== folding 128 bytes in data buffer per round ========
+
+  align(OptoLoopAlignment);
+  bind(L_16_bytes_loop);
+  {
+    crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
+    crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
+    crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
+    crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
+    crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
+    crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
+    crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
+    crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
+  }
+  sub(len, len, loop_step);
+  bge(len, loop_step, L_16_bytes_loop);
+
+  // ======== folding into 64 bytes from 128 bytes in register ========
+
+  // load table
+  addi(vclmul_table, vclmul_table, TABLE_STEP);
+  vle64_v(v31, vclmul_table);
+
+  crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
+  crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
+  crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
+  crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
+
+  // ======== folding into 16 bytes from 64 bytes in register ========
+
+  addi(vclmul_table, vclmul_table, TABLE_STEP);
+  vle64_v(v31, vclmul_table);
+  crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
+
+  addi(vclmul_table, vclmul_table, TABLE_STEP);
+  vle64_v(v31, vclmul_table);
+  crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
+
+  addi(vclmul_table, vclmul_table, TABLE_STEP);
+  vle64_v(v31, vclmul_table);
+  crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
+
+#undef FOLD_2_VCLMUL_3
+
+  // ======== final: move result to scalar registers ========
+
+  vmv_x_s(tmp1, v3);
+  vslidedown_vi(v1, v3, 1);
+  vmv_x_s(tmp2, v1);
+}
+
+void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
+        VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
+  assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
+  vclmul_vv(vtmp1, vx, vt);
+  vclmulh_vv(vtmp2, vx, vt);
+  // low parts
+  vredxor_vs(vtmp3, vtmp1, vy);
+  // high parts
+  vslidedown_vi(vtmp4, vy, 1);
+  vredxor_vs(vtmp1, vtmp2, vtmp4);
+  // merge low and high back
+  vslideup_vi(vy, vtmp1, 1);
+  vmv_x_s(t1, vtmp3);
+  vmv_s_x(vy, t1);
+}
+
+void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
+        Register vclmul_table, Register tmp1, Register tmp2) {
+  assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
+  assert(MaxVectorSize >= 32, "sanity");
+
+  // utility: load table
+#define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
+  vid_v(vtmp);                                      \
+  mv(rtmp, 2);                                      \
+  vremu_vx(vtmp, vtmp, rtmp);                       \
+  vsll_vi(vtmp, vtmp, 3);                           \
+  vluxei64_v(vt, rt, vtmp);
+
+  const int TABLE_STEP = 16;
+  const int STEP = 128;  // 128 bytes per round
+  const int N = 2 * 8;   // 2: 128-bits/64-bits, 8: 8 pairs of double 64-bits
+
+  Register step = tmp2;
+
+  // ======== preparation ========
+
+  mv(step, STEP);
+  sub(len, len, step); // 2 rounds of folding with carry-less multiplication
+
+  vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
+  // load data
+  vle64_v(v4, buf);
+  add(buf, buf, step);
+
+  // load table
+  CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
+  // load mask,
+  // v28 should already contain: 0, 8, 0, 8, ...
+  vmseq_vi(v2, v28, 0);
+  // now, v2 should contain: 101010...
+  vmnand_mm(v1, v2, v2);
+  // now, v1 should contain: 010101...
+
+  // initial crc
+  vmv_v_x(v24, zr);
+  vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
+  vmv_s_x(v24, crc);
+  vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
+  vxor_vv(v4, v4, v24);
+
+  Label L_128_bytes_loop;
+  j(L_128_bytes_loop);
+
+  // ======== folding 128 bytes in data buffer per round ========
+
+  align(OptoLoopAlignment);
+  bind(L_128_bytes_loop);
+  {
+    // v4: data
+    // v4: buf, reused
+    // v8: table
+    // v12: lows
+    // v16: highs
+    // v20: low_slides
+    // v24: high_slides
+    vclmul_vv(v12, v4, v8);
+    vclmulh_vv(v16, v4, v8);
+    vle64_v(v4, buf);
+    add(buf, buf, step);
+    // lows
+    vslidedown_vi(v20, v12, 1);
+    vmand_mm(v0, v2, v2);
+    vxor_vv(v12, v12, v20, v0_t);
+    // with buf data
+    vxor_vv(v4, v4, v12, v0_t);
+
+    // highs
+    vslideup_vi(v24, v16, 1);
+    vmand_mm(v0, v1, v1);
+    vxor_vv(v16, v16, v24, v0_t);
+    // with buf data
+    vxor_vv(v4, v4, v16, v0_t);
+  }
+  sub(len, len, step);
+  bge(len, step, L_128_bytes_loop);
+
+  // ======== folding into 64 bytes from 128 bytes in register ========
+
+  // load table
+  addi(vclmul_table, vclmul_table, TABLE_STEP);
+  CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
+
+  // v4: data, first (low) part, N/2 of 64-bits
+  // v20: data, second (high) part, N/2 of 64-bits
+  // v8: table
+  // v10: lows
+  // v12: highs
+  // v14: low_slides
+  // v16: high_slides
+
+  // high part
+  vslidedown_vi(v20, v4, N/2);
+
+  vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);
+
+  vclmul_vv(v10, v4, v8);
+  vclmulh_vv(v12, v4, v8);
+
+  // lows
+  vslidedown_vi(v14, v10, 1);
+  vmand_mm(v0, v2, v2);
+  vxor_vv(v10, v10, v14, v0_t);
+  // with data part 2
+  vxor_vv(v4, v20, v10, v0_t);
+
+  // highs
+  vslideup_vi(v16, v12, 1);
+  vmand_mm(v0, v1, v1);
+  vxor_vv(v12, v12, v16, v0_t);
+  // with data part 2
+  vxor_vv(v4, v20, v12, v0_t);
+
+  // ======== folding into 16 bytes from 64 bytes in register ========
+
+  // v4: data, first part, 2 of 64-bits
+  // v16: data, second part, 2 of 64-bits
+  // v18: data, third part, 2 of 64-bits
+  // v20: data, fourth part, 2 of 64-bits
+  // v8: table
+
+  vslidedown_vi(v16, v4, 2);
+  vslidedown_vi(v18, v4, 4);
+  vslidedown_vi(v20, v4, 6);
+
+  vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
+
+  addi(vclmul_table, vclmul_table, TABLE_STEP);
+  vle64_v(v8, vclmul_table);
+  crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);
+
+  addi(vclmul_table, vclmul_table, TABLE_STEP);
+  vle64_v(v8, vclmul_table);
+  crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);
+
+  addi(vclmul_table, vclmul_table, TABLE_STEP);
+  vle64_v(v8, vclmul_table);
+  crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);
+
+  // ======== final: move result to scalar registers ========
+
+  vmv_x_s(tmp1, v20);
+  vslidedown_vi(v4, v20, 1);
+  vmv_x_s(tmp2, v4);
+
+#undef CRC32_VCLMUL_LOAD_TABLE
+}
+
+// For more details of the algorithm, please check the paper:
+// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
+//
+// Please also refer to the corresponding aarch64 or x86 code.
+//
+// As carry-less multiplication on RISC-V differs a bit from the other platforms,
+// the implementation itself also differs a bit from theirs.
+void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
+        Register table0, Register table1, Register table2, Register table3,
+        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
+  const int64_t single_table_size = 256;
+  const int64_t table_num = 8; // 4 for scalar, 4 for plain vector
+  const ExternalAddress table_addr = StubRoutines::crc_table_addr();
+  Register vclmul_table = tmp3;
+
+  la(vclmul_table, table_addr);
+  add(vclmul_table, vclmul_table, table_num*single_table_size*sizeof(juint), tmp1);
+  la(table0, table_addr);
+
+  if (MaxVectorSize == 16) {
+    kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
+  } else {
+    kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
+  }
+
+  mv(crc, zr);
+  update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
+  update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
+  update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
+  update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
+}
+
 #endif // COMPILER2
 
 /**
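
A rough scalar model of what one 16-byte fold step above computes (the vclmul/vclmulh pair followed by the vredxor merges), reusing the clmul64/clmulh64 helpers sketched earlier; the u128 struct and the fold16 name are illustrative only, and the folding constants come from the table and are not shown:

  struct u128 { uint64_t lo, hi; };

  // acc: 16 bytes already folded; data: next 16 bytes from buf;
  // k: the two 64-bit folding constants loaded from vclmul_table.
  static inline u128 fold16(u128 acc, u128 data, u128 k) {
    u128 r;
    // element-wise carry-less products, XOR-reduced, then XORed with the new data
    r.lo = clmul64(acc.lo, k.lo) ^ clmul64(acc.hi, k.hi) ^ data.lo;
    r.hi = clmulh64(acc.lo, k.lo) ^ clmulh64(acc.hi, k.hi) ^ data.hi;
    return r;
  }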
@@ -1765,7 +2118,9 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
 
 #ifdef COMPILER2
   if (UseRVV) {
-    const int64_t tmp_limit = MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
+    const int64_t tmp_limit =
+      UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
+              : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
     mv(tmp1, tmp_limit);
     bge(len, tmp1, L_vector_entry);
   }
@@ -1827,7 +2182,13 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
   j(L_exit);
 
   bind(L_vector_entry);
-  vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
+  if (UseZvbc) { // carry-less multiplication
+    kernel_crc32_vclmul_fold(crc, buf, len,
+                             table0, table1, table2, table3,
+                             tmp1, tmp2, tmp3, tmp4, tmp6);
+  } else { // plain vector instructions
+    vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
+  }
 
   bgtz(len, L_by4_loop_entry);
 }
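
Once the folding is done, the 128-bit remainder sits in tmp1 and tmp2, crc is cleared, and the four 32-bit words are pushed through the table-driven update_word_crc32. A rough scalar model of such a four-table (slicing-by-4) word update follows; the exact byte-to-table ordering of HotSpot's tables is not visible in this diff, so treat the indexing here as an assumption:

  #include <cstdint>

  // Fold one 32-bit word into a reflected CRC32 using four 256-entry tables.
  static inline uint32_t crc32_word(uint32_t crc, uint32_t w,
                                    const uint32_t t0[256], const uint32_t t1[256],
                                    const uint32_t t2[256], const uint32_t t3[256]) {
    uint32_t v = crc ^ w;  // crc is zero for the first folded remainder word
    return t3[v & 0xff] ^ t2[(v >> 8) & 0xff] ^ t1[(v >> 16) & 0xff] ^ t0[(v >> 24) & 0xff];
  }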
