@@ -1712,6 +1712,359 @@ void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register le
17121712 addi (buf, buf, N*4 );
17131713 }
17141714}
1715+
1716+ void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16 (VectorRegister vx, VectorRegister vt,
1717+ VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1718+ Register buf, Register tmp, const int STEP) {
1719+ assert_different_registers (vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1720+ vclmul_vv (vtmp1, vx, vt);
1721+ vclmulh_vv (vtmp2, vx, vt);
1722+ vle64_v (vtmp4, buf); addi (buf, buf, STEP);
1723+ // low parts
1724+ vredxor_vs (vtmp3, vtmp1, vtmp4);
1725+ // high parts
1726+ vslidedown_vi (vx, vtmp4, 1 );
1727+ vredxor_vs (vtmp1, vtmp2, vx);
1728+ // merge low and high back
1729+ vslideup_vi (vx, vtmp1, 1 );
1730+ vmv_x_s (tmp, vtmp3);
1731+ vmv_s_x (vx, tmp);
1732+ }
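To make the dataflow above easier to follow, here is a minimal scalar sketch (plain C++, not the generated code) of what one call to crc32_vclmul_fold_16_bytes_vectorsize_16 computes, assuming VLEN == 128 so that each vector register holds two 64-bit elements; clmul64 and fold_16_bytes are illustrative helpers, not part of this patch.

```cpp
#include <cstdint>
#include <utility>

// Software carry-less multiply: 64x64 -> 128-bit product, returned as {hi, lo}.
static std::pair<uint64_t, uint64_t> clmul64(uint64_t a, uint64_t b) {
  uint64_t lo = 0, hi = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {
      lo ^= a << i;
      if (i != 0) {
        hi ^= a >> (64 - i);
      }
    }
  }
  return {hi, lo};
}

// One 16-byte folding step: the 128-bit accumulator {x_lo, x_hi} is multiplied
// carry-lessly by the 16-byte constant pair {k_lo, k_hi} held in vt, the low and
// high halves of the two partial products are XOR-reduced, and the next 16 input
// bytes {d_lo, d_hi} are XORed in. This mirrors the vclmul.vv/vclmulh.vv pair,
// the two vredxor.vs reductions, and the slide/merge sequence above.
static void fold_16_bytes(uint64_t& x_lo, uint64_t& x_hi,
                          uint64_t k_lo, uint64_t k_hi,
                          uint64_t d_lo, uint64_t d_hi) {
  auto [p0_hi, p0_lo] = clmul64(x_lo, k_lo);
  auto [p1_hi, p1_lo] = clmul64(x_hi, k_hi);
  x_lo = p0_lo ^ p1_lo ^ d_lo;   // becomes element 0 of vx
  x_hi = p0_hi ^ p1_hi ^ d_hi;   // becomes element 1 of vx
}
```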
1733+
1734+ void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2 (VectorRegister vx, VectorRegister vy, VectorRegister vt,
1735+ VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1736+ Register tmp) {
1737+ assert_different_registers (vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1738+ vclmul_vv (vtmp1, vx, vt);
1739+ vclmulh_vv (vtmp2, vx, vt);
1740+ // low parts
1741+ vredxor_vs (vtmp3, vtmp1, vy);
1742+ // high parts
1743+ vslidedown_vi (vtmp4, vy, 1 );
1744+ vredxor_vs (vtmp1, vtmp2, vtmp4);
1745+ // merge low and high back
1746+ vslideup_vi (vx, vtmp1, 1 );
1747+ vmv_x_s (tmp, vtmp3);
1748+ vmv_s_x (vx, tmp);
1749+ }
1750+
1751+ void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3 (VectorRegister vx, VectorRegister vy, VectorRegister vt,
1752+ VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1753+ Register tmp) {
1754+ assert_different_registers (vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1755+ vclmul_vv (vtmp1, vx, vt);
1756+ vclmulh_vv (vtmp2, vx, vt);
1757+ // low parts
1758+ vredxor_vs (vtmp3, vtmp1, vy);
1759+ // high parts
1760+ vslidedown_vi (vtmp4, vy, 1 );
1761+ vredxor_vs (vtmp1, vtmp2, vtmp4);
1762+ // merge low and high back
1763+ vslideup_vi (vy, vtmp1, 1 );
1764+ vmv_x_s (tmp, vtmp3);
1765+ vmv_s_x (vy, tmp);
1766+ }
1767+
1768+ void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16 (Register crc, Register buf, Register len,
1769+ Register vclmul_table, Register tmp1, Register tmp2) {
1770+ assert_different_registers (crc, buf, len, vclmul_table, tmp1, tmp2, t1);
1771+ assert (MaxVectorSize == 16 , " sanity" );
1772+
1773+ const int TABLE_STEP = 16 ;
1774+ const int STEP = 16 ;
1775+ const int LOOP_STEP = 128 ;
1776+ const int N = 2 ;
1777+
1778+ Register loop_step = t1;
1779+
1780+ // ======== preparation ========
1781+
1782+ mv (loop_step, LOOP_STEP);
1783+ sub (len, len, loop_step);
1784+
1785+ vsetivli (zr, N, Assembler::e64 , Assembler::m1, Assembler::mu, Assembler::tu);
1786+ vle64_v (v0, buf); addi (buf, buf, STEP);
1787+ vle64_v (v1, buf); addi (buf, buf, STEP);
1788+ vle64_v (v2, buf); addi (buf, buf, STEP);
1789+ vle64_v (v3, buf); addi (buf, buf, STEP);
1790+ vle64_v (v4, buf); addi (buf, buf, STEP);
1791+ vle64_v (v5, buf); addi (buf, buf, STEP);
1792+ vle64_v (v6, buf); addi (buf, buf, STEP);
1793+ vle64_v (v7, buf); addi (buf, buf, STEP);
1794+
1795+ vmv_v_x (v31, zr);
1796+ vsetivli (zr, 1 , Assembler::e32 , Assembler::m1, Assembler::mu, Assembler::tu);
1797+ vmv_s_x (v31, crc);
1798+ vsetivli (zr, N, Assembler::e64 , Assembler::m1, Assembler::mu, Assembler::tu);
1799+ vxor_vv (v0, v0, v31);
1800+
1801+ // load table
1802+ vle64_v (v31, vclmul_table);
1803+
1804+ Label L_16_bytes_loop;
1805+ j (L_16_bytes_loop);
1806+
1807+
1808+ // ======== folding 128 bytes in data buffer per round ========
1809+
1810+ align (OptoLoopAlignment);
1811+ bind (L_16_bytes_loop);
1812+ {
1813+ crc32_vclmul_fold_16_bytes_vectorsize_16 (v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
1814+ crc32_vclmul_fold_16_bytes_vectorsize_16 (v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
1815+ crc32_vclmul_fold_16_bytes_vectorsize_16 (v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
1816+ crc32_vclmul_fold_16_bytes_vectorsize_16 (v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
1817+ crc32_vclmul_fold_16_bytes_vectorsize_16 (v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
1818+ crc32_vclmul_fold_16_bytes_vectorsize_16 (v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
1819+ crc32_vclmul_fold_16_bytes_vectorsize_16 (v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
1820+ crc32_vclmul_fold_16_bytes_vectorsize_16 (v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
1821+ }
1822+ sub (len, len, loop_step);
1823+ bge (len, loop_step, L_16_bytes_loop);
1824+
1825+
1826+ // ======== folding into 64 bytes from 128 bytes in register ========
1827+
1828+ // load table
1829+ addi (vclmul_table, vclmul_table, TABLE_STEP);
1830+ vle64_v (v31, vclmul_table);
1831+
1832+ crc32_vclmul_fold_16_bytes_vectorsize_16_2 (v0, v4, v31, v8, v9, v10, v11, tmp2);
1833+ crc32_vclmul_fold_16_bytes_vectorsize_16_2 (v1, v5, v31, v12, v13, v14, v15, tmp2);
1834+ crc32_vclmul_fold_16_bytes_vectorsize_16_2 (v2, v6, v31, v16, v17, v18, v19, tmp2);
1835+ crc32_vclmul_fold_16_bytes_vectorsize_16_2 (v3, v7, v31, v20, v21, v22, v23, tmp2);
1836+
1837+
1838+ // ======== folding into 16 bytes from 64 bytes in register ========
1839+
1840+ addi (vclmul_table, vclmul_table, TABLE_STEP);
1841+ vle64_v (v31, vclmul_table);
1842+ crc32_vclmul_fold_16_bytes_vectorsize_16_3 (v0, v3, v31, v8, v9, v10, v11, tmp2);
1843+
1844+ addi (vclmul_table, vclmul_table, TABLE_STEP);
1845+ vle64_v (v31, vclmul_table);
1846+ crc32_vclmul_fold_16_bytes_vectorsize_16_3 (v1, v3, v31, v12, v13, v14, v15, tmp2);
1847+
1848+ addi (vclmul_table, vclmul_table, TABLE_STEP);
1849+ vle64_v (v31, vclmul_table);
1850+ crc32_vclmul_fold_16_bytes_vectorsize_16_3 (v2, v3, v31, v16, v17, v18, v19, tmp2);
1851+
1853+
1854+
 1855+ // ======== final: move result to scalar registers ========
1856+
1857+ vmv_x_s (tmp1, v3);
1858+ vslidedown_vi (v1, v3, 1 );
1859+ vmv_x_s (tmp2, v1);
1860+ }
1861+
1862+ void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32 (VectorRegister vx, VectorRegister vy, VectorRegister vt,
1863+ VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
1864+ assert_different_registers (vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1865+ vclmul_vv (vtmp1, vx, vt);
1866+ vclmulh_vv (vtmp2, vx, vt);
1867+ // low parts
1868+ vredxor_vs (vtmp3, vtmp1, vy);
1869+ // high parts
1870+ vslidedown_vi (vtmp4, vy, 1 );
1871+ vredxor_vs (vtmp1, vtmp2, vtmp4);
1872+ // merge low and high back
1873+ vslideup_vi (vy, vtmp1, 1 );
1874+ vmv_x_s (t1, vtmp3);
1875+ vmv_s_x (vy, t1);
1876+ }
1877+
1878+ void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32 (Register crc, Register buf, Register len,
1879+ Register vclmul_table, Register tmp1, Register tmp2) {
1880+ assert_different_registers (crc, buf, len, vclmul_table, tmp1, tmp2, t1);
1881+ assert (MaxVectorSize >= 32 , " sanity" );
1882+
1883+ // utility: load table
1884+ #define CRC32_VCLMUL_LOAD_TABLE (vt, rt, vtmp, rtmp ) \
1885+ vid_v (vtmp); \
1886+ mv (rtmp, 2 ); \
1887+ vremu_vx (vtmp, vtmp, rtmp); \
1888+ vsll_vi (vtmp, vtmp, 3 ); \
1889+ vluxei64_v (vt, rt, vtmp);
1890+
1891+ const int TABLE_STEP = 16 ;
1892+ const int STEP = 128 ; // 128 bytes per round
 1893+ const int N = 2 * 8 ; // 2: 64-bit elements per 128-bit chunk, 8: 128-bit chunks per 128-byte round
1894+
1895+ Register step = tmp2;
1896+
1897+
1898+ // ======== preparation ========
1899+
1900+ mv (step, STEP);
1901+ sub (len, len, step); // 2 rounds of folding with carry-less multiplication
1902+
1903+ vsetivli (zr, N, Assembler::e64 , Assembler::m4, Assembler::mu, Assembler::tu);
1904+ // load data
1905+ vle64_v (v4, buf);
1906+ add (buf, buf, step);
1907+
1908+ // load table
1909+ CRC32_VCLMUL_LOAD_TABLE (v8, vclmul_table, v28, t1);
 1910+ // build masks
 1911+ // v28 should already contain: 0, 8, 0, 8, ...
 1912+ vmseq_vi (v2, v28, 0 );
 1913+ // now, v2 should contain: 101010...
 1914+ vmnand_mm (v1, v2, v2);
 1915+ // now, v1 should contain: 010101...
1916+
1917+ // initial crc
1918+ vmv_v_x (v24, zr);
1919+ vsetivli (zr, 1 , Assembler::e32 , Assembler::m4, Assembler::mu, Assembler::tu);
1920+ vmv_s_x (v24, crc);
1921+ vsetivli (zr, N, Assembler::e64 , Assembler::m4, Assembler::mu, Assembler::tu);
1922+ vxor_vv (v4, v4, v24);
1923+
1924+ Label L_128_bytes_loop;
1925+ j (L_128_bytes_loop);
1926+
1927+
1928+ // ======== folding 128 bytes in data buffer per round ========
1929+
1930+ align (OptoLoopAlignment);
1931+ bind (L_128_bytes_loop);
1932+ {
1933+ // v4: data
 1934+ // v4: reused to hold the next data loaded from buf
1935+ // v8: table
1936+ // v12: lows
1937+ // v16: highs
1938+ // v20: low_slides
1939+ // v24: high_slides
1940+ vclmul_vv (v12, v4, v8);
1941+ vclmulh_vv (v16, v4, v8);
1942+ vle64_v (v4, buf);
1943+ add (buf, buf, step);
1944+ // lows
1945+ vslidedown_vi (v20, v12, 1 );
1946+ vmand_mm (v0, v2, v2);
1947+ vxor_vv (v12, v12, v20, v0_t );
1948+ // with buf data
1949+ vxor_vv (v4, v4, v12, v0_t );
1950+
1951+ // highs
1952+ vslideup_vi (v24, v16, 1 );
1953+ vmand_mm (v0, v1, v1);
1954+ vxor_vv (v16, v16, v24, v0_t );
1955+ // with buf data
1956+ vxor_vv (v4, v4, v16, v0_t );
1957+ }
1958+ sub (len, len, step);
1959+ bge (len, step, L_128_bytes_loop);
1960+
1961+
1962+ // ======== folding into 64 bytes from 128 bytes in register ========
1963+
1964+ // load table
1965+ addi (vclmul_table, vclmul_table, TABLE_STEP);
1966+ CRC32_VCLMUL_LOAD_TABLE (v8, vclmul_table, v28, t1);
1967+
1968+ // v4: data, first (low) part, N/2 of 64-bits
1969+ // v20: data, second (high) part, N/2 of 64-bits
1970+ // v8: table
1971+ // v10: lows
1972+ // v12: highs
1973+ // v14: low_slides
1974+ // v16: high_slides
1975+
1976+ // high part
1977+ vslidedown_vi (v20, v4, N/2 );
1978+
1979+ vsetivli (zr, N/2 , Assembler::e64 , Assembler::m2, Assembler::mu, Assembler::tu);
1980+
1981+ vclmul_vv (v10, v4, v8);
1982+ vclmulh_vv (v12, v4, v8);
1983+
1984+ // lows
1985+ vslidedown_vi (v14, v10, 1 );
1986+ vmand_mm (v0, v2, v2);
1987+ vxor_vv (v10, v10, v14, v0_t );
1988+ // with data part 2
1989+ vxor_vv (v4, v20, v10, v0_t );
1990+
1991+ // highs
1992+ vslideup_vi (v16, v12, 1 );
1993+ vmand_mm (v0, v1, v1);
1994+ vxor_vv (v12, v12, v16, v0_t );
1995+ // with data part 2
1996+ vxor_vv (v4, v20, v12, v0_t );
1997+
1998+
1999+ // ======== folding into 16 bytes from 64 bytes in register ========
2000+
2001+ // v4: data, first part, 2 of 64-bits
2002+ // v16: data, second part, 2 of 64-bits
2003+ // v18: data, third part, 2 of 64-bits
 2004+ // v20: data, fourth part, 2 of 64-bits
2005+ // v8: table
2006+
2007+ vslidedown_vi (v16, v4, 2 );
2008+ vslidedown_vi (v18, v4, 4 );
2009+ vslidedown_vi (v20, v4, 6 );
2010+
2011+ vsetivli (zr, 2 , Assembler::e64 , Assembler::m1, Assembler::mu, Assembler::tu);
2012+
2013+ addi (vclmul_table, vclmul_table, TABLE_STEP);
2014+ vle64_v (v8, vclmul_table);
2015+ crc32_vclmul_fold_to_16_bytes_vectorsize_32 (v4, v20, v8, v28, v29, v30, v31);
2016+
2017+ addi (vclmul_table, vclmul_table, TABLE_STEP);
2018+ vle64_v (v8, vclmul_table);
2019+ crc32_vclmul_fold_to_16_bytes_vectorsize_32 (v16, v20, v8, v28, v29, v30, v31);
2020+
2021+ addi (vclmul_table, vclmul_table, TABLE_STEP);
2022+ vle64_v (v8, vclmul_table);
2023+ crc32_vclmul_fold_to_16_bytes_vectorsize_32 (v18, v20, v8, v28, v29, v30, v31);
2024+
2025+
 2026+ // ======== final: move result to scalar registers ========
2027+
2028+ vmv_x_s (tmp1, v20);
2029+ vslidedown_vi (v4, v20, 1 );
2030+ vmv_x_s (tmp2, v4);
2031+
2032+ #undef CRC32_VCLMUL_LOAD_TABLE
2033+ }
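The CRC32_VCLMUL_LOAD_TABLE macro above builds a per-element byte offset pattern of 0, 8, 0, 8, ... and then uses an indexed load to replicate the 16-byte constant pair across the whole register group; the same offsets are reused to derive the even/odd element masks in v2 and v1. The stand-alone C++ sketch below shows the values involved (N = 16 is assumed from the e64/m4 configuration above; the program is illustrative only, not part of the patch).

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const int N = 16;        // 64-bit elements in the e64/m4 register group
  uint64_t offsets[N];     // what ends up in v28
  bool     even_mask[N];   // what ends up in v2: 1, 0, 1, 0, ...
  bool     odd_mask[N];    // what ends up in v1: 0, 1, 0, 1, ...

  for (int i = 0; i < N; i++) {
    // vid.v -> i; vremu.vx ..., 2 -> i % 2; vsll.vi ..., 3 -> (i % 2) * 8
    offsets[i]   = (uint64_t)(i % 2) * 8;
    // vmseq.vi v2, v28, 0 and vmnand.mm v1, v2, v2
    even_mask[i] = (offsets[i] == 0);
    odd_mask[i]  = !even_mask[i];
  }

  // vluxei64.v then loads a 64-bit element from (table base + offsets[i]) for
  // each i, i.e. it replicates the 16-byte constant pair {k_lo, k_hi} across
  // every 128-bit lane of the register group.
  for (int i = 0; i < N; i++) {
    printf("i=%2d offset=%llu even=%d odd=%d\n",
           i, (unsigned long long)offsets[i], (int)even_mask[i], (int)odd_mask[i]);
  }
  return 0;
}
```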
2034+
 2035+ // For more details of the algorithm, please check the paper:
 2036+ // "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
 2037+ //
 2038+ // Please also refer to the corresponding implementations for aarch64 and x86.
 2039+ //
 2040+ // Since the RISC-V carry-less multiplication instructions differ a bit from those of
 2041+ // other platforms, this implementation also differs a bit from the other ones.
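At a high level, folding relies on the congruence used in that paper. Splitting the message polynomial as

    M(x) = H(x) * x^T + L(x)

gives

    M(x) mod P(x) = ( H(x) * (x^T mod P(x)) + L(x) ) mod P(x)

so a chunk can be carry-less-multiplied by the precomputed constant x^T mod P(x) and XORed into data lying T bits further along without changing the final remainder. The table reached through vclmul_table holds one 16-byte constant pair per folding distance used above; the exact values are generated elsewhere and are not shown in this hunk.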
2042+
2043+ void MacroAssembler::kernel_crc32_vclmul_fold (Register crc, Register buf, Register len,
2044+ Register table0, Register table1, Register table2, Register table3,
2045+ Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2046+ const int64_t single_table_size = 256 ;
2047+ const int64_t table_num = 8 ; // 4 for scalar, 4 for plain vector
2048+ const ExternalAddress table_addr = StubRoutines::crc_table_addr ();
2049+ Register vclmul_table = tmp3;
2050+
2051+ la (vclmul_table, table_addr);
2052+ add (vclmul_table, vclmul_table, table_num*single_table_size*sizeof (juint), tmp1);
2053+ la (table0, table_addr);
2054+
2055+ if (MaxVectorSize == 16 ) {
2056+ kernel_crc32_vclmul_fold_vectorsize_16 (crc, buf, len, vclmul_table, tmp1, tmp2);
2057+ } else {
2058+ kernel_crc32_vclmul_fold_vectorsize_32 (crc, buf, len, vclmul_table, tmp1, tmp2);
2059+ }
2060+
2061+ mv (crc, zr);
2062+ update_word_crc32 (crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false );
2063+ update_word_crc32 (crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true );
2064+ update_word_crc32 (crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false );
2065+ update_word_crc32 (crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true );
2066+ }
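The tail after the vector part leaves the final 16-byte remainder in tmp1 and tmp2, and the four update_word_crc32 calls fold it into crc one 32-bit word at a time. A rough model is sketched below, assuming update_word_crc32 performs the usual reflected slicing-by-4 step; the helper is illustrative, not the actual HotSpot routine.

```cpp
#include <cstdint>

// One reflected slicing-by-4 step: fold a 32-bit word of data into the running
// crc using four 256-entry lookup tables (illustrative model only).
static uint32_t update_word(uint32_t crc, uint32_t word,
                            const uint32_t t0[256], const uint32_t t1[256],
                            const uint32_t t2[256], const uint32_t t3[256]) {
  crc ^= word;
  return t3[ crc        & 0xff] ^
         t2[(crc >>  8) & 0xff] ^
         t1[(crc >> 16) & 0xff] ^
         t0[(crc >> 24) & 0xff];
}

// The tail above then corresponds roughly to:
//   crc = 0;
//   crc = update_word(crc, (uint32_t) tmp1,        t0, t1, t2, t3);  // low  word of tmp1
//   crc = update_word(crc, (uint32_t)(tmp1 >> 32), t0, t1, t2, t3);  // high word of tmp1
//   crc = update_word(crc, (uint32_t) tmp2,        t0, t1, t2, t3);  // low  word of tmp2
//   crc = update_word(crc, (uint32_t)(tmp2 >> 32), t0, t1, t2, t3);  // high word of tmp2
```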
2067+
17152068#endif // COMPILER2
17162069
17172070/**
@@ -1765,7 +2118,9 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
17652118
17662119#ifdef COMPILER2
17672120 if (UseRVV) {
1768- const int64_t tmp_limit = MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5 ;
2121+ const int64_t tmp_limit =
2122+ UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2123+ : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5 ;
17692124 mv (tmp1, tmp_limit);
17702125 bge (len, tmp1, L_vector_entry);
17712126 }
@@ -1827,7 +2182,13 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
18272182 j (L_exit);
18282183
18292184 bind (L_vector_entry);
1830- vector_update_crc32 (crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2185+ if (UseZvbc) { // carry-less multiplication
2186+ kernel_crc32_vclmul_fold (crc, buf, len,
2187+ table0, table1, table2, table3,
2188+ tmp1, tmp2, tmp3, tmp4, tmp6);
2189+ } else { // plain vector instructions
2190+ vector_update_crc32 (crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2191+ }
18312192
18322193 bgtz (len, L_by4_loop_entry);
18332194 }