Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8322179: RISC-V: Implement SHA-1 intrinsic #17130

Closed
wants to merge 15 commits into from
347 changes: 347 additions & 0 deletions src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4809,6 +4809,348 @@ class StubGenerator: public StubCodeGenerator {
return (address) start;
}


// ------------------------ SHA-1 intrinsic ------------------------

// K't =
// 5a827999, 0 <= t <= 19
// 6ed9eba1, 20 <= t <= 39
// 8f1bbcdc, 40 <= t <= 59
// ca62c1d6, 60 <= t <= 79
void sha1_prepare_k(int round, Register cur_k) {
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
assert(round >= 0 && round < 80, "must be");

static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
if ((round % 20) == 0) {
__ mv(cur_k, ks[round/20]);
}
}

// W't =
// M't, 0 <= t <= 15
// ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
void sha1_prepare_w(int round, Register cur_w, Register ws[], Register buf) {
assert(round >= 0 && round < 80, "must be");

if (round < 16) {
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
if ((round % 2) == 0) {
__ ld(ws[round/2], Address(buf, round * 4));
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
// reverse bytes, as SHA-1 is defined in big-endian.
__ revb(ws[round/2], ws[round/2]);
__ srli(cur_w, ws[round/2], 32);
} else {
__ mv(cur_w, ws[round/2]);
}

return;
}

if (round == 16) {
int64_t block_bytes = round * 4;
__ addi(buf, buf, block_bytes);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does buf need to be incremented here? And why at round == 16 and not after all the rounds are done? Maybe you can do this in the loop in generate_sha1_implCompress to have the code that initializes and increments it in the same function? Also, does it need to be incremented if !multi_block?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, Good catch!

And other comments are also resolved.

}

if ((round % 2) == 0) {
int idx = 16;
// W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
__ mv(t0, ws[(idx-3)/2]);
__ srli(t1, ws[(idx-8)/2], 32);
__ xorr(t0, t0, t1);

__ srli(t1, ws[(idx-14)/2], 32);
__ srli(cur_w, ws[(idx-16)/2], 32);
__ xorr(cur_w, cur_w, t1);

__ xorr(cur_w, cur_w, t0);
__ rolw_imm(cur_w, cur_w, 1, t0);

// copy the cur_w value to ws[8].
// now, valid w't values are at:
// w0: ws[0]'s lower 32 bits
// w1 ~ w14: ws[1] ~ ws[7]
// w15: ws[8]'s higher 32 bits
__ slli(ws[idx/2], cur_w, 32);

return;
}

int idx = 17;
// W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
__ srli(t0, ws[(idx-3)/2], 32);
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
__ mv(t1, ws[(idx-8)/2]);
__ xorr(t0, t0, t1);

__ mv(t1, ws[(idx-14)/2]);
__ mv(cur_w, ws[(idx-16)/2]);
__ xorr(cur_w, cur_w, t1);

__ xorr(cur_w, cur_w, t0);
__ rolw_imm(cur_w, cur_w, 1, t0);

// copy the cur_w value to ws[8]
__ zero_extend(cur_w, cur_w, 32);
__ orr(ws[idx/2], ws[idx/2], cur_w);

// shift the w't registers, so they start from ws[0] again.
// now, valid w't values are at:
// w0 ~ w15: ws[0] ~ ws[7]
Register ws_0 = ws[0];
for (int i = 0; i < 16/2; i++) {
ws[i] = ws[i+1];
}
ws[8] = ws_0;
}

// f't(x, y, z) =
// Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19
// Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39
// Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59
// Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79
void sha1_f(int round, Register dst, Register x, Register y, Register z) {
assert(round >= 0 && round < 80, "must be");
assert_different_registers(dst, x, y, z);

if (round < 20) {
// (x & y) ^ (~x & z)
__ andr(t0, x, y);
__ andn(dst, z, x);
__ xorr(dst, dst, t0);
} else if (round >= 40 && round < 60) {
// (x & y) ^ (x & z) ^ (y & z)
__ andr(t0, x, y);
__ andr(t1, x, z);
__ andr(dst, y, z);
__ xorr(dst, dst, t0);
__ xorr(dst, dst, t1);
} else {
// x ^ y ^ z
__ xorr(dst, x, y);
__ xorr(dst, dst, z);
}
}

// T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
// e = d
// d = c
// c = ROTL'30(b)
// b = a
// a = T
void sha1_process_round(int round, Register a, Register b, Register c, Register d, Register e,
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
Register cur_k, Register cur_w, Register tmp) {
assert(round >= 0 && round < 80, "must be");
assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp);

// T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
{
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
// reuse e as a temporary register, as we will mv new value into it later
Register t = e;
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
// cur_w will be recalculated at the beginning of each round,
// so, we can reuse it as a temp register here.
__ add(cur_w, cur_k, cur_w);
__ add(t, t, cur_w);
__ rolw_imm(cur_w, a, 5, t0);
// as pointed above, we can use cur_w as temporary register here.
sha1_f(round, tmp, b, c, d);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we also reuse cur_k in this function where input param tmp is used? I see cur_k will also be recalculated at the beginning of each round like cur_w. Hope this could help eliminate tmp param and finally free t2.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems not, as cur_k is only calculated when (round % 20) == 0.

Other comments are all resolved.

Copy link
Member

@RealFYang RealFYang Feb 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Could you please move int round as the last formal param for other assembler functions so that it will be more consistent? They are sha1_prepare_w, sha1_f, and sha1_process_round. BTW: I think it will be safer to add scratch register t0, t1 to the list for assert_different_registers if they are used in those assembler functions.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, it's updated. Thanks.

__ add(cur_w, cur_w, tmp);
__ add(cur_w, cur_w, t);
}

// e = d
// d = c
// c = ROTL'30(b)
// b = a
// a = T
__ mv(e, d);
__ mv(d, c);
// as pointed above, we can use cur_w as temporary register here.
__ rolw_imm(c, b, 30);
__ mv(b, a);
__ mv(a, cur_w);
}

// Intermediate hash value for block i:
//   H(i)0 = a + H(i-1)0
//   H(i)1 = b + H(i-1)1
//   H(i)2 = c + H(i-1)2
//   H(i)3 = d + H(i-1)3
//   H(i)4 = e + H(i-1)4
//
// The previous values arrive packed two-per-register: prev_ab carries the
// addend for a in its low word and the addend for b in its high word
// (likewise prev_cd for c/d); prev_e carries the addend for e.
// prev_ab and prev_cd are shifted right in place, i.e. they are clobbered.
void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
                            Register prev_ab, Register prev_cd, Register prev_e) {
  assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);

  // Adds the low word of `packed` to `lo`, then moves the high word of
  // `packed` down and adds it to `hi`.
  auto accumulate_pair = [&](Register lo, Register hi, Register packed) {
    __ add(lo, lo, packed);
    __ srli(packed, packed, 32);
    __ add(hi, hi, packed);
  };

  accumulate_pair(a, b, prev_ab);
  accumulate_pair(c, d, prev_cd);

  __ add(e, e, prev_e);
}

// Saves the current working values a/b/c/d/e so that they can be added back
// by sha1_calculate_im_hash once the 80 rounds of a block are done.
//
// To save registers the values are packed in pairs:
//   prev_ab = (b << 32) | low 32 bits of a
//   prev_cd = (d << 32) | low 32 bits of c
//   prev_e  = e
//
// a/b/c/d/e are only read. Clobbers t0 as a scratch register, so none of
// the operands may alias it.
void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
                              Register prev_ab, Register prev_cd, Register prev_e) {
  // t0 holds the shifted high half while the low half is being prepared,
  // so it must be distinct from every operand.
  assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);

  __ slli(t0, b, 32);
  __ zero_extend(prev_ab, a, 32);
  __ orr(prev_ab, prev_ab, t0);

  __ slli(t0, d, 32);
  __ zero_extend(prev_cd, c, 32);
  __ orr(prev_cd, prev_cd, t0);

  __ mv(prev_e, e);
}

// Intrinsic for:
// void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
// void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
//
// Arguments:
//
// Inputs:
// c_rarg0: byte[] src array + offset
// c_rarg1: int[] SHA.state
// - - - - - - below are only for implCompressMultiBlock0 - - - - - -
// c_rarg2: int offset
// c_rarg3: int limit
//
// Outputs:
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
// - - - - - - below are only for implCompressMultiBlock0 - - - - - -
// c_rarg0: int offset, when (multi_block == true)
//
address generate_sha1_implCompress(bool multi_block, const char *name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);

address start = __ pc();
__ enter();

RegSet saved_regs = RegSet::range(x18, x27);
if (multi_block) {
// use x9 as src below.
saved_regs += RegSet::of(x9);
}
__ push_reg(saved_regs, sp);

// c_rarg0 - c_rarg3: x10 - x13
Register buf = c_rarg0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could copy the four arguments to a different set of registers and use a0 -> a3 for some of the other values to see if you can increase the number of compressed instructions that can be used. Unclear whether it's worth it or not.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeh, I'm not sure if we should take this approach.
Good side might be some code size reduction, bad side might be it's a bit confusing to read and maintain the code.

Register state = c_rarg1;
Register offset = c_rarg2;
Register limit = c_rarg3;
// use src to contain the original start point of the array.
Register src = x9;

if (multi_block) {
__ sub(limit, limit, offset);
__ add(limit, limit, buf);
__ sub(src, buf, offset);
}
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved

// [args-reg]: x14 - x17
// [temp-reg]: x28 - x31
// [saved-reg]: x18 - x27

// h0/1/2/3/4
const Register a = x14, b = x15, c = x16, d = x17, e = x28;
// w0, w1, ... w15
// put two adjacent w's in one register:
// one at high word part, another at low word part
// at different rounds (even or odd), the w't values reside in different items of ws[].
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
// w0 ~ w15, either reside in
// ws[0] ~ ws[7], where
// w0 at higher 32 bits of ws[0],
// w1 at lower 32 bits of ws[0],
// ...
// w14 at higher 32 bits of ws[7],
// w15 at lower 32 bits of ws[7].
// or, reside in
// w0: ws[0]'s lower 32 bits
// w1 ~ w14: ws[1] ~ ws[7]
// w15: ws[8]'s higher 32 bits
Register ws[9] = {x29, x30, x31, x18,
x19, x20, x21, x22,
x23}; // auxiliary register for calculating w's value
// current k't's value
const Register cur_k = x24;
// current w't's value
const Register cur_w = x25;
// values of a, b, c, d, e in the previous round
const Register prev_ab = x26, prev_cd = x27;
const Register prev_e = offset; // reuse offset/c_rarg2

// load 5 words state into a, b, c, d, e.
//
// To minimize the number of memory operations, we apply following
// optimization: read the states (a/b/c/d) of 4-byte values in pairs,
// with a single ld, and split them into 2 registers.
//
// And, as the core algorithm of SHA-1 works on 32-bits words, so
// in the following code, it does not care about the content of
// higher 32-bits in a/b/c/d/e. Based on this observation,
// we can apply further optimization, which is to just ignore the
// higher 32-bits in a/c/e, rather than set the higher
// 32-bits of a/c/e to zero explicitly with extra instructions.
__ ld(a, Address(state, 0));
__ srli(b, a, 32);
__ ld(c, Address(state, 8));
__ srli(d, c, 32);
__ lw(e, Address(state, 16));

Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
Label L_sha1_loop;
if (multi_block) {
__ BIND(L_sha1_loop);
}

sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);

for (int round = 0; round < 80; round++) {
// prepare K't value
sha1_prepare_k(round, cur_k);

// prepare W't value
sha1_prepare_w(round, cur_w, ws, buf);

// one round process
sha1_process_round(round, a, b, c, d, e, cur_k, cur_w, t2);
}

// compute the intermediate hash value
sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);

if (multi_block) {
__ bge(limit, buf, L_sha1_loop, true);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A small question: Is it OK to continue the loop when limit equals buf?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the answer to your question is at https://github.com/openjdk/jdk/blob/master/src/java.base/share/classes/sun/security/provider/DigestBase.java#L130: the limit is indeed limit - blockSize. Hope this answers your question?

}

const Register mask32 = t2;
__ mv(mask32, 0xffffffff);
// store back the state.
__ andr(a, a, mask32);
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
__ slli(b, b, 32);
__ orr(a, a, b);
__ sd(a, Address(state, 0));
__ andr(c, c, mask32);
Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
__ slli(d, d, 32);
__ orr(c, c, d);
__ sd(c, Address(state, 8));
__ sw(e, Address(state, 16));

Hamlin-Li marked this conversation as resolved.
Show resolved Hide resolved
// return offset
if (multi_block) {
__ sub(c_rarg0, buf, src);
}

__ pop_reg(saved_regs, sp);

__ leave();
__ ret();

return (address) start;
}



#ifdef COMPILER2

static const int64_t right_2_bits = right_n_bits(2);
Expand Down Expand Up @@ -5273,6 +5615,11 @@ static const int64_t right_3_bits = right_n_bits(3);
StubRoutines::_chacha20Block = generate_chacha20Block();
}

if (UseSHA1Intrinsics) {
StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
}

#endif // COMPILER2_OR_JVMCI
}

Expand Down
2 changes: 1 addition & 1 deletion src/hotspot/cpu/riscv/stubRoutines_riscv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ enum platform_dependent_constants {
// simply increase sizes if too small (assembler will crash if too small)
_initial_stubs_code_size = 10000,
_continuation_stubs_code_size = 2000,
_compiler_stubs_code_size = 15000 ZGC_ONLY(+5000),
_compiler_stubs_code_size = 25000 ZGC_ONLY(+5000),
_final_stubs_code_size = 20000 ZGC_ONLY(+10000)
};

Expand Down
Loading