-
Notifications
You must be signed in to change notification settings - Fork 813
/
Copy pathprim_dom_and_2share.sv
173 lines (148 loc) · 5.81 KB
/
prim_dom_and_2share.sv
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// Copyright lowRISC contributors (OpenTitan project).
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Domain-Oriented Masking GF(2) Multiplier with 2-shares
// ref: Higher-Order Side-Channel Protected Implementations of Keccak
// https://eprint.iacr.org/2017/395.pdf
//
// q0 = a0 & b0 + (a0 & b1 + z)
// q1 = a1 & b1 + (a1 & b0 + z)
// () ==> registered
//
// All inputs should be stable for two clock cycles,
// as the output is valid one clock cycle after the inputs are applied.
// z can be taken from another slice of the state,
// as it is fairly random w.r.t. the current inputs.
// General formula of Q in the paper
// Qi = t{i,i} + Sig(j>i,d)(t{i,j}+Z{i+j*(j-1)/2}) + Sig(j<i,d)(t{i,j}+Z{j+i*(i-1)/2})
// for d=1 (NumShare 2 for first order protection)
// Q0 = t{0,0} + Sig(j>0,1)(t{0,j}+Z{j(j-1)/2}) + Sig(j<0,d)(..)
// = a0&b0 + (a0&b1 + z0 + 0)
// Q1 = t{1,1} + sig(j>1,1)(...) + sig(j<1,1)(t{1,j} + Z{j})
// = a1&b1 + (0 + a1&b0 + z0)
`include "prim_assert.sv"
// prim_dom_and_2share: Domain-Oriented Masking (DOM) AND gate operating on two shares.
//
// Data path, per bit of the DW-bit vectors:
//   1. Calculation: four AND terms are formed from the input shares
//        inner-domain: a0_i & b0_i, a1_i & b1_i
//        cross-domain: a0_i & b1_i, a1_i & b0_i
//   2. Resharing:   both cross-domain terms are combined with the same z_i through prim_xor2
//                   and stored in prim_flop_en registers that are enabled by z_valid_i.
//   3. Integration: each inner-domain term is combined with the registered cross-domain term of
//                   its domain through prim_xor2 to form q0_o / q1_o.
//
// prim_xor2 and prim_flop_en are defined outside this file; going by their names, ports and the
// formulas in the file header they are presumably a 2-input XOR and an enable flop, respectively.
//
// Parameters:
//   DW       - width of every data input and output.
//   Pipeline - when 1, the inner-domain terms are registered as well (same enable, z_valid_i);
//              when 0, they feed the integration stage combinationally.
module prim_dom_and_2share #(
parameter int DW = 64, // Input width
parameter bit Pipeline = 1'b0 // Enable full pipelining
) (
// Clock and reset, forwarded to the prim_flop_en instances. The assertions at the end of the
// module are clocked by clk_i and are given !rst_ni as their reset condition.
input clk_i,
input rst_ni,
input [DW-1:0] a0_i, // share0 of a
input [DW-1:0] a1_i, // share1 of a
input [DW-1:0] b0_i, // share0 of b
input [DW-1:0] b1_i, // share1 of b
input z_valid_i, // random number input validity; also the enable of all registers in here
input [DW-1:0] z_i, // random number
output logic [DW-1:0] q0_o, // share0 of q
output logic [DW-1:0] q1_o, // share1 of q
output logic [DW-1:0] prd_o // pseudo-random data for other instances
);
// t0_* / t1_*: reshared cross-domain terms of domain 0 / 1 (_d = flop input, _q = flop output).
logic [DW-1:0] t0_d, t0_q, t1_d, t1_q;
// Inner-domain terms as seen by the integration stage (registered or not, see Pipeline).
logic [DW-1:0] t_a0b0, t_a1b1;
// Inner-domain terms straight out of the AND gates.
logic [DW-1:0] t_a0b0_d, t_a1b1_d;
// Cross-domain terms straight out of the AND gates.
logic [DW-1:0] t_a0b1, t_a1b0;
/////////////////
// Calculation //
/////////////////
// Inner-domain terms: both operands come from the same share index.
assign t_a0b0_d = a0_i & b0_i;
assign t_a1b1_d = a1_i & b1_i;
// Cross-domain terms: operands come from different share indices. These are not used directly
// but only after resharing with z_i and registering.
assign t_a0b1 = a0_i & b1_i;
assign t_a1b0 = a1_i & b0_i;
///////////////
// Resharing //
///////////////
// Resharing of cross-domain terms
// Preserve the logic sequence so that the XOR does not precede the cross-domain AND.
// Both domains are packed into one 2*DW wide instance; the same z_i is applied to both halves.
prim_xor2 #(
.Width ( DW*2 )
) u_prim_xor_t01 (
.in0_i ( {t_a0b1, t_a1b0} ),
.in1_i ( {z_i, z_i} ),
.out_o ( {t0_d, t1_d} )
);
// Register stage
// The reshared terms are only captured while z_valid_i is high, otherwise t0_q/t1_q hold their
// value. The reset value is all-zero.
prim_flop_en #(
.Width ( DW*2 ),
.ResetValue ( '0 )
) u_prim_flop_t01 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( z_valid_i ),
.d_i ( {t0_d, t1_d} ),
.q_o ( {t0_q, t1_q} )
);
/////////////////////////
// Optional Pipelining //
/////////////////////////
if (Pipeline == 1'b1) begin : gen_inner_domain_regs
// Add pipeline registers on inner-domain terms prior to integration. This allows accepting new
// input data every clock cycle and prevents SCA leakage occurring due to the integration of
// reshared cross-domain terms with inner-domain terms derived from different input data.
// The registers use the same enable (z_valid_i) as the cross-domain registers above, so inner-
// and cross-domain terms are captured in the same cycle.
logic [DW-1:0] t_a0b0_q, t_a1b1_q;
prim_flop_en #(
.Width ( DW*2 ),
.ResetValue ( '0 )
) u_prim_flop_tab01 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( z_valid_i ),
.d_i ( {t_a0b0_d, t_a1b1_d} ),
.q_o ( {t_a0b0_q, t_a1b1_q} )
);
assign t_a0b0 = t_a0b0_q;
assign t_a1b1 = t_a1b1_q;
end else begin : gen_no_inner_domain_regs
// Do not add the optional pipeline registers on the inner-domain terms. This saves some area
// in case the multiplier does not need to accept new data in every cycle. However, this can
// cause SCA leakage: during the clock cycle in which new data arrives, the new inner-domain
// terms are integrated with the previous, reshared cross-domain terms.
assign t_a0b0 = t_a0b0_d;
assign t_a1b1 = t_a1b1_d;
end
/////////////////
// Integration //
/////////////////
// Preserve the logic sequence so that the XOR does not precede the inner-domain AND.
// Domain 0: t_a0b0 with t0_q -> q0_o. Domain 1: t_a1b1 with t1_q -> q1_o.
prim_xor2 #(
.Width ( DW*2 )
) u_prim_xor_q01 (
.in0_i ( {t_a0b0, t_a1b1} ),
.in1_i ( {t0_q, t1_q} ),
.out_o ( {q0_o, q1_o} )
);
// Use intermediate results for remasking computations in another instance in the following
// clock cycle. Use one share only. Directly use output of flops updating with z_valid_i.
// t1_q is obtained by remasking t_a1b0 with z_i. Since z_i is uniformly distributed and
// independent of a1/b0_i, t1_q is also uniformly distributed and independent of a1/b0_i.
// For details, see Lemma 1 in Canright, "A very compact 'perfectly masked' S-box for AES
// (corrected)" available at https://eprint.iacr.org/2009/011.pdf
assign prd_o = t1_q;
// DOM AND should be the same as the unmasked computation.
// The correct test sequence is:
// 1. inputs are changed
// 2. check if z_valid_i,
// 3. at the next cycle, inputs are still stable (assumption) - only in case Pipeline = 0
// 4. and results Q == A & B (assertion)
// To speed up the FPV process, assume that z_valid_i is asserted no later than two cycles
// after any of the input shares changed.
`ASSUME_FPV(RandomReadyInShortTime_A,
$changed(a0_i) || $changed(a1_i) || $changed(b0_i) || $changed(b1_i)
|-> ##[0:2] z_valid_i,
clk_i, !rst_ni)
if (Pipeline == 0) begin: g_assert_stable
// If Pipeline is not set, the inner-domain terms reach the integration stage combinationally
// while the cross-domain terms are registered. In this case, the input shares must remain
// stable in the cycle following z_valid_i, i.e., for at least two cycles.
`ASSUME(StableTwoCycles_M,
($changed(a0_i) || $changed(a1_i) || $changed(b0_i) || $changed(b1_i))
##[0:$] z_valid_i |=>
$stable(a0_i) && $stable(a1_i) && $stable(b0_i) && $stable(b1_i))
end
// One cycle after z_valid_i, the unmasked output (q0_o ^ q1_o) must equal the AND of the unmasked
// inputs (a0_i ^ a1_i) and (b0_i ^ b1_i) sampled in the z_valid_i cycle.
`ASSERT(UnmaskedAndMatched_A,
z_valid_i |=> (q0_o ^ q1_o) ==
(($past(a0_i) ^ $past(a1_i)) & ($past(b0_i) ^ $past(b1_i))),
clk_i, !rst_ni)
endmodule