
Commit ca6eaaa

mcd500 authored and palmer-dabbelt committed
riscv: __asm_copy_to-from_user: Optimize unaligned memory access and pipeline stall
This patch reduces CPU usage dramatically in kernel space, especially for applications that make system calls with large buffers, such as network applications. The main reason is that every unaligned memory access raises an exception and switches between S-mode and M-mode, causing large overhead.

First, copy bytes until the destination address reaches the first word-aligned boundary. This is the preparation before the bulk aligned word copy.

The destination address is now aligned, but the source address often is not. To reduce unaligned memory accesses, read the source data on aligned boundaries, which leaves the data at an offset, then combine it with the data of the next iteration, fixing the offset by shifting before writing to the destination. The majority of the copy-speed improvement comes from this shift copy.

In the lucky case that both the source and destination addresses are aligned, perform register-sized loads and stores to copy the data. Without unrolling, this would be slower, because a store that uses the same register as the immediately preceding load stalls the pipeline.

Finally, copy the remainder one byte at a time.

Signed-off-by: Akira Tsukamoto <akira.tsukamoto@gmail.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
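To make the shift copy concrete, here is a rough C model of the idea. This is a sketch only: the names shift_copy and WORDSZ are ours, not the kernel's; it assumes a little-endian machine (as RISC-V Linux is), keeps the type-punning deliberately simple, and omits the user-access fixups.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define WORDSZ sizeof(unsigned long)    /* SZREG bytes in the asm */

/*
 * Hypothetical C model of the .Lshift_copy loop: dst is word-aligned,
 * src may not be, and nwords full words are copied.
 */
static void shift_copy(unsigned long *dst, const unsigned char *src,
                       size_t nwords)
{
    size_t off = (uintptr_t)src & (WORDSZ - 1);          /* a3 in the asm */
    const unsigned long *asrc =
        (const unsigned long *)(src - off);              /* src rounded down */
    unsigned int prev_shift = 8 * off;                   /* t3 */
    unsigned int curr_shift = (unsigned int)(8 * WORDSZ) - prev_shift; /* t4 */
    unsigned long prev, curr;

    if (off == 0) {         /* the asm branches to .Lword_copy instead */
        memcpy(dst, src, nwords * WORDSZ);
        return;
    }

    prev = *asrc++;         /* first aligned word, combined with the second */
    while (nwords--) {
        /*
         * Only aligned loads are issued; the extra bytes at either
         * end never leave the aligned word that holds needed bytes,
         * which is why the asm calls this safe.
         */
        curr = *asrc++;
        *dst++ = (prev >> prev_shift) | (curr << curr_shift);
        prev = curr;
    }
}

int main(void)
{
    /* Union forces word alignment so buf+1 is misaligned by one byte */
    static const union {
        unsigned char c[64];
        unsigned long align;
    } u = { "xThe quick brown fox jumps over the lazy dog" };
    unsigned long out[5];

    shift_copy(out, u.c + 1, 5);        /* copy 5 words from offset 1 */
    printf("%.40s\n", (const char *)out);
    return 0;
}

Every dereference of the source happens on an aligned boundary; the byte offset is repaired with one pair of shifts and an OR per destination word, which is where most of the speedup comes from.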
1 parent 31da94c commit ca6eaaa

File tree

1 file changed

+146 -35 lines changed

arch/riscv/lib/uaccess.S

Lines changed: 146 additions & 35 deletions
@@ -19,50 +19,161 @@ ENTRY(__asm_copy_from_user)
 	li t6, SR_SUM
 	csrs CSR_STATUS, t6
 
-	add a3, a1, a2
-	/* Use word-oriented copy only if low-order bits match */
-	andi t0, a0, SZREG-1
-	andi t1, a1, SZREG-1
-	bne t0, t1, 2f
+	/* Save for return value */
+	mv t5, a2
 
-	addi t0, a1, SZREG-1
-	andi t1, a3, ~(SZREG-1)
-	andi t0, t0, ~(SZREG-1)
 	/*
-	 * a3: terminal address of source region
-	 * t0: lowest XLEN-aligned address in source
-	 * t1: highest XLEN-aligned address in source
+	 * Register allocation for code below:
+	 * a0 - start of uncopied dst
+	 * a1 - start of uncopied src
+	 * a2 - size
+	 * t0 - end of uncopied dst
 	 */
-	bgeu t0, t1, 2f
-	bltu a1, t0, 4f
+	add t0, a0, a2
+	bgtu a0, t0, 5f
+
+	/*
+	 * Use byte copy only if too small.
+	 */
+	li a3, 8*SZREG /* size must be larger than size in word_copy */
+	bltu a2, a3, .Lbyte_copy_tail
+
+	/*
+	 * Copy first bytes until dst is aligned to a word boundary.
+	 * a0 - start of dst
+	 * t1 - start of aligned dst
+	 */
+	addi t1, a0, SZREG-1
+	andi t1, t1, ~(SZREG-1)
+	/* dst is already aligned, skip */
+	beq a0, t1, .Lskip_first_bytes
 1:
-	fixup REG_L, t2, (a1), 10f
-	fixup REG_S, t2, (a0), 10f
-	addi a1, a1, SZREG
-	addi a0, a0, SZREG
-	bltu a1, t1, 1b
+	/* a5 - one byte for copying data */
+	fixup lb a5, 0(a1), 10f
+	addi a1, a1, 1 /* src */
+	fixup sb a5, 0(a0), 10f
+	addi a0, a0, 1 /* dst */
+	bltu a0, t1, 1b /* t1 - start of aligned dst */
+
+.Lskip_first_bytes:
+	/*
+	 * Now dst is aligned.
+	 * Use shift-copy if src is misaligned.
+	 * Use word-copy if both src and dst are aligned,
+	 * since word-copy requires no shifting.
+	 */
+	/* a1 - start of src */
+	andi a3, a1, SZREG-1
+	bnez a3, .Lshift_copy
+
+.Lword_copy:
+	/*
+	 * Both src and dst are aligned, unrolled word copy
+	 *
+	 * a0 - start of aligned dst
+	 * a1 - start of aligned src
+	 * a3 - a1 & mask:(SZREG-1)
+	 * t0 - end of aligned dst
+	 */
+	addi t0, t0, -(8*SZREG-1) /* not to overrun */
 2:
-	bltu a1, a3, 5f
+	fixup REG_L a4, 0(a1), 10f
+	fixup REG_L a5, SZREG(a1), 10f
+	fixup REG_L a6, 2*SZREG(a1), 10f
+	fixup REG_L a7, 3*SZREG(a1), 10f
+	fixup REG_L t1, 4*SZREG(a1), 10f
+	fixup REG_L t2, 5*SZREG(a1), 10f
+	fixup REG_L t3, 6*SZREG(a1), 10f
+	fixup REG_L t4, 7*SZREG(a1), 10f
+	fixup REG_S a4, 0(a0), 10f
+	fixup REG_S a5, SZREG(a0), 10f
+	fixup REG_S a6, 2*SZREG(a0), 10f
+	fixup REG_S a7, 3*SZREG(a0), 10f
+	fixup REG_S t1, 4*SZREG(a0), 10f
+	fixup REG_S t2, 5*SZREG(a0), 10f
+	fixup REG_S t3, 6*SZREG(a0), 10f
+	fixup REG_S t4, 7*SZREG(a0), 10f
+	addi a0, a0, 8*SZREG
+	addi a1, a1, 8*SZREG
+	bltu a0, t0, 2b
+
+	addi t0, t0, 8*SZREG-1 /* revert to original value */
+	j .Lbyte_copy_tail
+
+.Lshift_copy:
+
+	/*
+	 * Word copy with shifting.
+	 * For misaligned copy we still perform aligned word copy, but
+	 * we need to use the value fetched from the previous iteration and
+	 * do some shifts.
+	 * This is safe because the over-read is less than a word size.
+	 *
+	 * a0 - start of aligned dst
+	 * a1 - start of src
+	 * a3 - a1 & mask:(SZREG-1)
+	 * t0 - end of uncopied dst
+	 * t1 - end of aligned dst
+	 */
+	/* calculating aligned word boundary for dst */
+	andi t1, t0, ~(SZREG-1)
+	/* Converting unaligned src to aligned src */
+	andi a1, a1, ~(SZREG-1)
+
+	/*
+	 * Calculate shifts
+	 * t3 - prev shift
+	 * t4 - current shift
+	 */
+	slli t3, a3, LGREG
+	li a5, SZREG*8
+	sub t4, a5, t3
+
+	/* Load the first word to combine with the second word */
+	fixup REG_L a5, 0(a1), 10f
 
 3:
+	/* Main shifting copy
+	 *
+	 * a0 - start of aligned dst
+	 * a1 - start of aligned src
+	 * t1 - end of aligned dst
+	 */
+
+	/* At least one iteration will be executed */
+	srl a4, a5, t3
+	fixup REG_L a5, SZREG(a1), 10f
+	addi a1, a1, SZREG
+	sll a2, a5, t4
+	or a2, a2, a4
+	fixup REG_S a2, 0(a0), 10f
+	addi a0, a0, SZREG
+	bltu a0, t1, 3b
+
+	/* Revert src to original unaligned value */
+	add a1, a1, a3
+
+.Lbyte_copy_tail:
+	/*
+	 * Byte copy anything left.
+	 *
+	 * a0 - start of remaining dst
+	 * a1 - start of remaining src
+	 * t0 - end of remaining dst
+	 */
+	bgeu a0, t0, 5f
+4:
+	fixup lb a5, 0(a1), 10f
+	addi a1, a1, 1 /* src */
+	fixup sb a5, 0(a0), 10f
+	addi a0, a0, 1 /* dst */
+	bltu a0, t0, 4b /* t0 - end of dst */
+
+5:
 	/* Disable access to user memory */
 	csrc CSR_STATUS, t6
-	li a0, 0
+	li a0, 0
 	ret
-4: /* Edge case: unalignment */
-	fixup lbu, t2, (a1), 10f
-	fixup sb, t2, (a0), 10f
-	addi a1, a1, 1
-	addi a0, a0, 1
-	bltu a1, t0, 4b
-	j 1b
-5: /* Edge case: remainder */
-	fixup lbu, t2, (a1), 10f
-	fixup sb, t2, (a0), 10f
-	addi a1, a1, 1
-	addi a0, a0, 1
-	bltu a1, a3, 5b
-	j 3b
 ENDPROC(__asm_copy_to_user)
 ENDPROC(__asm_copy_from_user)
 EXPORT_SYMBOL(__asm_copy_to_user)

@@ -117,7 +228,7 @@ EXPORT_SYMBOL(__clear_user)
 10:
 	/* Disable access to user memory */
 	csrs CSR_STATUS, t6
-	mv a0, a2
+	mv a0, t5
 	ret
 11:
 	csrs CSR_STATUS, t6

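For readers following the diff, the overall control flow of the rewritten routine can be modelled in C roughly as below. Again a hedged sketch with invented names (model_copy, WORDSZ); it ignores the fixup error paths and the return-value bookkeeping done through t5, and stands in for the shift copy with plain word-sized memcpy calls (see the sketch above for the real shift trick).

#include <stdint.h>
#include <string.h>

#define WORDSZ sizeof(unsigned long)    /* SZREG bytes in the asm */

static void model_copy(void *dstv, const void *srcv, size_t n)
{
    unsigned char *dst = dstv;
    const unsigned char *src = srcv;
    unsigned char *end = dst + n;               /* t0 */

    /* Use byte copy only if too small (bltu a2, a3, .Lbyte_copy_tail) */
    if (n < 8 * WORDSZ)
        goto byte_copy_tail;

    /* Copy head bytes until dst is word-aligned (label 1:) */
    while ((uintptr_t)dst & (WORDSZ - 1))
        *dst++ = *src++;

    if (((uintptr_t)src & (WORDSZ - 1)) == 0) {
        /*
         * word_copy: stop 8*WORDSZ-1 bytes early so the unrolled
         * body never overruns (addi t0, t0, -(8*SZREG-1)).
         */
        unsigned char *wend = end - (8 * WORDSZ - 1);
        while (dst < wend) {
            unsigned long w[8];
            memcpy(w, src, sizeof(w));      /* 8 loads... */
            memcpy(dst, w, sizeof(w));      /* ...then 8 stores */
            src += 8 * WORDSZ;
            dst += 8 * WORDSZ;
        }
    } else {
        /*
         * .Lshift_copy in the asm; the loop bound matches
         * "andi t1, t0, ~(SZREG-1)" (end rounded down to a word).
         */
        unsigned char *wend =
            (unsigned char *)((uintptr_t)end & ~(WORDSZ - 1));
        while (dst < wend) {
            memcpy(dst, src, WORDSZ);
            dst += WORDSZ;
            src += WORDSZ;
        }
    }

byte_copy_tail:
    /* Byte copy anything left (label 4:) */
    while (dst < end)
        *dst++ = *src++;
}

int main(void)
{
    unsigned char src[200], dst[200];
    for (int i = 0; i < 200; i++)
        src[i] = (unsigned char)i;
    model_copy(dst + 3, src + 1, 150);      /* misaligned head and tail */
    return memcmp(dst + 3, src + 1, 150);   /* 0 on success */
}

The point of issuing all eight loads before any of the stores is that each load has time to complete before its register is consumed; a store placed immediately after the load of the same register would stall the pipeline, which is exactly the hazard the commit message calls out.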