Skip to content

Commit 3b7bc94

Browse files
committed
optimize llmulu_b and fix clock cycle count for lmulu_b
1 parent d663153 commit 3b7bc94

File tree

5 files changed

+78
-31
lines changed

5 files changed

+78
-31
lines changed

src/crt/llmulu_b.src

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,68 +1,75 @@
11
assume adl=1
22

33
section .text
4+
45
public __llmulu_b
6+
57
__llmulu_b:
6-
push af
8+
; Multiplies BC:UDE:UHL by the byte at (SP+3) and returns the low 64 bits of the unsigned product in bc:ude:uhl.
9+
; I: (SP+3) = 8-bit multiplier (first stack argument, just above the return address), BC:UDE:UHL = multiplicand, ADL=1
10+
; O: bc:ude:uhl = BC:UDE:UHL * (SP+3)
11+
; CC: 101*r(PC)+21*r(SPL)+18*w(SPL)+33
12+
; CC: 100 bytes | 101F + 21R + 18W + 33
713
push iy
814
ld iy, 0
915
add iy, sp
1016
push de
1117
push hl
12-
ld a, (iy + 9)
13-
ld h, a
18+
push bc
19+
ld b, (iy + 6)
20+
ld c, 0
21+
ld h, b
1422
mlt hl
1523
ld (iy - 6), l
16-
ld d, a
24+
ld d, b
1725
ld e, (iy - 5)
1826
mlt de
1927
ld l, h
20-
ld h, 0
28+
ld h, c
2129
add hl, de
2230
ld (iy - 5), l
23-
ld d, a
31+
ld d, b
2432
ld e, (iy - 4)
2533
mlt de
2634
ld l, h
27-
ld h, 0
35+
ld h, c
2836
add hl, de
2937
ld (iy - 4), l
30-
ld d, a
38+
ld d, b
3139
ld e, (iy - 3)
3240
mlt de
3341
ld l, h
34-
ld h, 0
42+
ld h, c
3543
add hl, de
3644
ld (iy - 3), l
37-
ld d, a
45+
ld d, b
3846
ld e, (iy - 2)
3947
mlt de
4048
ld l, h
41-
ld h, 0
49+
ld h, c
4250
add hl, de
4351
ld (iy - 2), l
44-
ld d, a
52+
ld d, b
4553
ld e, (iy - 1)
4654
mlt de
4755
ld l, h
48-
ld h, 0
56+
ld h, c
4957
add hl, de
5058
ld (iy - 1), l
51-
ld d, a
52-
ld e, c
53-
mlt de
59+
60+
pop de
5461
ld l, h
55-
ld h, 0
62+
ld c, d
63+
ld d, b
64+
mlt bc
65+
ld h, c
66+
mlt de
67+
5668
add hl, de
69+
ld b, h
5770
ld c, l
58-
ld d, a
59-
ld e, b
60-
mlt de
61-
ld a, h
62-
add a, e
63-
ld b, a
71+
6472
pop hl
6573
pop de
6674
pop iy
67-
pop af
6875
ret

src/crt/lmulu_b.src

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ __lmulu_b:
99
; Multiplies EUHL by A and returns the 32-bit product euhl.
1010
; I: A=multiplier, EUHL=multiplicand, ADL=1
1111
; O: euhl=EUHL*A
12-
; CC: 43*r(PC)+12*r(SPL)+9*w(SPL)+13
13-
; CC: 42 bytes | 43F + 12R + 9W + 13
12+
; CC: 43*r(PC)+12*r(SPL)+9*w(SPL)+17
13+
; CC: 42 bytes | 43F + 12R + 9W + 17
1414
Mul_EUHL_A_EUHL:
1515
push bc
1616
push de
@@ -45,7 +45,7 @@ Mul_EUHL_A_EUHL:
4545
add hl, hl
4646
add hl, hl
4747
add hl, hl
48-
48+
4949
mlt de ; DE = A * L
5050
add hl, de ; UHL = AH.hi + AU.lo, AH.lo + AL.hi, AL.lo
5151

src/crt/lmulu_b_fast.src

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ __lmulu_b_fast:
99
; Multiplies EUHL by A and returns the 32-bit product euhl.
1010
; I: A=multiplier, EUHL=multiplicand, ADL=1
1111
; O: euhl=EUHL*A
12-
; CC: 37*r(PC)+6*r(SPL)+3*w(SPL)+13
13-
; CC: 36 bytes | 37F + 6R + 3W + 13
12+
; CC: 37*r(PC)+6*r(SPL)+3*w(SPL)+17
13+
; CC: 36 bytes | 37F + 6R + 3W + 17
1414
Mul_EUHL_A_EUHL:
1515
dec sp
1616
push hl
@@ -42,7 +42,7 @@ Mul_EUHL_A_EUHL:
4242
add hl, hl
4343
add hl, hl
4444
add hl, hl
45-
45+
4646
mlt de ; DE = A * L
4747
add hl, de ; UHL = AH.hi + AU.lo, AH.lo + AL.hi, AL.lo
4848

test/standalone/mulu_b/src/crt_wrap.asm

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,26 @@ _CRT_lmulu_b_fast:
9090
ld a, (iy + 9)
9191
jp __lmulu_b_fast
9292

93+
public _CRT_llmulu_b
94+
_CRT_llmulu_b:
95+
ld iy, 0
96+
add iy, sp
97+
ld l, (iy + 12)
98+
push hl
99+
ld hl, (iy + 3)
100+
ld de, (iy + 6)
101+
ld bc, (iy + 9)
102+
call __llmulu_b
103+
ld sp, iy
104+
ret
105+
93106
extern __smulu_b
94107
extern __smulu_b_fast
95-
108+
96109
extern __imulu_b
97110
extern __imulu_b_fast
98111

99112
extern __lmulu_b
100113
extern __lmulu_b_fast
114+
115+
extern __llmulu_b

test/standalone/mulu_b/src/main.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ uint24_t CRT_imulu_b_fast(uint24_t, uint8_t);
113113
uint32_t CRT_lmulu_b(uint32_t, uint8_t);
114114
uint32_t CRT_lmulu_b_fast(uint32_t, uint8_t);
115115

116+
uint64_t CRT_llmulu_b(uint64_t, uint8_t);
117+
116118
typedef struct reg_group {
117119
union {
118120
struct {
@@ -177,6 +179,14 @@ static bool test_A_UBC_UD(void) {
177179
return false;
178180
}
179181

182+
static bool test_A(void) {
183+
if (prev_reg.A == next_reg.A) {
184+
return true;
185+
}
186+
print_reg();
187+
return false;
188+
}
189+
180190
int test_smulu_b(void) {
181191
for (int i = 0; i < RANDOM_TEST_COUNT; i++) {
182192
uint16_t truth, guess, x;
@@ -258,6 +268,20 @@ int test_lmulu_b_fast(void) {
258268
return 0;
259269
}
260270

271+
int test_llmulu_b(void) {
272+
for (int i = 0; i < RANDOM_TEST_COUNT; i++) {
273+
uint64_t truth, guess, x;
274+
uint8_t y;
275+
x = rand64();
276+
y = rand8();
277+
truth = x * (uint64_t)y;
278+
guess = CRT_llmulu_b(x, y);
279+
CMP("%016llX", x, y, truth, guess);
280+
C((test_A()));
281+
}
282+
return 0;
283+
}
284+
261285
int run_tests(void) {
262286
srand(AUTOTEST_SEED);
263287
int ret = 0;
@@ -267,6 +291,7 @@ int run_tests(void) {
267291
TEST(test_imulu_b_fast());
268292
TEST(test_lmulu_b());
269293
TEST(test_lmulu_b_fast());
294+
TEST(test_llmulu_b());
270295

271296
return ret;
272297
}

0 commit comments

Comments
 (0)