Description
opened on May 14, 2024
For some reason, the `__addkf3` symbol linked by rustc says that `0x00000000000000000000000000000000 + 0x00000000000000000000000000000001 = 0x00000000000000000000000000000000`, when the answer should be `0x00000000000000000000000000000001`. Other symbols likely do the wrong thing here too.
Example:
$ powerpc64-linux-gnu-gcc add_test.c -o add_test.c.ppc64
$ rustc add_test.rs -o add_test.rs.ppc64 --target powerpc64-unknown-linux-gnu -Clinker=powerpc64-linux-gnu-gcc
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.c.ppc64
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.rs.ppc64
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.c.ppc64
0000000000000000000000000000000000 0.000000
0000000000000000000000000000000001 0.000000
0000000000000000000000000000000001 0.000000
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.rs.ppc64
[add_test.rs:12:5] a = 0x00000000000000000000000000000000
[add_test.rs:12:5] b = 0x00000000000000000000000000000001
[add_test.rs:14:5] c = 0x00000000000000000000000000000000
Our Rust code emits `__addkf3`, which calls the hardware f128 addition instruction `xsaddqp`. The part I cannot explain is that the C version also emits `__addkf3`, which also calls `xsaddqp`, but somehow produces a correct result. I checked with GDB to make sure they are both hitting `xsaddqp`, so I can't really explain the discrepancy.
Original notes at rust-lang/compiler-builtins#606 (comment), discussion on Zulip at https://rust-lang.zulipchat.com/#narrow/stream/122651-general/topic/f128.20system.20libraries.20noncompliant.20platforms/near/438486364.
Full code copied from rust-lang/compiler-builtins#606 (comment):
Rust code
#![feature(f128)]
#[no_mangle] // keep the unmangled symbol name so `add_entry` is easy to find in the disassembly
#[inline(never)] // force a real call so the f128 addition is not folded into `main`
fn add_entry(a: f128, b: f128) -> f128 { // adds two f128 values; isolated so its codegen can be inspected
a + b // per this report, the addition is lowered to a call to `__addkf3` on powerpc64
}
fn main() { // reproducer entry point: adds the bit patterns 0x0 and 0x1 as f128 and dumps the operands and the sum
let a = f128::from_bits(0x0); // all 128 bits clear
let b = f128::from_bits(0x1); // only the lowest bit set
dbg!(a, b);
let c = add_entry(a, b); // expected bit pattern 0x…01; the report shows 0x…00 on powerpc64
dbg!(c);
}
C code
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#define _Float128 __float128
/*
 * Raw 128-bit view of a _Float128, split into two 64-bit halves.
 *
 * The member order follows the target byte order so that `upper` is always the
 * most-significant half and `lower` the least-significant half when the struct
 * shares storage with a _Float128.
 *
 * The byte order is taken from the compiler-predefined __BYTE_ORDER__ macros
 * rather than the libc __BYTE_ORDER ones: if the libc macros are not in scope,
 * both sides of `__BYTE_ORDER == __LITTLE_ENDIAN` evaluate to 0 inside #if and
 * the little-endian layout would be selected silently, even on a big-endian
 * target. The defined() guards make a missing macro a hard error instead.
 */
typedef struct {
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    uint64_t lower, upper;
#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    uint64_t upper, lower;
#else
#error missing endian check
#endif
} __attribute__((aligned(_Alignof(_Float128)))) u128; /* aligned like _Float128 so both can share storage */
_Float128 __addkf3(_Float128, _Float128);
/*
 * Print the raw bit pattern of `val` as hex (upper half first, then lower
 * half), followed by its value converted to double.
 *
 * The bits are read through a union instead of casting `&val` to `u128 *`:
 * reading a _Float128 object through an unrelated struct lvalue violates the
 * C effective-type (strict aliasing) rules and may be miscompiled once
 * optimizations are enabled, whereas GCC documents union-based type punning
 * as supported.
 */
void f128_print(_Float128 val) {
    union { _Float128 f; u128 bits; } pun = { .f = val };
    printf("%#018" PRIx64 "%016" PRIx64 " %lf\n", pun.bits.upper, pun.bits.lower, (double)val);
}
/*
 * Build a _Float128 from its raw bit pattern, given as the most-significant
 * (`upper`) and least-significant (`lower`) 64-bit halves.
 *
 * The conversion goes through a union instead of casting a `u128 *` to
 * `_Float128 *`: reading a u128 object through a _Float128 lvalue violates the
 * C effective-type (strict aliasing) rules, whereas GCC documents union-based
 * type punning as supported.
 */
_Float128 new_f128(uint64_t upper, uint64_t lower) {
    union { u128 bits; _Float128 f; } pun = { .bits = { .lower = lower, .upper = upper } };
    return pun.f;
}
/*
 * Add two _Float128 values.
 *
 * With USE_ADDKF3 defined, the __addkf3 routine is called explicitly.
 * Otherwise the compiler lowers `a + b` itself; the disassembly in this
 * report shows that this path also ends up calling __addkf3.
 */
_Float128 add_entry(_Float128 a, _Float128 b) {
#ifdef USE_ADDKF3
return __addkf3(a, b);
#else
return a + b;
#endif
}
/*
 * Reproducer entry point: builds the bit patterns 0x0 and 0x1 as _Float128
 * values, prints both operands, adds them via add_entry, and prints the sum.
 */
int main() {
_Float128 a = new_f128(0x0000000000000000, 0x0000000000000000); /* all 128 bits clear */
_Float128 b = new_f128(0x0000000000000000, 0x0000000000000001); /* only the lowest bit set */
f128_print(a);
f128_print(b);
_Float128 c = add_entry(a, b); /* expected bit pattern: identical to b */
f128_print(c);
return 0;
}
Assembly generated from Rust (incorrect result)
0000000000010d00 <.add_entry>:
10d00: 7c 08 02 a6 mflr r0
10d04: f8 21 ff 91 stdu r1,-112(r1)
10d08: f8 01 00 80 std r0,128(r1)
10d0c: 4b ff c6 95 bl d3a0 <00000143.plt_call.__addkf3>
10d10: e8 41 00 28 ld r2,40(r1)
10d14: 38 21 00 70 addi r1,r1,112
10d18: e8 01 00 10 ld r0,16(r1)
10d1c: 7c 08 03 a6 mtlr r0
10d20: 4e 80 00 20 blr
000000000000d3a0 <00000143.plt_call.__addkf3>:
d3a0: f8 41 00 28 std r2,40(r1)
d3a4: 3d 62 ff ff addis r11,r2,-1
d3a8: e9 8b 7f 58 ld r12,32600(r11)
d3ac: 7d 89 03 a6 mtctr r12
d3b0: e8 4b 7f 60 ld r2,32608(r11)
d3b4: 4e 80 04 20 bctr
0000000000056790 <.__addkf3_resolve>:
56790: 81 2d 8f 9c lwz r9,-28772(r13)
56794: 75 29 00 40 andis. r9,r9,64
56798: 41 82 00 18 beq 567b0 <.__addkf3_resolve+0x20>
5679c: e8 62 80 10 ld r3,-32752(r2)
567a0: 4e 80 00 20 blr
567a4: 60 00 00 00 nop
567a8: 60 00 00 00 nop
567ac: 60 42 00 00 ori r2,r2,0
567b0: e8 62 80 18 ld r3,-32744(r2)
567b4: 4e 80 00 20 blr
...
567c4: 60 00 00 00 nop
567c8: 60 00 00 00 nop
567cc: 60 42 00 00 ori r2,r2,0
000000000005e6e0 <.__addkf3_hw>:
5e6e0: fc 42 18 08 xsaddqp v2,v2,v3
5e6e4: 4e 80 00 20 blr
...
5e6f4: 60 00 00 00 nop
5e6f8: 60 00 00 00 nop
5e6fc: 60 00 00 00 nop
0000000000057050 <.__addkf3_sw>:
57050: fb 41 ff d0 std r26,-48(r1)
57054: fb 61 ff d8 std r27,-40(r1)
57058: fb 81 ff e0 std r28,-32(r1)
5705c: fb a1 ff e8 std r29,-24(r1)
57060: fb c1 ff f0 std r30,-16(r1)
57064: fb e1 ff f8 std r31,-8(r1)
57068: f8 21 ff 41 stdu r1,-192(r1)
5706c: 39 21 00 70 addi r9,r1,112
57070: 39 41 00 70 addi r10,r1,112
// ... full sw implementation
Assembly generated from C (correct result)
0000000010000af8 <.add_entry>:
10000af8: 7c 08 02 a6 mflr r0
10000afc: f8 01 00 10 std r0,16(r1)
10000b00: fb e1 ff f8 std r31,-8(r1)
10000b04: f8 21 ff 81 stdu r1,-128(r1)
10000b08: 7c 3f 0b 78 mr r31,r1
10000b0c: 39 20 00 30 li r9,48
10000b10: 39 5f 00 80 addi r10,r31,128
10000b14: 7c 4a 4f 99 stxvd2x vs34,r10,r9
10000b18: 39 20 00 40 li r9,64
10000b1c: 39 5f 00 80 addi r10,r31,128
10000b20: 7c 6a 4f 99 stxvd2x vs35,r10,r9
10000b24: 39 20 00 40 li r9,64
10000b28: 39 5f 00 80 addi r10,r31,128
10000b2c: 7c 6a 4e 99 lxvd2x vs35,r10,r9
10000b30: 39 20 00 30 li r9,48
10000b34: 39 5f 00 80 addi r10,r31,128
10000b38: 7c 4a 4e 99 lxvd2x vs34,r10,r9
10000b3c: 4b ff fb a5 bl 100006e0 <00000019.plt_call.__addkf3>
10000b40: e8 41 00 28 ld r2,40(r1)
10000b44: f0 02 14 96 xxmr vs0,vs34
10000b48: f0 40 04 91 xxmr vs34,vs0
10000b4c: 38 3f 00 80 addi r1,r31,128
10000b50: e8 01 00 10 ld r0,16(r1)
10000b54: 7c 08 03 a6 mtlr r0
10000b58: eb e1 ff f8 ld r31,-8(r1)
10000b5c: 4e 80 00 20 blr
10000b60: 00 00 00 00 .long 0x0
10000b64: 00 00 00 01 .long 0x1
10000b68: 80 01 00 01 lwz r0,1(r1)
0000000010000c40 <.__addkf3_resolve>:
10000c40: 81 2d 8f 9c lwz r9,-28772(r13)
10000c44: 75 29 00 40 andis. r9,r9,64
10000c48: 41 82 00 18 beq 10000c60 <.__addkf3_resolve+0x20>
10000c4c: e8 62 80 10 ld r3,-32752(r2)
10000c50: 4e 80 00 20 blr
10000c54: 60 00 00 00 nop
10000c58: 60 00 00 00 nop
10000c5c: 60 42 00 00 ori r2,r2,0
10000c60: e8 62 80 18 ld r3,-32744(r2)
10000c64: 4e 80 00 20 blr
...
10000c74: 60 00 00 00 nop
10000c78: 60 00 00 00 nop
10000c7c: 60 42 00 00 ori r2,r2,0
0000000010008b90 <.__addkf3_hw>:
10008b90: fc 42 18 08 xsaddqp v2,v2,v3
10008b94: 4e 80 00 20 blr
...
10008ba4: 60 00 00 00 nop
10008ba8: 60 00 00 00 nop
10008bac: 60 00 00 00 nop
0000000010001500 <.__addkf3_sw>:
10001500: fb 41 ff d0 std r26,-48(r1)
10001504: fb 61 ff d8 std r27,-40(r1)
10001508: fb 81 ff e0 std r28,-32(r1)
1000150c: fb a1 ff e8 std r29,-24(r1)
10001510: fb c1 ff f0 std r30,-16(r1)
10001514: fb e1 ff f8 std r31,-8(r1)
10001518: f8 21 ff 41 stdu r1,-192(r1)
1000151c: 39 21 00 70 addi r9,r1,112
10001520: 39 41 00 70 addi r10,r1,112
// ...