Skip to content
This repository was archived by the owner on Dec 1, 2021. It is now read-only.
This repository was archived by the owner on Dec 1, 2021. It is now read-only.

Vectorized loop crashing? #24

@iwasaki-kenta

Description

@iwasaki-kenta

I tried compile the following C code w/ clang 7.0.0 (trunk 338352) w/ the following command:

clang -O3 -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -S count.c
int CountFilledEntries(char* entries, int len) {
  int result = 0;
  for (int i = 0; i < len; i++) {
    if (entries[i] != 0) result++;
  }

  return result;
}

Given the stub file:

//go:noescape
func _CountFilledEntries(entries unsafe.Pointer, len uint64) (count uint64)

func CountFilledEntries(entries []byte) uint {
	return uint(_CountFilledEntries(
		unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&entries)).Data),
		uint64(len(entries))),
	)
}

Running the test below causes the program to crash. Any possible insight as to why?

func TestCount(t *testing.T) {
	a := []byte{0, 1, 2, 9, 4, 0, 3, 0, 0}
	fmt.Println(CountFilledEntries(a))
}

Assembly:

//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

TEXT ·_CountFilledEntries(SB), $0-24

    MOVQ entries+0(FP), DI
    MOVQ len+8(FP), SI

    WORD $0xf685                 // test    esi, esi
	JLE LBB0_1
    WORD $0xf189                 // mov    ecx, esi
    WORD $0xfe83; BYTE $0x07     // cmp    esi, 7
	JA LBB0_4
    WORD $0xd231                 // xor    edx, edx
    WORD $0xc031                 // xor    eax, eax
	JMP LBB0_11
LBB0_1:
    WORD $0xc031                 // xor    eax, eax
    MOVQ AX, count+16(FP)
    RET
LBB0_4:
    WORD $0xca89                 // mov    edx, ecx
    WORD $0xe283; BYTE $0xf8     // and    edx, -8
    LONG $0xf8728d48             // lea    rsi, [rdx - 8]
    WORD $0x8948; BYTE $0xf0     // mov    rax, rsi
    LONG $0x03e8c148             // shr    rax, 3
    LONG $0x01c08348             // add    rax, 1
    WORD $0x8941; BYTE $0xc0     // mov    r8d, eax
    LONG $0x01e08341             // and    r8d, 1
    WORD $0x8548; BYTE $0xf6     // test    rsi, rsi
	JE LBB0_5
    LONG $0x000001be; BYTE $0x00 // mov    esi, 1
    WORD $0x2948; BYTE $0xc6     // sub    rsi, rax
    WORD $0x014c; BYTE $0xc6     // add    rsi, r8
    LONG $0xffc68348             // add    rsi, -1
    LONG $0xd2ef0f66             // pxor    xmm2, xmm2
    WORD $0xc031                 // xor    eax, eax
    LONG $0xdb760f66             // pcmpeqd    xmm3, xmm3
    LONG $0xc0ef0f66             // pxor    xmm0, xmm0
    LONG $0xc9ef0f66             // pxor    xmm1, xmm1
LBB0_7:
    LONG $0x246e0f66; BYTE $0x07 // movd    xmm4, dword [rdi + rax]
    LONG $0xe2600f66             // punpcklbw    xmm4, xmm2
    LONG $0xe2610f66             // punpcklwd    xmm4, xmm2
    LONG $0x6c6e0f66; WORD $0x0407 // movd    xmm5, dword [rdi + rax + 4]
    LONG $0xea600f66             // punpcklbw    xmm5, xmm2
    LONG $0xea610f66             // punpcklwd    xmm5, xmm2
    LONG $0xe2760f66             // pcmpeqd    xmm4, xmm2
    LONG $0xe3ef0f66             // pxor    xmm4, xmm3
    LONG $0xc4fa0f66             // psubd    xmm0, xmm4
    LONG $0xea760f66             // pcmpeqd    xmm5, xmm2
    LONG $0xebef0f66             // pxor    xmm5, xmm3
    LONG $0xcdfa0f66             // psubd    xmm1, xmm5
    LONG $0x646e0f66; WORD $0x0807 // movd    xmm4, dword [rdi + rax + 8]
    LONG $0xe2600f66             // punpcklbw    xmm4, xmm2
    LONG $0xe2610f66             // punpcklwd    xmm4, xmm2
    LONG $0x6c6e0f66; WORD $0x0c07 // movd    xmm5, dword [rdi + rax + 12]
    LONG $0xea600f66             // punpcklbw    xmm5, xmm2
    LONG $0xea610f66             // punpcklwd    xmm5, xmm2
    LONG $0xe2760f66             // pcmpeqd    xmm4, xmm2
    LONG $0xe3ef0f66             // pxor    xmm4, xmm3
    LONG $0xc4fa0f66             // psubd    xmm0, xmm4
    LONG $0xea760f66             // pcmpeqd    xmm5, xmm2
    LONG $0xebef0f66             // pxor    xmm5, xmm3
    LONG $0xcdfa0f66             // psubd    xmm1, xmm5
    LONG $0x10c08348             // add    rax, 16
    LONG $0x02c68348             // add    rsi, 2
	JNE LBB0_7
    WORD $0x854d; BYTE $0xc0     // test    r8, r8
	JE LBB0_10
LBB0_9:
    LONG $0x546e0f66; WORD $0x0407 // movd    xmm2, dword [rdi + rax + 4]
    LONG $0xdbef0f66             // pxor    xmm3, xmm3
    LONG $0xd3600f66             // punpcklbw    xmm2, xmm3
    LONG $0xd3610f66             // punpcklwd    xmm2, xmm3
    LONG $0xd3760f66             // pcmpeqd    xmm2, xmm3
    LONG $0xe4760f66             // pcmpeqd    xmm4, xmm4
    LONG $0xd4ef0f66             // pxor    xmm2, xmm4
    LONG $0xcafa0f66             // psubd    xmm1, xmm2
    LONG $0x146e0f66; BYTE $0x07 // movd    xmm2, dword [rdi + rax]
    LONG $0xd3600f66             // punpcklbw    xmm2, xmm3
    LONG $0xd3610f66             // punpcklwd    xmm2, xmm3
    LONG $0xd3760f66             // pcmpeqd    xmm2, xmm3
    LONG $0xd4ef0f66             // pxor    xmm2, xmm4
    LONG $0xc2fa0f66             // psubd    xmm0, xmm2
LBB0_10:
    LONG $0xc1fe0f66             // paddd    xmm0, xmm1
    LONG $0xc8700f66; BYTE $0x4e // pshufd    xmm1, xmm0, 78
    LONG $0xc8fe0f66             // paddd    xmm1, xmm0
    LONG $0xc1700f66; BYTE $0xe5 // pshufd    xmm0, xmm1, 229
    LONG $0xc1fe0f66             // paddd    xmm0, xmm1
    LONG $0xc07e0f66             // movd    eax, xmm0
    WORD $0x3948; BYTE $0xca     // cmp    rdx, rcx
	JE LBB0_12
LBB0_11:
    LONG $0x01173c80             // cmp    byte [rdi + rdx], 1
    WORD $0xd883; BYTE $0xff     // sbb    eax, -1
    LONG $0x01c28348             // add    rdx, 1
    WORD $0x3948; BYTE $0xd1     // cmp    rcx, rdx
	JNE LBB0_11
LBB0_12:
    WORD $0x8948; BYTE $0xec     // mov    rsp, rbp
    BYTE $0x5d                   // pop    rbp
    BYTE $0xc3                   // ret
LBB0_5:
    LONG $0xc0ef0f66             // pxor    xmm0, xmm0
    WORD $0xc031                 // xor    eax, eax
    LONG $0xc9ef0f66             // pxor    xmm1, xmm1
    WORD $0x854d; BYTE $0xc0     // test    r8, r8
	JNE LBB0_9
	JMP LBB0_10

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions