From 7202dfc08cf2b466f803ca6adaefbeb22726d50d Mon Sep 17 00:00:00 2001
From: Frank Wessels
Date: Mon, 17 Jun 2024 01:10:21 -0700
Subject: [PATCH] Support for ARM SVE and better performance for NEON (#281)

* Support for ARM SVE and better performance for NEON

* Code refactoring to support code generation for both amd64 and arm64
---
 README.md                           |     4 +
 _gen/gen-arm-neon.go                |   419 +
 _gen/gen-arm-sve.go                 |   361 +
 _gen/gen.go                         |    53 +-
 _gen/go.mod                         |     4 +-
 _gen/go.sum                         |     4 +
 galois.go                           |     6 +-
 galois_amd64_test.go                |    15 +
 galois_arm64_test.go                |    19 +
 galois_gen_arm64.go                 |   125 +
 galois_gen_arm64.s                  | 26958 ++++++++++++++++++++++++++
 galois_gen_none.go                  |    38 +-
 galois_gen_switch_amd64.go          |    35 +-
 galois_gen_switch_arm64.go          |   195 +
 galois_gen_switch_nopshufb_amd64.go |    31 +-
 galois_gen_switch_nopshufb_arm64.go |    22 +
 galois_notamd64.go                  |    13 -
 galois_test.go                      |   170 +
 options.go                          |    11 +-
 reedsolomon.go                      |   198 +-
 20 files changed, 28506 insertions(+), 175 deletions(-)
 create mode 100644 _gen/gen-arm-neon.go
 create mode 100644 _gen/gen-arm-sve.go
 create mode 100644 galois_amd64_test.go
 create mode 100644 galois_arm64_test.go
 create mode 100644 galois_gen_arm64.go
 create mode 100644 galois_gen_arm64.s
 create mode 100644 galois_gen_switch_arm64.go
 create mode 100644 galois_gen_switch_nopshufb_arm64.go
 delete mode 100644 galois_notamd64.go

diff --git a/README.md b/README.md
index 8aa4ac7f..d94512a3 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,10 @@ Using Go modules is recommended.
 
 # Changes
 
+## 2024
+
+ * Auto-generation of SVE and NEON routines for ARM, based on the AVX2 code. This results in a 2x speedup for SVE (as measured on AWS Graviton 3) and a 1.5x speedup over the existing NEON-accelerated code.
+
 ## 2022
 
  * [GFNI](https://github.com/klauspost/reedsolomon/pull/224) support for amd64, for up to 3x faster processing.
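Editor's note: the kernels this patch generates all rely on the classic split-nibble table-lookup method for multiplying a whole shard by a GF(2^8) constant c: each byte b is split as b = (hi << 4) | lo, and c*b = Tlow[lo] ^ Thigh[hi], because multiplication distributes over XOR (field addition). That is exactly what the lsr/and/tbl/eor sequences in the generated SVE code below compute, 64 bytes at a time. A minimal standalone Go sketch of the technique (not part of the patch; the 0x11D polynomial matches the library's GF(2^8) field, while the function names and the coefficient 0x1B are purely illustrative):

// gfmul_nibbles.go - editor's standalone illustration, not part of the patch.
package main

import "fmt"

// gfMul multiplies two elements of GF(2^8) modulo the polynomial 0x11D
// (x^8 + x^4 + x^3 + x^2 + 1), carry-less "Russian peasant" style.
func gfMul(a, b byte) byte {
    var p byte
    for b > 0 {
        if b&1 != 0 {
            p ^= a
        }
        carry := a & 0x80
        a <<= 1
        if carry != 0 {
            a ^= 0x1D // low 8 bits of 0x11D
        }
        b >>= 1
    }
    return p
}

// buildNibbleTables returns the 16-entry low/high-nibble product tables for a
// coefficient c, i.e. low[i] = c*i and high[i] = c*(i<<4).
func buildNibbleTables(c byte) (low, high [16]byte) {
    for i := 0; i < 16; i++ {
        low[i] = gfMul(c, byte(i))
        high[i] = gfMul(c, byte(i<<4))
    }
    return
}

func main() {
    c := byte(0x1B)
    low, high := buildNibbleTables(c)
    for _, b := range []byte{0x00, 0x01, 0x53, 0xFF} {
        viaTables := low[b&15] ^ high[b>>4]
        // the direct product and the two-lookup XOR always agree
        fmt.Printf("%#02x * %#02x = %#02x (via tables: %#02x)\n", c, b, gfMul(c, b), viaTables)
    }
}

genCodeGenMatrix in galois.go (renamed from genAvx2Matrix later in this patch) produces the corresponding 16-byte low/high table pairs that the kernels load.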
diff --git a/_gen/gen-arm-neon.go b/_gen/gen-arm-neon.go
new file mode 100644
index 00000000..cebe1acd
--- /dev/null
+++ b/_gen/gen-arm-neon.go
@@ -0,0 +1,419 @@
+// Copyright 2024, Klaus Post/Minio Inc. See LICENSE for details.
+
+package main
+
+import (
+    "bytes"
+    "fmt"
+    "log"
+    "os"
+    "regexp"
+    "strconv"
+    "strings"
+)
+
+func convert2Neon(asmBuf *bytes.Buffer, lines []string) {
+
+    asmF := func(format string, args ...interface{}) {
+        (*asmBuf).WriteString(fmt.Sprintf(format, args...))
+    }
+
+    reAddrMode := regexp.MustCompile(`\[(.*?)\]`) // regexp to match content between square brackets
+
+    getZregister := func(reg string) int {
+        if reg[0] == 'z' {
+            reg = strings.NewReplacer(",", "", ".d", "", ".b", "", "[0]", "").Replace(reg[1:])
+            num, err := strconv.Atoi(reg)
+            if err != nil {
+                panic(err)
+            }
+            return num
+        }
+        return -1
+    }
+
+    getXregister := func(reg string) int {
+        if reg[0] == 'x' {
+            reg = strings.ReplaceAll(reg, ",", "")
+            num, err := strconv.Atoi(reg[1:])
+            if err != nil {
+                panic(err)
+            }
+            return num
+        }
+        return -1
+    }
+
+    getHashImm := func(imm string) int {
+        if imm[0] == '#' {
+            num, err := strconv.Atoi(imm[1:])
+            if err != nil {
+                panic(err)
+            }
+            return num
+        } else {
+            panic("bad immediate")
+        }
+    }
+
+    parseAddrModeMulVl := func(addrMode string) string {
+        addrMode = strings.NewReplacer("[", "", "]", "").Replace(addrMode)
+        f := strings.Fields(addrMode)
+        xn, offset := getXregister(f[0]), ""
+        if len(f) > 1 {
+            if len(f) == 4 && f[2] == "MUL" && f[3] == "VL" {
+                num, err := strconv.Atoi(strings.NewReplacer("#", "", ",", "").Replace(f[1]))
+                if err != nil {
+                    panic(err)
+                }
+                offset = fmt.Sprintf("%d", num*32)
+            } else {
+                panic("bad addressing mode")
+            }
+        }
+
+        return fmt.Sprintf("%s(R%d)", offset, xn)
+    }
+
+    parseAddrModeIndexed := func(addrMode string) (string, string) {
+        addrMode = strings.NewReplacer("[", "", "]", "").Replace(addrMode)
+        f := strings.Fields(addrMode)
+        regbase := getXregister(f[0])
+        regshifted := getXregister(f[1])
+        shift := getHashImm(f[3])
+        return fmt.Sprintf("(R%d)", regbase), fmt.Sprintf("R%d<<%d", regshifted, shift)
+    }
+
+    for _, line := range lines {
+        if strings.Contains(line, " // ldr z") {
+            if matches := reAddrMode.FindAllStringSubmatch(line, -1); len(matches) == 1 {
+                line = strings.ReplaceAll(line, matches[0][0], "ADDRMODE")
+                f := strings.Fields(line)
+                zd := getZregister(f[len(f)-2])
+                var am string
+                { // HACK: ignore offset since we're fixing it at 32
+                    baseReg := strings.Split(matches[0][0], ",")[0]
+                    am = parseAddrModeMulVl(baseReg)
+                }
+                asmF(" VLD1.P 32%s, [V%d.B16, V%d.B16]\n", am, zd*2, zd*2+1)
+            } else {
+                panic("bad 'ldr' instruction")
+            }
+        } else if strings.Contains(line, " // str z") {
+            if matches := reAddrMode.FindAllStringSubmatch(line, -1); len(matches) == 1 {
+                line = strings.ReplaceAll(line, matches[0][0], "ADDRMODE")
+                f := strings.Fields(line)
+                zd := getZregister(f[len(f)-2])
+                var am string
+                { // HACK: ignore offset since we're fixing it at 32
+                    baseReg := strings.Split(matches[0][0], ",")[0]
+                    am = parseAddrModeMulVl(baseReg)
+                }
+                asmF(" VST1.P [V%d.D2, V%d.D2], 32%s\n", zd*2, zd*2+1, am)
+            } else {
+                panic("bad 'str' instruction")
+            }
+        } else if strings.Contains(line, " // ld1d { z") {
+            if matches := reAddrMode.FindAllStringSubmatch(line, -1); len(matches) == 1 {
+                line = strings.ReplaceAll(line, matches[0][0], "ADDRMODE")
+                f := strings.Fields(line)
+                zd := getZregister(f[5])
+                base, shifted := parseAddrModeIndexed(matches[0][0])
+                asmF(" ADD %s, %s\n", shifted, strings.NewReplacer("(", "", ")", "").Replace(base))
+                asmF(" VLD1 %s, [V%d.B16, V%d.B16]\n", base, zd*2, zd*2+1)
+            } else {
+                panic("bad 'ld1d' instruction")
+            }
+        } else if strings.Contains(line, " // st1d { z") {
+            if matches := 
reAddrMode.FindAllStringSubmatch(line, -1); len(matches) == 1 {
+                line = strings.ReplaceAll(line, matches[0][0], "ADDRMODE")
+                f := strings.Fields(line)
+                zd := getZregister(f[5])
+                base, shifted := parseAddrModeIndexed(matches[0][0])
+                asmF(" ADD %s, %s\n", shifted, strings.NewReplacer("(", "", ")", "").Replace(base))
+                asmF(" VST1 [V%d.D2, V%d.D2], %s\n", zd*2, zd*2+1, base)
+            } else {
+                panic("bad 'st1d' instruction")
+            }
+        } else if strings.Contains(line, " // lsr z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-3])
+            zn := getZregister(f[len(f)-2])
+            imm := getHashImm(f[len(f)-1])
+            asmF(" VUSHR $%d, V%d.B16, V%d.B16\n", imm, zn*2, zd*2)
+            asmF(" VUSHR $%d, V%d.B16, V%d.B16\n", imm, zn*2+1, zd*2+1)
+        } else if strings.Contains(line, " // and z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-3])
+            zn := getZregister(f[len(f)-2])
+            zn2 := getZregister(f[len(f)-1])
+            asmF(" VAND V%d.B16, V%d.B16, V%d.B16\n", zn2*2, zn*2, zd*2)
+            asmF(" VAND V%d.B16, V%d.B16, V%d.B16\n", zn2*2 /*+1*/, zn*2+1, zd*2+1)
+        } else if strings.Contains(line, " // tbl z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-3])
+            zn := getZregister(f[len(f)-2])
+            zn2 := getZregister(f[len(f)-1])
+            asmF(" VTBL V%d.B16, [V%d.B16], V%d.B16\n", zn2*2, zn*2, zd*2)
+            asmF(" VTBL V%d.B16, [V%d.B16], V%d.B16\n", zn2*2+1, zn*2+1, zd*2+1)
+        } else if strings.Contains(line, " // eor z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-3])
+            zn := getZregister(f[len(f)-2])
+            zn2 := getZregister(f[len(f)-1])
+            asmF(" VEOR V%d.B16, V%d.B16, V%d.B16\n", zn2*2, zn*2, zd*2)
+            asmF(" VEOR V%d.B16, V%d.B16, V%d.B16\n", zn2*2+1, zn*2+1, zd*2+1)
+        } else if strings.Contains(line, " // mov z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-2])
+            xn := getXregister(f[len(f)-1])
+            asmF(" VMOV R%d, V%d.B[0]\n", xn, zd*2)
+        } else if strings.Contains(line, " // dup z") {
+            f := strings.Fields(line)
+            zd := getZregister(f[len(f)-2])
+            zn := getZregister(f[len(f)-1])
+            asmF(" VDUP V%d.B[0], V%d.B16\n", zn*2, zd*2)
+        } else if strings.Contains(line, " // add x") {
+            f := strings.Fields(line)
+            xd := getXregister(f[len(f)-3])
+            if xd != getXregister(f[len(f)-2]) {
+                panic("registers don't match")
+            }
+            if f[len(f)-1][0] == '#' {
+                imm := getHashImm(f[len(f)-1])
+                asmF(" ADD $%d, R%d\n", imm, xd)
+            } else {
+                xn := getXregister(f[len(f)-1])
+                asmF(" ADD R%d, R%d\n", xn, xd)
+            }
+        } else if strings.Contains(line, " // subs x") {
+            f := strings.Fields(line)
+            xd := getXregister(f[len(f)-3])
+            if xd != getXregister(f[len(f)-2]) {
+                panic("registers don't match")
+            }
+            imm := getHashImm(f[len(f)-1])
+            asmF(" SUBS $%d, R%d\n", imm, xd)
+        } else if strings.Contains(line, " // lsr x") {
+            f := strings.Fields(line)
+            xd := getXregister(f[len(f)-3])
+            if xd != getXregister(f[len(f)-2]) {
+                panic("registers don't match")
+            }
+            imm := getHashImm(f[len(f)-1])
+            asmF(" LSR $%d, R%d\n", imm, xd)
+        } else if strings.Contains(line, " // tst x") {
+            f := strings.Fields(line)
+            xd := getXregister(f[len(f)-2])
+            xn := getXregister(f[len(f)-1])
+            asmF(" TST R%d, R%d\n", xn, xd)
+        } else if strings.Contains(line, " // mov x") {
+            f := strings.Fields(line)
+            xd := getXregister(f[len(f)-2])
+            imm := getHashImm(f[len(f)-1])
+            asmF(" MOVD $%d, R%d\n", imm, xd)
+        } else if strings.HasSuffix(line, ":") ||
+            strings.HasPrefix(line, " BEQ") ||
+            strings.HasPrefix(line, " BNE") ||
+            strings.HasPrefix(line, "TEXT ·mulSve") ||
+            strings.HasPrefix(line, "// func mulSve") {
+            line = strings.ReplaceAll(line, "Sve", 
"Neon") + asmF("%s\n", line) + } else if strings.Contains(line, "Requires: SVE") { + line = strings.ReplaceAll(line, "SVE", "NEON") + asmF("%s\n", line) + } else if strings.Contains(line, " // ptrue p") { + // intentionally drop line + } else if strings.HasPrefix(line, " // ") || + strings.HasPrefix(line, " MOVD ") || + strings.HasPrefix(line, " CMP ") || + strings.HasPrefix(line, " RET") || + len(line) == 0 { + asmF("%s\n", line) + } else { + panic(fmt.Sprintf("convert2Neon unsupported: `%s`", line)) + } + } +} + +func fixPostIncrementNeon(asmBuf *bytes.Buffer, lines []string) { + + asmF := func(format string, args ...interface{}) { + (*asmBuf).WriteString(fmt.Sprintf(format, args...)) + } + + const MATRIX_BASE = "matrix_base" + + skipResetMatrixBase := false + { + routine := strings.Join(lines, "\n") + reFramePtr := regexp.MustCompile(`MOVD\s*` + MATRIX_BASE + `\+\d*(\(FP\),\s*R\d*)`) + + if matches := reFramePtr.FindAllStringSubmatch(routine, -1); len(matches) == 1 { + framePtrToDest := matches[0][1] + + // check if we're loading into register + // more than once from the stack frame + // (meaning we overwrite the 'matrix_base' value) + escaped := strings.NewReplacer("(", `\(`, ")", `\)`).Replace(framePtrToDest) + reSameDest := regexp.MustCompile(`MOVD\s*\w*\+\d*` + escaped) + if m := reSameDest.FindAllStringSubmatch(routine, -1); len(m) == 2 { + skipResetMatrixBase = true + } + } + } + + isXor := false + { + routine := strings.Join(lines, "\n") + isXor = strings.Count(routine, "Xor(SB)") > 0 + } + + resetMatrixBaseAtStartOfLoop := "" + for i := 0; i < len(lines); i++ { + + if !skipResetMatrixBase { + // + // Since we are loading with post-increment, + // reset register holding matrix array at + // start of each loop + // + if strings.Contains(lines[i], MATRIX_BASE) { + resetMatrixBaseAtStartOfLoop = lines[i] + continue + } else if strings.HasSuffix(lines[i], "_loop:") { + asmF("%s\n", lines[i]) + asmF("%s\n", resetMatrixBaseAtStartOfLoop) + resetMatrixBaseAtStartOfLoop = "" + continue + } + } + + // + // Remove the explicit ADDition of the + // pointer to the shard (since we are already + // using post-increments for the loads/stores) + // + if i < len(lines)-1 && + strings.Contains(lines[i], "32(R") && + strings.Contains(lines[i+1], "ADD") && strings.Contains(lines[i+1], "$32, R") { + + storing := strings.Contains(lines[i], "VST1.P") + if storing && isXor { + // move post-increment into a "pre-decrement" to offset + // post-increment for loading of existing content in case of Xor-case + asmF("%s\n", strings.ReplaceAll(lines[i+1], "ADD", "SUB")) + asmF("%s\n", lines[i]) + } else { + asmF("%s\n", lines[i]) + // intentionally skip line with ADD + } + i += 1 + continue + } + if i < len(lines)-2 && + strings.Contains(lines[i], "32(R") && + strings.Contains(lines[i+1], "32(R") && + strings.Contains(lines[i+2], "ADD") && strings.Contains(lines[i+2], "$64, R") { + + storing := strings.Contains(lines[i], "VST1.P") && strings.Contains(lines[i+1], "VST1.P") + if storing && isXor { + // move post-increment into a "pre-decrement" to offset + // post-increment for loading of existing content in case of Xor-case + asmF("%s\n", strings.ReplaceAll(lines[i+2], "ADD", "SUB")) + asmF("%s\n", lines[i]) + asmF("%s\n", lines[i+1]) + } else { + asmF("%s\n", lines[i]) + asmF("%s\n", lines[i+1]) + // intentionally skip line with ADD + } + i += 2 + continue + } + + asmF("%s\n", lines[i]) + } +} + +func genArmNeon() { + const SVE_CODE = "../galois_gen_arm64.s" + + asmOut, goOut := &bytes.Buffer{}, 
&bytes.Buffer{}
+
+    if asmSve, err := os.ReadFile(SVE_CODE); err != nil {
+        log.Fatalf("Failed to read %s: %v", SVE_CODE, err)
+    } else {
+        // start with SVE code
+        asmOut.WriteString(string(asmSve))
+    }
+    goName := strings.ReplaceAll(SVE_CODE, ".s", ".go")
+    if goSve, err := os.ReadFile(goName); err != nil {
+        log.Fatalf("Failed to read %s: %v", goName, err)
+    } else {
+        goOut.WriteString(string(goSve))
+    }
+
+    const input = 10
+
+    // Processing 64-byte variants
+    for output := 1; output <= 3; output++ {
+        for op := ""; len(op) <= 3; op += "Xor" {
+            templName := fmt.Sprintf("mulSve_%dx%d_64%s", input, output, op)
+            funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", strings.ReplaceAll(templName, "Sve", "Neon"))
+
+            lines, err := extractRoutine(SVE_CODE, fmt.Sprintf("TEXT ·%s(SB)", templName))
+            if err != nil {
+                log.Fatal(err)
+            }
+
+            // prepend output with commented out function definition and comment
+            asmOut.WriteString(fmt.Sprintf("// %s\n", funcDef))
+            asmOut.WriteString("// Requires: NEON\n")
+
+            {
+                asmTemp := &bytes.Buffer{}
+                convert2Neon(asmTemp, lines)
+                fixPostIncrementNeon(asmOut, strings.Split(asmTemp.String(), "\n"))
+            }
+
+            // golang declaration
+            goOut.WriteString(fmt.Sprintf("//go:noescape\n%s\n\n", funcDef))
+        }
+    }
+
+    // Processing 32-byte variants
+    for output := 4; output <= 10; output++ {
+        for op := ""; len(op) <= 3; op += "Xor" {
+            templName := fmt.Sprintf("mulSve_%dx%d%s", input, output, op)
+            funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", strings.ReplaceAll(templName, "Sve", "Neon"))
+
+            lines, err := extractRoutine(SVE_CODE, fmt.Sprintf("TEXT ·%s(SB)", templName))
+            if err != nil {
+                log.Fatal(err)
+            }
+
+            // prepend output with commented out function definition and comment
+            asmOut.WriteString(fmt.Sprintf("// %s\n", funcDef))
+            asmOut.WriteString("// Requires: NEON\n")
+
+            {
+                asmTemp := &bytes.Buffer{}
+                convert2Neon(asmTemp, lines)
+                fixPostIncrementNeon(asmOut, strings.Split(asmTemp.String(), "\n"))
+            }
+
+            // golang declaration
+            goOut.WriteString(fmt.Sprintf("//go:noescape\n%s\n", funcDef))
+
+            if !(output == 10 && op == "Xor") {
+                goOut.WriteString("\n")
+            }
+        }
+    }
+    if err := os.WriteFile("../galois_gen_arm64.s", asmOut.Bytes(), 0644); err != nil {
+        log.Fatal(err)
+    }
+    if err := os.WriteFile("../galois_gen_arm64.go", goOut.Bytes(), 0644); err != nil {
+        log.Fatal(err)
+    }
+}
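Editor's note: as the converter above shows, each 256-bit SVE z-register is mapped onto a pair of 128-bit NEON registers, z(n) becoming V(2n)/V(2n+1), and SVE loads/stores become paired NEON loads/stores with post-increment. A quick way to see the mapping is to feed a single annotated SVE line through convert2Neon; an illustrative snippet, to be run inside the _gen package:

    // Editor's sketch (not part of the patch): one SVE load through convert2Neon.
    buf := &bytes.Buffer{}
    convert2Neon(buf, []string{" WORD $0x85804026 // ldr z6, [x1]"})
    fmt.Print(buf.String())
    // prints: VLD1.P 32(R1), [V12.B16, V13.B16]
    // (z6 -> V12/V13, x1 -> R1, post-increment by 32 bytes)

The explicit ADDs that convert2Neon leaves behind are then folded into these post-increments by fixPostIncrementNeon.

diff --git a/_gen/gen-arm-sve.go b/_gen/gen-arm-sve.go
new file mode 100644
index 00000000..64f37b45
--- /dev/null
+++ b/_gen/gen-arm-sve.go
@@ -0,0 +1,361 @@
+// Copyright 2024, Klaus Post/Minio Inc. See LICENSE for details.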
+ +package main + +import ( + "bufio" + "bytes" + "fmt" + "log" + "os" + "regexp" + "strconv" + "strings" + + avxtwo2sve "github.com/fwessels/avxTwo2sve" + sve_as "github.com/fwessels/sve-as" +) + +func patchLabel(line string) string { + return strings.ReplaceAll(line, "AvxTwo", "Sve") +} + +func extractRoutine(filename, routine string) (lines []string, err error) { + file, err := os.Open(filename) + if err != nil { + return + } + defer file.Close() + + // Create a scanner to read the file line by line + scanner := bufio.NewScanner(file) + + // Iterate over each line + collect := false + for scanner.Scan() { + line := scanner.Text() + if strings.HasPrefix(line, routine) { + collect = true + } + if collect { + lines = append(lines, line) + } + if collect && strings.HasSuffix(line, "RET") { + collect = false + } + } + + // Check for any errors that occurred during scanning + err = scanner.Err() + return +} + +func addArmInitializations(instructions []string) (processed []string) { + for _, instr := range instructions { + processed = append(processed, instr) + if strings.HasPrefix(instr, "TEXT ·") { + sve := "ptrue p0.d" + opcode, err := sve_as.Assemble(sve) + if err != nil { + processed = append(processed, fmt.Sprintf(" WORD $0x00000000 // %-44s\n", sve)) + } else { + processed = append(processed, fmt.Sprintf(" WORD $0x%08x // %-44s\n", opcode, sve)) + } + } + } + return +} + +// Expand #defines +func expandHashDefines(instructions []string) (processed []string) { + for _, instr := range instructions { + if strings.Contains(instr, "XOR3WAY") { + f := strings.Fields(instr) + if len(f) >= 3 { + dst := strings.ReplaceAll(f[len(f)-1], ")", "") + b := strings.ReplaceAll(f[len(f)-2], ",", "") + a := strings.ReplaceAll(f[len(f)-3], ",", "") + + processed = append(processed, fmt.Sprintf("VPXOR %s, %s, %s", a, dst, dst)) + processed = append(processed, fmt.Sprintf("VPXOR %s, %s, %s", b, dst, dst)) + } else { + log.Fatalf("Not enough arguments for 'XOR3WAY' macro: %d", len(f)) + } + } else if !strings.Contains(instr, "VZEROUPPER") { + processed = append(processed, instr) + } + } + return +} + +func convertRoutine(asmBuf *bytes.Buffer, instructions []string) { + + asmF := func(format string, args ...interface{}) { + (*asmBuf).WriteString(fmt.Sprintf(format, args...)) + } + + wordOpcode := regexp.MustCompile(`WORD \$0x[0-9a-f]{8}`) + + for _, instr := range instructions { + instr = strings.TrimSpace(instr) + if instr == "" { + asmF("\n") + } else if strings.HasPrefix(instr, "TEXT ") { // function header + asmF("%s\n", patchLabel(instr)) + } else if wordOpcode.MatchString(instr) { // arm code + asmF(" %s\n", instr) + } else if strings.HasPrefix(instr, "//") { // comment + asmF(" %s\n", instr) + } else if strings.HasSuffix(instr, ":") { // label + asmF("%s\n", patchLabel(instr)) + } else { + sve, plan9, err := avxtwo2sve.AvxTwo2Sve(instr, patchLabel) + if err != nil { + panic(err) + } else if !plan9 { + opcode, err := sve_as.Assemble(sve) + if err != nil { + asmF(" WORD $0x00000000 // %-44s\n", sve) + } else { + asmF(" WORD $0x%08x // %-44s\n", opcode, sve) + } + } else { + asmF(" %s\n", sve) + } + } + } +} + +// convert (R..*1) memory accesses into (R..*8) offsets +func patchScaledLoads(code string, outputs int, isXor bool) (patched []string) { + + scaledMemOps := strings.Count(code, "*1)") + if scaledMemOps == 0 { + // in case of no scaled loads, exit out early + return strings.Split(code, "\n") + } + + sanityCheck := outputs + if isXor { + sanityCheck *= 2 // need to load all values as well as store 
them
+    }
+    if scaledMemOps != sanityCheck {
+        panic("Couldn't find expected number of scaled memory ops")
+    }
+
+    scaledReg := ""
+    re := regexp.MustCompile(`R(\d+)\*1`)
+    if match := re.FindStringSubmatch(code); len(match) > 1 {
+        scaledReg = fmt.Sprintf("R%s", match[1])
+    } else {
+        panic("Failed to find register used for scaled memory ops")
+    }
+
+    const inputs = 10
+
+    scaledRegUses := strings.Count(code, scaledReg)
+    sanityCheck += inputs // needed to add start offset to input
+    sanityCheck += 1      // needed to load offset from stack
+    sanityCheck += 1      // needed to increment offset
+
+    if scaledRegUses != sanityCheck {
+        panic("Did not find expected number of uses of scaled register")
+    }
+
+    // Adjust all scaled loads
+    code = strings.ReplaceAll(code, fmt.Sprintf("(%s*1)", scaledReg), fmt.Sprintf("(%s*8)", scaledReg))
+
+    // Adjust increment at end of loop
+    reAdd := regexp.MustCompile(`ADDQ\s*\$(0x[0-9a-f]+),\s*` + scaledReg)
+    if match := reAdd.FindStringSubmatch(code); len(match) > 1 && match[1][:2] == "0x" {
+        if increment, err := strconv.ParseInt(match[1][2:], 16, 64); err == nil {
+            code = strings.ReplaceAll(code, fmt.Sprintf("0x%x, %s", increment, scaledReg), fmt.Sprintf("0x%02x, %s", increment>>3, scaledReg))
+        } else {
+            panic(err)
+        }
+    } else {
+        panic("Failed to find increment of offset")
+    }
+
+    // Add shift instruction during initialization after inputs have been adjusted
+    reShift := regexp.MustCompilePOSIX(fmt.Sprintf(`^[[:blank:]]+ADDQ[[:blank:]]+%s.*$`, scaledReg))
+    if matches := reShift.FindAllStringIndex(code, -1); len(matches) == inputs {
+        lastInpIncr := code[matches[inputs-1][0]:matches[inputs-1][1]]
+        shiftCorrection := strings.ReplaceAll(strings.Split(lastInpIncr, scaledReg)[0], "ADDQ", "SHRQ")
+        shiftCorrection += "$0x03, " + scaledReg
+        code = strings.ReplaceAll(code, lastInpIncr, lastInpIncr+"\n"+shiftCorrection)
+    } else {
+        fmt.Println(matches)
+        panic("Did not find expected number of start offset corrections")
+    }
+
+    return strings.Split(code, "\n")
+}
+
+func fromAvx2ToSve() {
+    asmOut, goOut := &bytes.Buffer{}, &bytes.Buffer{}
+
+    goOut.WriteString(`// Code generated by command: go generate ` + os.Getenv("GOFILE") + `. 
DO NOT EDIT.` + "\n\n")
+    goOut.WriteString("//go:build !noasm && !appengine && !gccgo && !nopshufb\n\n")
+    goOut.WriteString("package reedsolomon\n\n")
+
+    const input = 10
+    const AVX2_CODE = "../galois_gen_amd64.s"
+
+    // Processing 64-byte variants
+    for output := 1; output <= 3; output++ {
+        for op := ""; len(op) <= 3; op += "Xor" {
+            templName := fmt.Sprintf("mulAvxTwo_%dx%d_64%s", input, output, op)
+            funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", strings.ReplaceAll(templName, "AvxTwo", "Sve"))
+
+            // asm first
+            lines, err := extractRoutine(AVX2_CODE, fmt.Sprintf("TEXT ·%s(SB)", templName))
+            if err != nil {
+                log.Fatal(err)
+            }
+            lines = patchScaledLoads(strings.Join(lines, "\n"), output, strings.HasSuffix(templName, "Xor"))
+            lines = expandHashDefines(lines)
+
+            convertRoutine(asmOut, lines)
+
+            // add newline after RET
+            asmOut.WriteString("\n")
+
+            // golang declaration
+            goOut.WriteString(fmt.Sprintf("//go:noescape\n%s\n\n", funcDef))
+        }
+    }
+
+    // Processing 32-byte variants
+    for output := 4; output <= 10; output++ {
+        for op := ""; len(op) <= 3; op += "Xor" {
+            templName := fmt.Sprintf("mulAvxTwo_%dx%d%s", input, output, op)
+            funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", strings.ReplaceAll(templName, "AvxTwo", "Sve"))
+
+            // asm first
+            lines, err := extractRoutine(AVX2_CODE, fmt.Sprintf("TEXT ·%s(SB)", templName))
+            if err != nil {
+                log.Fatal(err)
+            }
+            lines = patchScaledLoads(strings.Join(lines, "\n"), output, strings.HasSuffix(templName, "Xor"))
+            lines = expandHashDefines(lines)
+
+            // add additional initialization for SVE
+            // (for predicated loads and stores in
+            // case of register shortage)
+            lines = addArmInitializations(lines)
+
+            convertRoutine(asmOut, lines)
+
+            // add newline after RET
+            asmOut.WriteString("\n")
+
+            // golang declaration
+            goOut.WriteString(fmt.Sprintf("//go:noescape\n%s\n\n", funcDef))
+        }
+    }
+
+    if err := os.WriteFile("../galois_gen_arm64.s", asmOut.Bytes(), 0644); err != nil {
+        log.Fatal(err)
+    }
+    if err := os.WriteFile("../galois_gen_arm64.go", goOut.Bytes(), 0644); err != nil {
+        log.Fatal(err)
+    }
+}
+
+func insertEarlyExit(lines []string, funcName string, outputs int) (processed []string) {
+
+    const reg = "R16"
+    label := funcName + "_store"
+
+    reComment := regexp.MustCompile(fmt.Sprintf(`// Load and process \d* bytes from input (\d*) to %d outputs`, outputs))
+    reLoop := regexp.MustCompile(`^` + strings.ReplaceAll(label, "store", "loop") + `:`)
+    reStore := regexp.MustCompile(fmt.Sprintf(`// Store %d outputs`, outputs))
+
+    for _, line := range lines {
+        if matches := reLoop.FindAllStringSubmatch(line, -1); len(matches) == 1 {
+            lastline := processed[len(processed)-1]
+            processed = processed[:len(processed)-1]
+            processed = append(processed, "")
+            processed = append(processed, " // Load number of input shards")
+            processed = append(processed, fmt.Sprintf(" MOVD in_len+32(FP), %s", reg))
+            processed = append(processed, lastline)
+        }
+
+        if matches := reComment.FindAllStringSubmatch(line, -1); len(matches) == 1 {
+            if inputs, err := strconv.Atoi(matches[0][1]); err != nil {
+                panic(err)
+            } else {
+                if inputs > 0 && inputs < 10 {
+                    lastline := processed[len(processed)-1]
+                    processed = processed[:len(processed)-1]
+                    processed = append(processed, " // Check for early termination")
+                    processed = append(processed, fmt.Sprintf(" CMP $%d, %s", inputs, reg))
+                    processed = append(processed, fmt.Sprintf(" BEQ %s", 
label)) + processed = append(processed, lastline) + } + } + } + + if matches := reStore.FindAllStringSubmatch(line, -1); len(matches) == 1 { + processed = append(processed, fmt.Sprintf("%s:", label)) + } + + processed = append(processed, line) + } + return +} + +func addEarlyExit(arch string) { + const filename = "../galois_gen_arm64.s" + asmOut := &bytes.Buffer{} + + asmOut.WriteString(`// Code generated by command: go generate ` + os.Getenv("GOFILE") + `. DO NOT EDIT.` + "\n\n") + asmOut.WriteString("//go:build !appengine && !noasm && !nogen && !nopshufb && gc\n\n") + asmOut.WriteString(`#include "textflag.h"` + "\n\n") + + input := 10 + for outputs := 1; outputs <= 3; outputs++ { + for op := ""; len(op) <= 3; op += "Xor" { + funcName := fmt.Sprintf("mul%s_%dx%d_64%s", arch, input, outputs, op) + funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", funcName) + + lines, _ := extractRoutine(filename, fmt.Sprintf("TEXT ·%s(SB)", funcName)) + + // prepend output with commented out function definition and comment + asmOut.WriteString(fmt.Sprintf("// %s\n", funcDef)) + asmOut.WriteString("// Requires: SVE\n") + + lines = insertEarlyExit(lines, funcName, outputs) + + asmOut.WriteString(strings.Join(lines, "\n")) + asmOut.WriteString("\n\n") + } + } + + for outputs := 4; outputs <= 10; outputs++ { + for op := ""; len(op) <= 3; op += "Xor" { + funcName := fmt.Sprintf("mul%s_%dx%d%s", arch, input, outputs, op) + funcDef := fmt.Sprintf("func %s(matrix []byte, in [][]byte, out [][]byte, start int, n int)", funcName) + + lines, _ := extractRoutine(filename, fmt.Sprintf("TEXT ·%s(SB)", funcName)) + + // prepend output with commented out function definition and comment + asmOut.WriteString(fmt.Sprintf("// %s\n", funcDef)) + asmOut.WriteString("// Requires: SVE\n") + + lines = insertEarlyExit(lines, funcName, outputs) + asmOut.WriteString(strings.Join(lines, "\n")) + asmOut.WriteString("\n\n") + } + } + + if err := os.WriteFile("../galois_gen_arm64.s", asmOut.Bytes(), 0644); err != nil { + log.Fatal(err) + } +} + +func genArmSve() { + fromAvx2ToSve() + addEarlyExit("Sve") +} diff --git a/_gen/gen.go b/_gen/gen.go index b8429072..2fa5797d 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -89,6 +89,11 @@ func main() { genSwitch() genGF16() genGF8() + + if pshufb { + genArmSve() + genArmNeon() + } Generate() } @@ -121,12 +126,48 @@ import ( ) `) + avx2funcs := string(` fAvx2 = galMulSlicesAvx2 + fAvx2Xor = galMulSlicesAvx2Xor +`) + + hasCodeGenImpl := string(` return &fAvx2, &fAvx2Xor, codeGen && pshufb && r.o.useAVX2 && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs`) + + if !pshufb { + avx2funcs = "" + hasCodeGenImpl = ` return nil, nil, false // no code generation for generic case (only GFNI cases) ` + } + w.WriteString(fmt.Sprintf(`const ( -avx2CodeGen = true -maxAvx2Inputs = %d -maxAvx2Outputs = %d -minAvx2Size = 64 -)`, inputMax, outputMax)) +codeGen = true +codeGenMaxGoroutines = 8 +codeGenMaxInputs = %d +codeGenMaxOutputs = %d +minCodeGenSize = 64 +) + +var ( +%s fGFNI = galMulSlicesGFNI + fGFNIXor = galMulSlicesGFNIXor + fAvxGFNI = galMulSlicesAvxGFNI + fAvxGFNIXor = galMulSlicesAvxGFNIXor +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { +%s +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) 
int, ok bool) { + if r.o.useAvx512GFNI { + return &fGFNI, &fGFNIXor, codeGen && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fAvxGFNI, &fAvxGFNIXor, codeGen && r.o.useAvxGNFI && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +}`, inputMax, outputMax, avx2funcs, hasCodeGenImpl)) if !pshufb { w.WriteString("\n\nfunc galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`)}\n") @@ -413,7 +454,7 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { for _, ptr := range inPtrs { ADDQ(offset, ptr) } - // Offset no longer needed unless not regdst + // Offset no longer needed unless not regDst tmpMask := GP64() MOVQ(U32(15), tmpMask) diff --git a/_gen/go.mod b/_gen/go.mod index f6d1dca4..d1406bf1 100644 --- a/_gen/go.mod +++ b/_gen/go.mod @@ -1,6 +1,6 @@ module github.com/klauspost/reedsolomon/_gen -go 1.19 +go 1.21.5 require ( github.com/klauspost/asmfmt v1.3.1 @@ -8,6 +8,8 @@ require ( ) require ( + github.com/fwessels/avxTwo2sve v0.0.0-20240611172111-6b8528700471 // indirect + github.com/fwessels/sve-as v0.0.0-20240611015707-daffc010447f // indirect golang.org/x/mod v0.6.0 // indirect golang.org/x/sys v0.1.0 // indirect golang.org/x/tools v0.2.0 // indirect diff --git a/_gen/go.sum b/_gen/go.sum index 5aa25310..4938f100 100644 --- a/_gen/go.sum +++ b/_gen/go.sum @@ -1,3 +1,7 @@ +github.com/fwessels/avxTwo2sve v0.0.0-20240611172111-6b8528700471 h1:omdgAKxePZxbMC7HZPw99QMPeH7fKh3t2QRSZ0YFA/0= +github.com/fwessels/avxTwo2sve v0.0.0-20240611172111-6b8528700471/go.mod h1:9+ibRsEIs0vLXkalKCGEbZfVS4fafeIvMvM9GvIsdeQ= +github.com/fwessels/sve-as v0.0.0-20240611015707-daffc010447f h1:HQud3yIU82LdkQzHEYiSJs73wCHjprIqeZE9JvSjKbQ= +github.com/fwessels/sve-as v0.0.0-20240611015707-daffc010447f/go.mod h1:j3s7EY79XxNMyjx/54Vo6asZafWU4yijB+KIfj4hrh8= github.com/klauspost/asmfmt v1.3.1 h1:7xZi1N7s9gTLbqiM8KUv8TLyysavbTRGBT5/ly0bRtw= github.com/klauspost/asmfmt v1.3.1/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= github.com/mmcloughlin/avo v0.5.1-0.20221128045730-bf1d05562091 h1:C2c8ttOBeyhs1SvyCXVPCFd0EqtPiTKGnMWQ+JkM0Lc= diff --git a/galois.go b/galois.go index 697f9ca6..9b363950 100644 --- a/galois.go +++ b/galois.go @@ -910,8 +910,8 @@ func galExp(a byte, n int) byte { return expTable[uint8(logResult)] } -func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { - if !avx2CodeGen { +func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { + if !codeGen { panic("codegen not enabled") } total := inputs * outputs @@ -942,7 +942,7 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) var gf2p811dMulMatrices = [256]uint64{0, 0x102040810204080, 0x8001828488102040, 0x8103868c983060c0, 0x408041c2c4881020, 0x418245cad4a850a0, 0xc081c3464c983060, 0xc183c74e5cb870e0, 0x2040a061e2c48810, 0x2142a469f2e4c890, 0xa04122e56ad4a850, 0xa14326ed7af4e8d0, 0x60c0e1a3264c9830, 0x61c2e5ab366cd8b0, 0xe0c16327ae5cb870, 0xe1c3672fbe7cf8f0, 0x102050b071e2c488, 0x112254b861c28408, 0x9021d234f9f2e4c8, 0x9123d63ce9d2a448, 0x50a01172b56ad4a8, 0x51a2157aa54a9428, 0xd0a193f63d7af4e8, 0xd1a397fe2d5ab468, 0x3060f0d193264c98, 0x3162f4d983060c18, 0xb06172551b366cd8, 0xb163765d0b162c58, 0x70e0b11357ae5cb8, 0x71e2b51b478e1c38, 0xf0e13397dfbe7cf8, 0xf1e3379fcf9e3c78, 0x8810a8d83871e2c4, 0x8912acd02851a244, 
0x8112a5cb061c284, 0x9132e54a0418204, 0xc890e91afcf9f2e4, 0xc992ed12ecd9b264, 0x48916b9e74e9d2a4, 0x49936f9664c99224, 0xa85008b9dab56ad4, 0xa9520cb1ca952a54, 0x28518a3d52a54a94, 0x29538e3542850a14, 0xe8d0497b1e3d7af4, 0xe9d24d730e1d3a74, 0x68d1cbff962d5ab4, 0x69d3cff7860d1a34, 0x9830f8684993264c, 0x9932fc6059b366cc, 0x18317aecc183060c, 0x19337ee4d1a3468c, 0xd8b0b9aa8d1b366c, 0xd9b2bda29d3b76ec, 0x58b13b2e050b162c, 0x59b33f26152b56ac, 0xb8705809ab57ae5c, 0xb9725c01bb77eedc, 0x3871da8d23478e1c, 0x3973de853367ce9c, 0xf8f019cb6fdfbe7c, 0xf9f21dc37ffffefc, 0x78f19b4fe7cf9e3c, 0x79f39f47f7efdebc, 0xc488d46c1c3871e2, 0xc58ad0640c183162, 0x448956e8942851a2, 0x458b52e084081122, 0x840895aed8b061c2, 0x850a91a6c8902142, 0x409172a50a04182, 0x50b132240800102, 0xe4c8740dfefcf9f2, 0xe5ca7005eedcb972, 0x64c9f68976ecd9b2, 0x65cbf28166cc9932, 0xa44835cf3a74e9d2, 0xa54a31c72a54a952, 0x2449b74bb264c992, 0x254bb343a2448912, 0xd4a884dc6ddab56a, 0xd5aa80d47dfaf5ea, 0x54a90658e5ca952a, 0x55ab0250f5ead5aa, 0x9428c51ea952a54a, 0x952ac116b972e5ca, 0x1429479a2142850a, 0x152b43923162c58a, 0xf4e824bd8f1e3d7a, 0xf5ea20b59f3e7dfa, 0x74e9a639070e1d3a, 0x75eba231172e5dba, 0xb468657f4b962d5a, 0xb56a61775bb66dda, 0x3469e7fbc3860d1a, 0x356be3f3d3a64d9a, 0x4c987cb424499326, 0x4d9a78bc3469d3a6, 0xcc99fe30ac59b366, 0xcd9bfa38bc79f3e6, 0xc183d76e0c18306, 0xd1a397ef0e1c386, 0x8c19bff268d1a346, 0x8d1bbbfa78f1e3c6, 0x6cd8dcd5c68d1b36, 0x6ddad8ddd6ad5bb6, 0xecd95e514e9d3b76, 0xeddb5a595ebd7bf6, 0x2c589d1702050b16, 0x2d5a991f12254b96, 0xac591f938a152b56, 0xad5b1b9b9a356bd6, 0x5cb82c0455ab57ae, 0x5dba280c458b172e, 0xdcb9ae80ddbb77ee, 0xddbbaa88cd9b376e, 0x1c386dc69123478e, 0x1d3a69ce8103070e, 0x9c39ef42193367ce, 0x9d3beb4a0913274e, 0x7cf88c65b76fdfbe, 0x7dfa886da74f9f3e, 0xfcf90ee13f7ffffe, 0xfdfb0ae92f5fbf7e, 0x3c78cda773e7cf9e, 0x3d7ac9af63c78f1e, 0xbc794f23fbf7efde, 0xbd7b4b2bebd7af5e, 0xe2c46a368e1c3871, 0xe3c66e3e9e3c78f1, 0x62c5e8b2060c1831, 0x63c7ecba162c58b1, 0xa2442bf44a942851, 0xa3462ffc5ab468d1, 0x2245a970c2840811, 0x2347ad78d2a44891, 0xc284ca576cd8b061, 0xc386ce5f7cf8f0e1, 0x428548d3e4c89021, 0x43874cdbf4e8d0a1, 0x82048b95a850a041, 0x83068f9db870e0c1, 0x205091120408001, 0x3070d193060c081, 0xf2e43a86fffefcf9, 0xf3e63e8eefdebc79, 0x72e5b80277eedcb9, 0x73e7bc0a67ce9c39, 0xb2647b443b76ecd9, 0xb3667f4c2b56ac59, 0x3265f9c0b366cc99, 0x3367fdc8a3468c19, 0xd2a49ae71d3a74e9, 0xd3a69eef0d1a3469, 0x52a51863952a54a9, 0x53a71c6b850a1429, 0x9224db25d9b264c9, 0x9326df2dc9922449, 0x122559a151a24489, 0x13275da941820409, 0x6ad4c2eeb66ddab5, 0x6bd6c6e6a64d9a35, 0xead5406a3e7dfaf5, 0xebd744622e5dba75, 0x2a54832c72e5ca95, 0x2b56872462c58a15, 0xaa5501a8faf5ead5, 0xab5705a0ead5aa55, 0x4a94628f54a952a5, 0x4b96668744891225, 0xca95e00bdcb972e5, 0xcb97e403cc993265, 0xa14234d90214285, 0xb16274580010205, 0x8a15a1c9183162c5, 0x8b17a5c108112245, 0x7af4925ec78f1e3d, 0x7bf69656d7af5ebd, 0xfaf510da4f9f3e7d, 0xfbf714d25fbf7efd, 0x3a74d39c03070e1d, 0x3b76d79413274e9d, 0xba7551188b172e5d, 0xbb7755109b376edd, 0x5ab4323f254b962d, 0x5bb63637356bd6ad, 0xdab5b0bbad5bb66d, 0xdbb7b4b3bd7bf6ed, 0x1a3473fde1c3860d, 0x1b3677f5f1e3c68d, 0x9a35f17969d3a64d, 0x9b37f57179f3e6cd, 0x264cbe5a92244993, 0x274eba5282040913, 0xa64d3cde1a3469d3, 0xa74f38d60a142953, 0x66ccff9856ac59b3, 0x67cefb90468c1933, 0xe6cd7d1cdebc79f3, 0xe7cf7914ce9c3973, 0x60c1e3b70e0c183, 0x70e1a3360c08103, 0x860d9cbff8f0e1c3, 0x870f98b7e8d0a143, 0x468c5ff9b468d1a3, 0x478e5bf1a4489123, 0xc68ddd7d3c78f1e3, 0xc78fd9752c58b163, 0x366ceeeae3c68d1b, 0x376eeae2f3e6cd9b, 0xb66d6c6e6bd6ad5b, 0xb76f68667bf6eddb, 
0x76ecaf28274e9d3b, 0x77eeab20376eddbb, 0xf6ed2dacaf5ebd7b, 0xf7ef29a4bf7efdfb, 0x162c4e8b0102050b, 0x172e4a831122458b, 0x962dcc0f8912254b, 0x972fc807993265cb, 0x56ac0f49c58a152b, 0x57ae0b41d5aa55ab, 0xd6ad8dcd4d9a356b, 0xd7af89c55dba75eb, 0xae5c1682aa55ab57, 0xaf5e128aba75ebd7, 0x2e5d940622458b17, 0x2f5f900e3265cb97, 0xeedc57406eddbb77, 0xefde53487efdfbf7, 0x6eddd5c4e6cd9b37, 0x6fdfd1ccf6eddbb7, 0x8e1cb6e348912347, 0x8f1eb2eb58b163c7, 0xe1d3467c0810307, 0xf1f306fd0a14387, 0xce9cf7218c193367, 0xcf9ef3299c3973e7, 0x4e9d75a504091327, 0x4f9f71ad142953a7, 0xbe7c4632dbb76fdf, 0xbf7e423acb972f5f, 0x3e7dc4b653a74f9f, 0x3f7fc0be43870f1f, 0xfefc07f01f3f7fff, 0xfffe03f80f1f3f7f, 0x7efd8574972f5fbf, 0x7fff817c870f1f3f, 0x9e3ce6533973e7cf, 0x9f3ee25b2953a74f, 0x1e3d64d7b163c78f, 0x1f3f60dfa143870f, 0xdebca791fdfbf7ef, 0xdfbea399eddbb76f, 0x5ebd251575ebd7af, 0x5fbf211d65cb972f} func genGFNIMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []uint64) []uint64 { - if !avx2CodeGen { + if !codeGen { panic("codegen not enabled") } total := inputs * outputs diff --git a/galois_amd64_test.go b/galois_amd64_test.go new file mode 100644 index 00000000..23ed18d3 --- /dev/null +++ b/galois_amd64_test.go @@ -0,0 +1,15 @@ +//go:build !noasm && !appengine && !gccgo && !nopshufb + +// Copyright 2015, Klaus Post, see LICENSE for details. + +package reedsolomon + +import ( + "testing" +) + +func TestGenGalois(t *testing.T) { + if defaultOptions.useAVX2 { + testGenGaloisUpto10x10(t, galMulSlicesAvx2, galMulSlicesAvx2Xor) + } +} diff --git a/galois_arm64_test.go b/galois_arm64_test.go new file mode 100644 index 00000000..736d46bd --- /dev/null +++ b/galois_arm64_test.go @@ -0,0 +1,19 @@ +//go:build !noasm && !appengine && !gccgo && !nopshufb + +// Copyright 2015, Klaus Post, see LICENSE for details. +// Copyright 2024, Minio, Inc. + +package reedsolomon + +import ( + "testing" +) + +func TestGenGalois(t *testing.T) { + if defaultOptions.useSVE { + testGenGaloisUpto10x10(t, galMulSlicesSve, galMulSlicesSveXor) + } + if defaultOptions.useNEON { + testGenGaloisUpto10x10(t, galMulSlicesNeon, galMulSlicesNeonXor) + } +} diff --git a/galois_gen_arm64.go b/galois_gen_arm64.go new file mode 100644 index 00000000..2f871903 --- /dev/null +++ b/galois_gen_arm64.go @@ -0,0 +1,125 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. 
+ +//go:build !noasm && !appengine && !gccgo && !nopshufb + +package reedsolomon + +//go:noescape +func mulSve_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulSve_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func 
mulNeon_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func mulNeon_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) diff --git a/galois_gen_arm64.s b/galois_gen_arm64.s new file mode 100644 index 00000000..335b94c3 --- /dev/null +++ b/galois_gen_arm64.s @@ -0,0 +1,26958 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. + +//go:build !appengine && !noasm && !nogen && !nopshufb && gc + +#include "textflag.h" + +// func mulSve_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x1_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to output + WORD $0x8b0f01ce // add x14, x14, x15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd28001ef // mov x15, #15 + WORD $0x05e039e2 // mov z2.d, x15 + WORD $0x05212042 // dup z2.b, z2.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + WORD $0x85804026 // ldr z6, [x1] + WORD $0x85804425 // ldr z5, [x1, #1, MUL VL] + WORD $0x91010021 // add x1, x1, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804043 // ldr z3, [x2] + WORD $0x85804444 // ldr z4, [x2, #1, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33080 // eor z0.d, z4.d, z3.d + WORD $0x04a530c1 // eor z1.d, z6.d, z5.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 1 to 1 outputs + WORD $0x85804086 // ldr z6, [x4] + WORD $0x85804485 // ldr z5, [x4, #1, MUL VL] + WORD $0x91010084 // add x4, x4, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804843 // ldr z3, [x2, #2, MUL VL] + WORD $0x85804c44 // ldr z4, [x2, #3, MUL VL] + WORD $0x05253065 // tbl z5.b, 
z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 2 to 1 outputs + WORD $0x858040a6 // ldr z6, [x5] + WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805043 // ldr z3, [x2, #4, MUL VL] + WORD $0x85805444 // ldr z4, [x2, #5, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 3 to 1 outputs + WORD $0x85804106 // ldr z6, [x8] + WORD $0x85804505 // ldr z5, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805843 // ldr z3, [x2, #6, MUL VL] + WORD $0x85805c44 // ldr z4, [x2, #7, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 4 to 1 outputs + WORD $0x85804126 // ldr z6, [x9] + WORD $0x85804525 // ldr z5, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814043 // ldr z3, [x2, #8, MUL VL] + WORD $0x85814444 // ldr z4, [x2, #9, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 5 to 1 outputs + WORD $0x85804146 // ldr z6, [x10] + WORD $0x85804545 // ldr z5, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and 
z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814843 // ldr z3, [x2, #10, MUL VL] + WORD $0x85814c44 // ldr z4, [x2, #11, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 6 to 1 outputs + WORD $0x85804166 // ldr z6, [x11] + WORD $0x85804565 // ldr z5, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815043 // ldr z3, [x2, #12, MUL VL] + WORD $0x85815444 // ldr z4, [x2, #13, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 7 to 1 outputs + WORD $0x85804186 // ldr z6, [x12] + WORD $0x85804585 // ldr z5, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815843 // ldr z3, [x2, #14, MUL VL] + WORD $0x85815c44 // ldr z4, [x2, #15, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 8 to 1 outputs + WORD $0x858041a6 // ldr z6, [x13] + WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] + WORD $0x910101ad // add x13, x13, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824043 // ldr z3, [x2, #16, MUL VL] + WORD $0x85824444 // ldr z4, [x2, #17, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x1_64_store + + // Load and process 64 bytes from input 9 to 1 outputs + WORD $0x85804066 // ldr 
z6, [x3] + WORD $0x85804465 // ldr z5, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824843 // ldr z3, [x2, #18, MUL VL] + WORD $0x85824c44 // ldr z4, [x2, #19, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + +mulSve_10x1_64_store: + // Store 1 outputs + WORD $0xe58041c0 // str z0, [x14] + WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] + WORD $0x910101ce // add x14, x14, #64 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x1_64_loop + +mulSve_10x1_64_end: + RET + +// func mulSve_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x1_64Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to output + WORD $0x8b0f01ce // add x14, x14, x15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd28001ef // mov x15, #15 + WORD $0x05e039e2 // mov z2.d, x15 + WORD $0x05212042 // dup z2.b, z2.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x1_64Xor_loop: + // Load 1 outputs + WORD $0x858041c0 // ldr z0, [x14] + WORD $0x858045c1 // ldr z1, [x14, #1, MUL VL] + + // Load and process 64 bytes from input 0 to 1 outputs + WORD $0x85804026 // ldr z6, [x1] + WORD $0x85804425 // ldr z5, [x1, #1, MUL VL] + WORD $0x91010021 // add x1, x1, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804043 // ldr z3, [x2] + WORD $0x85804444 // ldr z4, [x2, #1, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 1 to 1 
outputs + WORD $0x85804086 // ldr z6, [x4] + WORD $0x85804485 // ldr z5, [x4, #1, MUL VL] + WORD $0x91010084 // add x4, x4, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85804843 // ldr z3, [x2, #2, MUL VL] + WORD $0x85804c44 // ldr z4, [x2, #3, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 2 to 1 outputs + WORD $0x858040a6 // ldr z6, [x5] + WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805043 // ldr z3, [x2, #4, MUL VL] + WORD $0x85805444 // ldr z4, [x2, #5, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 3 to 1 outputs + WORD $0x85804106 // ldr z6, [x8] + WORD $0x85804505 // ldr z5, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85805843 // ldr z3, [x2, #6, MUL VL] + WORD $0x85805c44 // ldr z4, [x2, #7, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 4 to 1 outputs + WORD $0x85804126 // ldr z6, [x9] + WORD $0x85804525 // ldr z5, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814043 // ldr z3, [x2, #8, MUL VL] + WORD $0x85814444 // ldr z4, [x2, #9, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 
// eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 5 to 1 outputs + WORD $0x85804146 // ldr z6, [x10] + WORD $0x85804545 // ldr z5, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85814843 // ldr z3, [x2, #10, MUL VL] + WORD $0x85814c44 // ldr z4, [x2, #11, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 6 to 1 outputs + WORD $0x85804166 // ldr z6, [x11] + WORD $0x85804565 // ldr z5, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815043 // ldr z3, [x2, #12, MUL VL] + WORD $0x85815444 // ldr z4, [x2, #13, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 7 to 1 outputs + WORD $0x85804186 // ldr z6, [x12] + WORD $0x85804585 // ldr z5, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85815843 // ldr z3, [x2, #14, MUL VL] + WORD $0x85815c44 // ldr z4, [x2, #15, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 8 to 1 outputs + WORD $0x858041a6 // ldr z6, [x13] + WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] + WORD $0x910101ad // add x13, x13, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824043 // ldr z3, [x2, #16, MUL VL] + WORD $0x85824444 
// ldr z4, [x2, #17, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x1_64Xor_store + + // Load and process 64 bytes from input 9 to 1 outputs + WORD $0x85804066 // ldr z6, [x3] + WORD $0x85804465 // ldr z5, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 + WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 + WORD $0x042230c6 // and z6.d, z6.d, z2.d + WORD $0x042230a5 // and z5.d, z5.d, z2.d + WORD $0x042230e7 // and z7.d, z7.d, z2.d + WORD $0x04223108 // and z8.d, z8.d, z2.d + WORD $0x85824843 // ldr z3, [x2, #18, MUL VL] + WORD $0x85824c44 // ldr z4, [x2, #19, MUL VL] + WORD $0x05253065 // tbl z5.b, z3.b, z5.b + WORD $0x05263063 // tbl z3.b, z3.b, z6.b + WORD $0x05283086 // tbl z6.b, z4.b, z8.b + WORD $0x05273084 // tbl z4.b, z4.b, z7.b + WORD $0x04a33000 // eor z0.d, z0.d, z3.d + WORD $0x04a43000 // eor z0.d, z0.d, z4.d + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + +mulSve_10x1_64Xor_store: + // Store 1 outputs + WORD $0xe58041c0 // str z0, [x14] + WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] + WORD $0x910101ce // add x14, x14, #64 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x1_64Xor_loop + +mulSve_10x1_64Xor_end: + RET + +// func mulSve_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x2_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x2_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R15 + MOVD 24(R14), R14 + MOVD start+72(FP), R6 + + // Add start offset to output + WORD $0x8b0601ef // add x15, x15, x6 + WORD $0x8b0601ce // add x14, x14, x6 + + // Add start offset to input + WORD $0x8b060021 // add x1, x1, x6 + WORD $0x8b060084 // add x4, x4, x6 + WORD $0x8b0600a5 // add x5, x5, x6 + WORD $0x8b060108 // add x8, x8, x6 + WORD $0x8b060129 // add x9, x9, x6 + WORD $0x8b06014a // add x10, x10, x6 + WORD $0x8b06016b // add x11, x11, x6 + WORD $0x8b06018c // add x12, x12, x6 + WORD $0x8b0601ad // add x13, x13, x6 + WORD $0x8b060063 // add x3, x3, x6 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c4 // mov z4.d, x6 + WORD $0x05212084 // dup z4.b, z4.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + WORD $0x85804029 // ldr z9, [x1] + WORD $0x8580442b // ldr z11, [x1, #1, MUL VL] + WORD $0x91010021 // add x1, x1, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85804045 // ldr z5, [x2] + WORD $0x85804446 // ldr z6, [x2, 
#1, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a530c0 // eor z0.d, z6.d, z5.d + WORD $0x04a73101 // eor z1.d, z8.d, z7.d + WORD $0x85804845 // ldr z5, [x2, #2, MUL VL] + WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a530c2 // eor z2.d, z6.d, z5.d + WORD $0x04a73103 // eor z3.d, z8.d, z7.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 1 to 2 outputs + WORD $0x85804089 // ldr z9, [x4] + WORD $0x8580448b // ldr z11, [x4, #1, MUL VL] + WORD $0x91010084 // add x4, x4, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85805045 // ldr z5, [x2, #4, MUL VL] + WORD $0x85805446 // ldr z6, [x2, #5, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85805845 // ldr z5, [x2, #6, MUL VL] + WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 2 to 2 outputs + WORD $0x858040a9 // ldr z9, [x5] + WORD $0x858044ab // ldr z11, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85814045 // ldr z5, [x2, #8, MUL VL] + WORD $0x85814446 // ldr z6, [x2, #9, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85814845 // ldr z5, [x2, #10, MUL VL] + WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 3 to 2 
outputs + WORD $0x85804109 // ldr z9, [x8] + WORD $0x8580450b // ldr z11, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85815045 // ldr z5, [x2, #12, MUL VL] + WORD $0x85815446 // ldr z6, [x2, #13, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85815845 // ldr z5, [x2, #14, MUL VL] + WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 4 to 2 outputs + WORD $0x85804129 // ldr z9, [x9] + WORD $0x8580452b // ldr z11, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 5 to 2 outputs + WORD $0x85804149 // ldr z9, [x10] + WORD $0x8580454b // ldr z11, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, 
z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 6 to 2 outputs + WORD $0x85804169 // ldr z9, [x11] + WORD $0x8580456b // ldr z11, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 7 to 2 outputs + WORD $0x85804189 // ldr z9, [x12] + WORD $0x8580458b // ldr z11, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 8 to 2 outputs + WORD $0x858041a9 // ldr z9, [x13] + WORD $0x858045ab // ldr z11, [x13, 
#1, MUL VL] + WORD $0x910101ad // add x13, x13, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x2_64_store + + // Load and process 64 bytes from input 9 to 2 outputs + WORD $0x85804069 // ldr z9, [x3] + WORD $0x8580446b // ldr z11, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85845045 // ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + +mulSve_10x2_64_store: + // Store 2 outputs + WORD $0xe58041e0 // str z0, [x15] + WORD $0xe58045e1 // str z1, [x15, #1, MUL VL] + WORD $0x910101ef // add x15, x15, #64 + WORD $0xe58041c2 // str z2, [x14] + WORD $0xe58045c3 // str z3, [x14, #1, MUL VL] + WORD $0x910101ce // add x14, x14, #64 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x2_64_loop + +mulSve_10x2_64_end: + RET + +// func mulSve_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x2_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x2_64Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 
120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R15 + MOVD 24(R14), R14 + MOVD start+72(FP), R6 + + // Add start offset to output + WORD $0x8b0601ef // add x15, x15, x6 + WORD $0x8b0601ce // add x14, x14, x6 + + // Add start offset to input + WORD $0x8b060021 // add x1, x1, x6 + WORD $0x8b060084 // add x4, x4, x6 + WORD $0x8b0600a5 // add x5, x5, x6 + WORD $0x8b060108 // add x8, x8, x6 + WORD $0x8b060129 // add x9, x9, x6 + WORD $0x8b06014a // add x10, x10, x6 + WORD $0x8b06016b // add x11, x11, x6 + WORD $0x8b06018c // add x12, x12, x6 + WORD $0x8b0601ad // add x13, x13, x6 + WORD $0x8b060063 // add x3, x3, x6 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c4 // mov z4.d, x6 + WORD $0x05212084 // dup z4.b, z4.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x2_64Xor_loop: + // Load 2 outputs + WORD $0x858041e0 // ldr z0, [x15] + WORD $0x858045e1 // ldr z1, [x15, #1, MUL VL] + WORD $0x858041c2 // ldr z2, [x14] + WORD $0x858045c3 // ldr z3, [x14, #1, MUL VL] + + // Load and process 64 bytes from input 0 to 2 outputs + WORD $0x85804029 // ldr z9, [x1] + WORD $0x8580442b // ldr z11, [x1, #1, MUL VL] + WORD $0x91010021 // add x1, x1, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85804045 // ldr z5, [x2] + WORD $0x85804446 // ldr z6, [x2, #1, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85804845 // ldr z5, [x2, #2, MUL VL] + WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 1 to 2 outputs + WORD $0x85804089 // ldr z9, [x4] + WORD $0x8580448b // ldr z11, [x4, #1, MUL VL] + WORD $0x91010084 // add x4, x4, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85805045 // ldr z5, [x2, #4, MUL VL] + WORD $0x85805446 // ldr z6, [x2, #5, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85805845 // ldr z5, [x2, #6, MUL VL] + WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD 
$0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 2 to 2 outputs + WORD $0x858040a9 // ldr z9, [x5] + WORD $0x858044ab // ldr z11, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85814045 // ldr z5, [x2, #8, MUL VL] + WORD $0x85814446 // ldr z6, [x2, #9, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85814845 // ldr z5, [x2, #10, MUL VL] + WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 3 to 2 outputs + WORD $0x85804109 // ldr z9, [x8] + WORD $0x8580450b // ldr z11, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85815045 // ldr z5, [x2, #12, MUL VL] + WORD $0x85815446 // ldr z6, [x2, #13, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85815845 // ldr z5, [x2, #14, MUL VL] + WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 4 to 2 outputs + WORD $0x85804129 // ldr z9, [x9] + WORD $0x8580452b // ldr z11, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD 
$0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 5 to 2 outputs + WORD $0x85804149 // ldr z9, [x10] + WORD $0x8580454b // ldr z11, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 6 to 2 outputs + WORD $0x85804169 // ldr z9, [x11] + WORD $0x8580456b // ldr z11, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl 
z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 7 to 2 outputs + WORD $0x85804189 // ldr z9, [x12] + WORD $0x8580458b // ldr z11, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 8 to 2 outputs + WORD $0x858041a9 // ldr z9, [x13] + WORD $0x858045ab // ldr z11, [x13, #1, MUL VL] + WORD $0x910101ad // add x13, x13, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x2_64Xor_store + + // Load and process 64 bytes from input 9 to 2 outputs + WORD $0x85804069 // ldr z9, [x3] + WORD $0x8580446b // ldr z11, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04243129 // and z9.d, z9.d, z4.d + WORD $0x0424316b // and z11.d, z11.d, z4.d + WORD $0x0424314a // and z10.d, z10.d, z4.d + WORD $0x0424318c // and z12.d, z12.d, z4.d + WORD $0x85845045 // 
ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052b30a7 // tbl z7.b, z5.b, z11.b + WORD $0x052930a5 // tbl z5.b, z5.b, z9.b + WORD $0x052c30c8 // tbl z8.b, z6.b, z12.b + WORD $0x052a30c6 // tbl z6.b, z6.b, z10.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + +mulSve_10x2_64Xor_store: + // Store 2 outputs + WORD $0xe58041e0 // str z0, [x15] + WORD $0xe58045e1 // str z1, [x15, #1, MUL VL] + WORD $0x910101ef // add x15, x15, #64 + WORD $0xe58041c2 // str z2, [x14] + WORD $0xe58045c3 // str z3, [x14, #1, MUL VL] + WORD $0x910101ce // add x14, x14, #64 + + // Prepare for next loop + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x2_64Xor_loop + +mulSve_10x2_64Xor_end: + RET + +// func mulSve_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x3_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x3_64_end + MOVD in_base+24(FP), R0 + MOVD (R0), R3 + MOVD 24(R0), R1 + MOVD 48(R0), R4 + MOVD 72(R0), R5 + MOVD 96(R0), R8 + MOVD 120(R0), R9 + MOVD 144(R0), R10 + MOVD 168(R0), R11 + MOVD 192(R0), R12 + MOVD 216(R0), R0 + MOVD out_base+48(FP), R13 + MOVD (R13), R14 + MOVD 24(R13), R15 + MOVD 48(R13), R13 + MOVD start+72(FP), R6 + + // Add start offset to output + WORD $0x8b0601ce // add x14, x14, x6 + WORD $0x8b0601ef // add x15, x15, x6 + WORD $0x8b0601ad // add x13, x13, x6 + + // Add start offset to input + WORD $0x8b060063 // add x3, x3, x6 + WORD $0x8b060021 // add x1, x1, x6 + WORD $0x8b060084 // add x4, x4, x6 + WORD $0x8b0600a5 // add x5, x5, x6 + WORD $0x8b060108 // add x8, x8, x6 + WORD $0x8b060129 // add x9, x9, x6 + WORD $0x8b06014a // add x10, x10, x6 + WORD $0x8b06016b // add x11, x11, x6 + WORD $0x8b06018c // add x12, x12, x6 + WORD $0x8b060000 // add x0, x0, x6 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c6 // mov z6.d, x6 + WORD $0x052120c6 // dup z6.b, z6.b[0] + + // Reload length to save a register + MOVD n+80(FP), R6 + WORD $0xd346fcc6 // lsr x6, x6, #6 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + WORD $0x8580406b // ldr z11, [x3] + WORD $0x8580446d // ldr z13, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85804047 // ldr z7, [x2] + WORD $0x85804448 // ldr z8, [x2, #1, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, 
z8.b, z12.b + WORD $0x04a73100 // eor z0.d, z8.d, z7.d + WORD $0x04a93141 // eor z1.d, z10.d, z9.d + WORD $0x85804847 // ldr z7, [x2, #2, MUL VL] + WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73102 // eor z2.d, z8.d, z7.d + WORD $0x04a93143 // eor z3.d, z10.d, z9.d + WORD $0x85805047 // ldr z7, [x2, #4, MUL VL] + WORD $0x85805448 // ldr z8, [x2, #5, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73104 // eor z4.d, z8.d, z7.d + WORD $0x04a93145 // eor z5.d, z10.d, z9.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 1 to 3 outputs + WORD $0x8580402b // ldr z11, [x1] + WORD $0x8580442d // ldr z13, [x1, #1, MUL VL] + WORD $0x91010021 // add x1, x1, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85805847 // ldr z7, [x2, #6, MUL VL] + WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85814047 // ldr z7, [x2, #8, MUL VL] + WORD $0x85814448 // ldr z8, [x2, #9, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85814847 // ldr z7, [x2, #10, MUL VL] + WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 2 to 3 outputs + WORD $0x8580408b // ldr z11, [x4] + WORD $0x8580448d // ldr z13, [x4, #1, MUL VL] + WORD $0x91010084 // add x4, x4, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85815047 // ldr z7, [x2, #12, MUL VL] + WORD $0x85815448 // ldr z8, [x2, #13, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + 
WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85815847 // ldr z7, [x2, #14, MUL VL] + WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85824047 // ldr z7, [x2, #16, MUL VL] + WORD $0x85824448 // ldr z8, [x2, #17, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 3 to 3 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x858044ad // ldr z13, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85824847 // ldr z7, [x2, #18, MUL VL] + WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825047 // ldr z7, [x2, #20, MUL VL] + WORD $0x85825448 // ldr z8, [x2, #21, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85825847 // ldr z7, [x2, #22, MUL VL] + WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 4 to 3 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x8580450d // ldr z13, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85834047 // ldr z7, [x2, #24, MUL VL] + WORD $0x85834448 // ldr z8, [x2, #25, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD 
$0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85834847 // ldr z7, [x2, #26, MUL VL] + WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85835047 // ldr z7, [x2, #28, MUL VL] + WORD $0x85835448 // ldr z8, [x2, #29, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 5 to 3 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x8580452d // ldr z13, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85835847 // ldr z7, [x2, #30, MUL VL] + WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85844047 // ldr z7, [x2, #32, MUL VL] + WORD $0x85844448 // ldr z8, [x2, #33, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85844847 // ldr z7, [x2, #34, MUL VL] + WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 6 to 3 outputs + WORD $0x8580414b // ldr z11, [x10] + WORD $0x8580454d // ldr z13, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85845047 // 
ldr z7, [x2, #36, MUL VL] + WORD $0x85845448 // ldr z8, [x2, #37, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845847 // ldr z7, [x2, #38, MUL VL] + WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854047 // ldr z7, [x2, #40, MUL VL] + WORD $0x85854448 // ldr z8, [x2, #41, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 7 to 3 outputs + WORD $0x8580416b // ldr z11, [x11] + WORD $0x8580456d // ldr z13, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85854847 // ldr z7, [x2, #42, MUL VL] + WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85855047 // ldr z7, [x2, #44, MUL VL] + WORD $0x85855448 // ldr z8, [x2, #45, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85855847 // ldr z7, [x2, #46, MUL VL] + WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 8 to 3 outputs + WORD $0x8580418b // ldr z11, [x12] + WORD $0x8580458d // ldr z13, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and 
z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85864047 // ldr z7, [x2, #48, MUL VL] + WORD $0x85864448 // ldr z8, [x2, #49, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85864847 // ldr z7, [x2, #50, MUL VL] + WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85865047 // ldr z7, [x2, #52, MUL VL] + WORD $0x85865448 // ldr z8, [x2, #53, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x3_64_store + + // Load and process 64 bytes from input 9 to 3 outputs + WORD $0x8580400b // ldr z11, [x0] + WORD $0x8580440d // ldr z13, [x0, #1, MUL VL] + WORD $0x91010000 // add x0, x0, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85865847 // ldr z7, [x2, #54, MUL VL] + WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85874047 // ldr z7, [x2, #56, MUL VL] + WORD $0x85874448 // ldr z8, [x2, #57, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85874847 // ldr z7, [x2, #58, MUL VL] + WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + +mulSve_10x3_64_store: + // Store 3 outputs + WORD $0xe58041c0 // str z0, [x14] + WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] + WORD $0x910101ce // add x14, x14, #64 + WORD $0xe58041e2 // str z2, 
[x15] + WORD $0xe58045e3 // str z3, [x15, #1, MUL VL] + WORD $0x910101ef // add x15, x15, #64 + WORD $0xe58041a4 // str z4, [x13] + WORD $0xe58045a5 // str z5, [x13, #1, MUL VL] + WORD $0x910101ad // add x13, x13, #64 + + // Prepare for next loop + WORD $0xf10004c6 // subs x6, x6, #1 + BNE mulSve_10x3_64_loop + +mulSve_10x3_64_end: + RET + +// func mulSve_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x3_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x3_64Xor_end + MOVD in_base+24(FP), R0 + MOVD (R0), R3 + MOVD 24(R0), R1 + MOVD 48(R0), R4 + MOVD 72(R0), R5 + MOVD 96(R0), R8 + MOVD 120(R0), R9 + MOVD 144(R0), R10 + MOVD 168(R0), R11 + MOVD 192(R0), R12 + MOVD 216(R0), R0 + MOVD out_base+48(FP), R13 + MOVD (R13), R14 + MOVD 24(R13), R15 + MOVD 48(R13), R13 + MOVD start+72(FP), R6 + + // Add start offset to output + WORD $0x8b0601ce // add x14, x14, x6 + WORD $0x8b0601ef // add x15, x15, x6 + WORD $0x8b0601ad // add x13, x13, x6 + + // Add start offset to input + WORD $0x8b060063 // add x3, x3, x6 + WORD $0x8b060021 // add x1, x1, x6 + WORD $0x8b060084 // add x4, x4, x6 + WORD $0x8b0600a5 // add x5, x5, x6 + WORD $0x8b060108 // add x8, x8, x6 + WORD $0x8b060129 // add x9, x9, x6 + WORD $0x8b06014a // add x10, x10, x6 + WORD $0x8b06016b // add x11, x11, x6 + WORD $0x8b06018c // add x12, x12, x6 + WORD $0x8b060000 // add x0, x0, x6 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c6 // mov z6.d, x6 + WORD $0x052120c6 // dup z6.b, z6.b[0] + + // Reload length to save a register + MOVD n+80(FP), R6 + WORD $0xd346fcc6 // lsr x6, x6, #6 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x3_64Xor_loop: + // Load 3 outputs + WORD $0x858041c0 // ldr z0, [x14] + WORD $0x858045c1 // ldr z1, [x14, #1, MUL VL] + WORD $0x858041e2 // ldr z2, [x15] + WORD $0x858045e3 // ldr z3, [x15, #1, MUL VL] + WORD $0x858041a4 // ldr z4, [x13] + WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] + + // Load and process 64 bytes from input 0 to 3 outputs + WORD $0x8580406b // ldr z11, [x3] + WORD $0x8580446d // ldr z13, [x3, #1, MUL VL] + WORD $0x91010063 // add x3, x3, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85804047 // ldr z7, [x2] + WORD $0x85804448 // ldr z8, [x2, #1, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85804847 // ldr z7, [x2, #2, MUL VL] + WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85805047 // ldr z7, [x2, 
#4, MUL VL] + WORD $0x85805448 // ldr z8, [x2, #5, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 1 to 3 outputs + WORD $0x8580402b // ldr z11, [x1] + WORD $0x8580442d // ldr z13, [x1, #1, MUL VL] + WORD $0x91010021 // add x1, x1, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85805847 // ldr z7, [x2, #6, MUL VL] + WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85814047 // ldr z7, [x2, #8, MUL VL] + WORD $0x85814448 // ldr z8, [x2, #9, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85814847 // ldr z7, [x2, #10, MUL VL] + WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 2 to 3 outputs + WORD $0x8580408b // ldr z11, [x4] + WORD $0x8580448d // ldr z13, [x4, #1, MUL VL] + WORD $0x91010084 // add x4, x4, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85815047 // ldr z7, [x2, #12, MUL VL] + WORD $0x85815448 // ldr z8, [x2, #13, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85815847 // ldr z7, [x2, #14, MUL VL] + WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + 
WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85824047 // ldr z7, [x2, #16, MUL VL] + WORD $0x85824448 // ldr z8, [x2, #17, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 3 to 3 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x858044ad // ldr z13, [x5, #1, MUL VL] + WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85824847 // ldr z7, [x2, #18, MUL VL] + WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825047 // ldr z7, [x2, #20, MUL VL] + WORD $0x85825448 // ldr z8, [x2, #21, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85825847 // ldr z7, [x2, #22, MUL VL] + WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 4 to 3 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x8580450d // ldr z13, [x8, #1, MUL VL] + WORD $0x91010108 // add x8, x8, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85834047 // ldr z7, [x2, #24, MUL VL] + WORD $0x85834448 // ldr z8, [x2, #25, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85834847 // ldr z7, [x2, #26, MUL VL] + WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD 
$0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85835047 // ldr z7, [x2, #28, MUL VL] + WORD $0x85835448 // ldr z8, [x2, #29, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 5 to 3 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x8580452d // ldr z13, [x9, #1, MUL VL] + WORD $0x91010129 // add x9, x9, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85835847 // ldr z7, [x2, #30, MUL VL] + WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85844047 // ldr z7, [x2, #32, MUL VL] + WORD $0x85844448 // ldr z8, [x2, #33, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85844847 // ldr z7, [x2, #34, MUL VL] + WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 6 to 3 outputs + WORD $0x8580414b // ldr z11, [x10] + WORD $0x8580454d // ldr z13, [x10, #1, MUL VL] + WORD $0x9101014a // add x10, x10, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85845047 // ldr z7, [x2, #36, MUL VL] + WORD $0x85845448 // ldr z8, [x2, #37, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD 
$0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845847 // ldr z7, [x2, #38, MUL VL] + WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854047 // ldr z7, [x2, #40, MUL VL] + WORD $0x85854448 // ldr z8, [x2, #41, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 7 to 3 outputs + WORD $0x8580416b // ldr z11, [x11] + WORD $0x8580456d // ldr z13, [x11, #1, MUL VL] + WORD $0x9101016b // add x11, x11, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85854847 // ldr z7, [x2, #42, MUL VL] + WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85855047 // ldr z7, [x2, #44, MUL VL] + WORD $0x85855448 // ldr z8, [x2, #45, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85855847 // ldr z7, [x2, #46, MUL VL] + WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + WORD $0x052c3108 // tbl z8.b, z8.b, z12.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x3_64Xor_store + + // Load and process 64 bytes from input 8 to 3 outputs + WORD $0x8580418b // ldr z11, [x12] + WORD $0x8580458d // ldr z13, [x12, #1, MUL VL] + WORD $0x9101018c // add x12, x12, #64 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x0426316b // and z11.d, z11.d, z6.d + WORD $0x042631ad // and z13.d, z13.d, z6.d + WORD $0x0426318c // and z12.d, z12.d, z6.d + WORD $0x042631ce // and z14.d, z14.d, z6.d + WORD $0x85864047 // ldr z7, [x2, #48, MUL VL] + WORD $0x85864448 // ldr z8, [x2, #49, MUL VL] + WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b + WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b + WORD $0x052e310a // tbl z10.b, z8.b, z14.b + 
WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x85864847 // ldr z7, [x2, #50, MUL VL]
+ WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL]
+ WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+ WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+ WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+ WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85865047 // ldr z7, [x2, #52, MUL VL]
+ WORD $0x85865448 // ldr z8, [x2, #53, MUL VL]
+ WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+ WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+ WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+ WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+ // Check for early termination
+ CMP $9, R16
+ BEQ mulSve_10x3_64Xor_store
+
+ // Load and process 64 bytes from input 9 to 3 outputs
+ WORD $0x8580400b // ldr z11, [x0]
+ WORD $0x8580440d // ldr z13, [x0, #1, MUL VL]
+ WORD $0x91010000 // add x0, x0, #64
+ WORD $0x04fc956c // lsr z12.d, z11.d, #4
+ WORD $0x04fc95ae // lsr z14.d, z13.d, #4
+ WORD $0x0426316b // and z11.d, z11.d, z6.d
+ WORD $0x042631ad // and z13.d, z13.d, z6.d
+ WORD $0x0426318c // and z12.d, z12.d, z6.d
+ WORD $0x042631ce // and z14.d, z14.d, z6.d
+ WORD $0x85865847 // ldr z7, [x2, #54, MUL VL]
+ WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL]
+ WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+ WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+ WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+ WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+ WORD $0x04a73000 // eor z0.d, z0.d, z7.d
+ WORD $0x04a83000 // eor z0.d, z0.d, z8.d
+ WORD $0x04a93021 // eor z1.d, z1.d, z9.d
+ WORD $0x04aa3021 // eor z1.d, z1.d, z10.d
+ WORD $0x85874047 // ldr z7, [x2, #56, MUL VL]
+ WORD $0x85874448 // ldr z8, [x2, #57, MUL VL]
+ WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+ WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+ WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+ WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+ WORD $0x04a73042 // eor z2.d, z2.d, z7.d
+ WORD $0x04a83042 // eor z2.d, z2.d, z8.d
+ WORD $0x04a93063 // eor z3.d, z3.d, z9.d
+ WORD $0x04aa3063 // eor z3.d, z3.d, z10.d
+ WORD $0x85874847 // ldr z7, [x2, #58, MUL VL]
+ WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL]
+ WORD $0x052d30e9 // tbl z9.b, z7.b, z13.b
+ WORD $0x052b30e7 // tbl z7.b, z7.b, z11.b
+ WORD $0x052e310a // tbl z10.b, z8.b, z14.b
+ WORD $0x052c3108 // tbl z8.b, z8.b, z12.b
+ WORD $0x04a73084 // eor z4.d, z4.d, z7.d
+ WORD $0x04a83084 // eor z4.d, z4.d, z8.d
+ WORD $0x04a930a5 // eor z5.d, z5.d, z9.d
+ WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d
+
+mulSve_10x3_64Xor_store:
+ // Store 3 outputs
+ WORD $0xe58041c0 // str z0, [x14]
+ WORD $0xe58045c1 // str z1, [x14, #1, MUL VL]
+ WORD $0x910101ce // add x14, x14, #64
+ WORD $0xe58041e2 // str z2, [x15]
+ WORD $0xe58045e3 // str z3, [x15, #1, MUL VL]
+ WORD $0x910101ef // add x15, x15, #64
+ WORD $0xe58041a4 // str z4, [x13]
+ WORD $0xe58045a5 // str z5, [x13, #1, MUL VL]
+ WORD $0x910101ad // add x13, x13, #64
+
+ // Prepare for next loop
+ WORD $0xf10004c6 // subs x6, x6, #1
+ BNE mulSve_10x3_64Xor_loop
+
+mulSve_10x3_64Xor_end:
+ RET
+
+// func mulSve_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
+TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88
+ WORD $0x25d8e3e0 // ptrue p0.d
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 89 YMM used
+ MOVD n+80(FP), R0
+ MOVD matrix_base+0(FP), R2
+ WORD $0xd345fc00 // lsr x0, x0, #5
+ WORD $0xea00001f // tst x0, x0
+ BEQ mulSve_10x4_end
+ MOVD in_base+24(FP), R3
+ MOVD (R3), R1
+ MOVD 24(R3), R4
+ MOVD 48(R3), R5
+ MOVD 72(R3), R8
+ MOVD 96(R3), R9
+ MOVD 120(R3), R10
+ MOVD 144(R3), R11
+ MOVD 168(R3), R12
+ MOVD 192(R3), R13
+ MOVD 216(R3), R3
+ MOVD out_base+48(FP), R14
+ MOVD start+72(FP), R15
+
+ // Add start offset to input
+ WORD $0x8b0f0021 // add x1, x1, x15
+ WORD $0x8b0f0084 // add x4, x4, x15
+ WORD $0x8b0f00a5 // add x5, x5, x15
+ WORD $0x8b0f0108 // add x8, x8, x15
+ WORD $0x8b0f0129 // add x9, x9, x15
+ WORD $0x8b0f014a // add x10, x10, x15
+ WORD $0x8b0f016b // add x11, x11, x15
+ WORD $0x8b0f018c // add x12, x12, x15
+ WORD $0x8b0f01ad // add x13, x13, x15
+ WORD $0x8b0f0063 // add x3, x3, x15
+ WORD $0xd343fdef // lsr x15, x15, #3
+ WORD $0xd28001e6 // mov x6, #15
+ WORD $0x05e038c4 // mov z4.d, x6
+ WORD $0x05212084 // dup z4.b, z4.b[0]
+
+ // Load number of input shards
+ MOVD in_len+32(FP), R16
+
+mulSve_10x4_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ WORD $0x85804027 // ldr z7, [x1]
+ WORD $0x91008021 // add x1, x1, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ WORD $0x85804045 // ldr z5, [x2]
+ WORD $0x85804446 // ldr z6, [x2, #1, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a530c0 // eor z0.d, z6.d, z5.d
+ WORD $0x85804845 // ldr z5, [x2, #2, MUL VL]
+ WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a530c1 // eor z1.d, z6.d, z5.d
+ WORD $0x85805045 // ldr z5, [x2, #4, MUL VL]
+ WORD $0x85805446 // ldr z6, [x2, #5, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a530c2 // eor z2.d, z6.d, z5.d
+ WORD $0x85805845 // ldr z5, [x2, #6, MUL VL]
+ WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a530c3 // eor z3.d, z6.d, z5.d
+ // Check for early termination
+ CMP $1, R16
+ BEQ mulSve_10x4_store
+
+ // Load and process 32 bytes from input 1 to 4 outputs
+ WORD $0x85804087 // ldr z7, [x4]
+ WORD $0x91008084 // add x4, x4, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ WORD $0x85814045 // ldr z5, [x2, #8, MUL VL]
+ WORD $0x85814446 // ldr z6, [x2, #9, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x85814845 // ldr z5, [x2, #10, MUL VL]
+ WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53021 // eor z1.d, z1.d, z5.d
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x85815045 // ldr z5, [x2, #12, MUL VL]
+ WORD $0x85815446 // ldr z6, [x2, #13, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x85815845 // ldr z5, [x2, #14, MUL VL]
+ WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53063 // eor z3.d, z3.d, z5.d
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ // Check for early termination
+ CMP $2, R16
+ BEQ mulSve_10x4_store
+
+ // Load and process 32 bytes from input 2 to 4 outputs
+ WORD $0x858040a7 // ldr z7, [x5]
+ WORD $0x910080a5 // add x5, x5, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ WORD $0x85824045 // ldr z5, [x2, #16, MUL VL]
+ WORD $0x85824446 // ldr z6, [x2, #17, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x85824845 // ldr z5, [x2, #18, MUL VL]
+ WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53021 // eor z1.d, z1.d, z5.d
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x85825045 // ldr z5, [x2, #20, MUL VL]
+ WORD $0x85825446 // ldr z6, [x2, #21, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x85825845 // ldr z5, [x2, #22, MUL VL]
+ WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53063 // eor z3.d, z3.d, z5.d
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ // Check for early termination
+ CMP $3, R16
+ BEQ mulSve_10x4_store
+
+ // Load and process 32 bytes from input 3 to 4 outputs
+ WORD $0x85804107 // ldr z7, [x8]
+ WORD $0x91008108 // add x8, x8, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ WORD $0x85834045 // ldr z5, [x2, #24, MUL VL]
+ WORD $0x85834446 // ldr z6, [x2, #25, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x85834845 // ldr z5, [x2, #26, MUL VL]
+ WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53021 // eor z1.d, z1.d, z5.d
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x85835045 // ldr z5, [x2, #28, MUL VL]
+ WORD $0x85835446 // ldr z6, [x2, #29, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x85835845 // ldr z5, [x2, #30, MUL VL]
+ WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53063 // eor z3.d, z3.d, z5.d
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ // Check for early termination
+ CMP $4, R16
+ BEQ mulSve_10x4_store
+
+ // Load and process 32 bytes from input 4 to 4 outputs
+ WORD $0x85804127 // ldr z7, [x9]
+ WORD $0x91008129 // add x9, x9, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ WORD $0x85844045 // ldr z5, [x2, #32, MUL VL]
+ WORD $0x85844446 // ldr z6, [x2, #33, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x85844845 // ldr z5, [x2, #34, MUL VL]
+ WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53021 // eor z1.d, z1.d, z5.d
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x85845045 // ldr z5, [x2, #36, MUL VL]
+ WORD $0x85845446 // ldr z6, [x2, #37, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x85845845 // ldr z5, [x2, #38, MUL VL]
+ WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53063 // eor z3.d, z3.d, z5.d
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ // Check for early termination
+ CMP $5, R16
+ BEQ mulSve_10x4_store
+
+ // Load and process 32 bytes from input 5 to 4 outputs
+ WORD $0x85804147 // ldr z7, [x10]
+ WORD $0x9100814a // add x10, x10, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ WORD $0x85854045 // ldr z5, [x2, #40, MUL VL]
+ WORD $0x85854446 // ldr z6, [x2, #41, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x85854845 // ldr z5, [x2, #42, MUL VL]
+ WORD $0x85854c46 // ldr z6, [x2, #43, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53021 // eor z1.d, z1.d, z5.d
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x85855045 // ldr z5, [x2, #44, MUL VL]
+ WORD $0x85855446 // ldr z6, [x2, #45, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x85855845 // ldr z5, [x2, #46, MUL VL]
+ WORD $0x85855c46 // ldr z6, [x2, #47, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53063 // eor z3.d, z3.d, z5.d
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ // Check for early termination
+ CMP $6, R16
+ BEQ mulSve_10x4_store
+
+ // Load and process 32 bytes from input 6 to 4 outputs
+ WORD $0x85804167 // ldr z7, [x11]
+ WORD $0x9100816b // add x11, x11, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ WORD $0x85864045 // ldr z5, [x2, #48, MUL VL]
+ WORD $0x85864446 // ldr z6, [x2, #49, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x85864845 // ldr z5, [x2, #50, MUL VL]
+ WORD $0x85864c46 // ldr z6, [x2, #51, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53021 // eor z1.d, z1.d, z5.d
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x85865045 // ldr z5, [x2, #52, MUL VL]
+ WORD $0x85865446 // ldr z6, [x2, #53, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x85865845 // ldr z5, [x2, #54, MUL VL]
+ WORD $0x85865c46 // ldr z6, [x2, #55, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53063 // eor z3.d, z3.d, z5.d
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ // Check for early termination
+ CMP $7, R16
+ BEQ mulSve_10x4_store
+
+ // Load and process 32 bytes from input 7 to 4 outputs
+ WORD $0x85804187 // ldr z7, [x12]
+ WORD $0x9100818c // add x12, x12, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ WORD $0x85874045 // ldr z5, [x2, #56, MUL VL]
+ WORD $0x85874446 // ldr z6, [x2, #57, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x85874845 // ldr z5, [x2, #58, MUL VL]
+ WORD $0x85874c46 // ldr z6, [x2, #59, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53021 // eor z1.d, z1.d, z5.d
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x85875045 // ldr z5, [x2, #60, MUL VL]
+ WORD $0x85875446 // ldr z6, [x2, #61, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x85875845 // ldr z5, [x2, #62, MUL VL]
+ WORD $0x85875c46 // ldr z6, [x2, #63, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53063 // eor z3.d, z3.d, z5.d
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ // Check for early termination
+ CMP $8, R16
+ BEQ mulSve_10x4_store
+
+ // Load and process 32 bytes from input 8 to 4 outputs
+ WORD $0x858041a7 // ldr z7, [x13]
+ WORD $0x910081ad // add x13, x13, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ WORD $0x85884045 // ldr z5, [x2, #64, MUL VL]
+ WORD $0x85884446 // ldr z6, [x2, #65, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x85884845 // ldr z5, [x2, #66, MUL VL]
+ WORD $0x85884c46 // ldr z6, [x2, #67, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53021 // eor z1.d, z1.d, z5.d
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x85885045 // ldr z5, [x2, #68, MUL VL]
+ WORD $0x85885446 // ldr z6, [x2, #69, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x85885845 // ldr z5, [x2, #70, MUL VL]
+ WORD $0x85885c46 // ldr z6, [x2, #71, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53063 // eor z3.d, z3.d, z5.d
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+ // Check for early termination
+ CMP $9, R16
+ BEQ mulSve_10x4_store
+
+ // Load and process 32 bytes from input 9 to 4 outputs
+ WORD $0x85804067 // ldr z7, [x3]
+ WORD $0x91008063 // add x3, x3, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ WORD $0x85894045 // ldr z5, [x2, #72, MUL VL]
+ WORD $0x85894446 // ldr z6, [x2, #73, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ WORD $0x85894845 // ldr z5, [x2, #74, MUL VL]
+ WORD $0x85894c46 // ldr z6, [x2, #75, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53021 // eor z1.d, z1.d, z5.d
+ WORD $0x04a63021 // eor z1.d, z1.d, z6.d
+ WORD $0x85895045 // ldr z5, [x2, #76, MUL VL]
+ WORD $0x85895446 // ldr z6, [x2, #77, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53042 // eor z2.d, z2.d, z5.d
+ WORD $0x04a63042 // eor z2.d, z2.d, z6.d
+ WORD $0x85895845 // ldr z5, [x2, #78, MUL VL]
+ WORD $0x85895c46 // ldr z6, [x2, #79, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53063 // eor z3.d, z3.d, z5.d
+ WORD $0x04a63063 // eor z3.d, z3.d, z6.d
+
+mulSve_10x4_store:
+ // Store 4 outputs
+ MOVD (R14), R6
+ WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3]
+ MOVD 24(R14), R6
+ WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3]
+ MOVD 48(R14), R6
+ WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3]
+ MOVD 72(R14), R6
+ WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3]
+
+ // Prepare for next loop
+ WORD $0x910011ef // add x15, x15, #4
+ WORD $0xf1000400 // subs x0, x0, #1
+ BNE mulSve_10x4_loop
+
+mulSve_10x4_end:
+ RET
+
+// func mulSve_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: SVE
+TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88
+ WORD $0x25d8e3e0 // ptrue p0.d
+ // Loading no tables to registers
+ // Destination kept on stack
+ // Full registers estimated 89 YMM used
+ MOVD n+80(FP), R0
+ MOVD matrix_base+0(FP), R2
+ WORD $0xd345fc00 // lsr x0, x0, #5
+ WORD $0xea00001f // tst x0, x0
+ BEQ mulSve_10x4Xor_end
+ MOVD in_base+24(FP), R3
+ MOVD (R3), R1
+ MOVD 24(R3), R4
+ MOVD 48(R3), R5
+ MOVD 72(R3), R8
+ MOVD 96(R3), R9
+ MOVD 120(R3), R10
+ MOVD 144(R3), R11
+ MOVD 168(R3), R12
+ MOVD 192(R3), R13
+ MOVD 216(R3), R3
+ MOVD out_base+48(FP), R14
+ MOVD start+72(FP), R15
+
+ // Add start offset to input
+ WORD $0x8b0f0021 // add x1, x1, x15
+ WORD $0x8b0f0084 // add x4, x4, x15
+ WORD $0x8b0f00a5 // add x5, x5, x15
+ WORD $0x8b0f0108 // add x8, x8, x15
+ WORD $0x8b0f0129 // add x9, x9, x15
+ WORD $0x8b0f014a // add x10, x10, x15
+ WORD $0x8b0f016b // add x11, x11, x15
+ WORD $0x8b0f018c // add x12, x12, x15
+ WORD $0x8b0f01ad // add x13, x13, x15
+ WORD $0x8b0f0063 // add x3, x3, x15
+ WORD $0xd343fdef // lsr x15, x15, #3
+ WORD $0xd28001e6 // mov x6, #15
+ WORD $0x05e038c4 // mov z4.d, x6
+ WORD $0x05212084 // dup z4.b, z4.b[0]
+
+ // Load number of input shards
+ MOVD in_len+32(FP), R16
+
+mulSve_10x4Xor_loop:
+ // Load and process 32 bytes from input 0 to 4 outputs
+ WORD $0x85804027 // ldr z7, [x1]
+ WORD $0x91008021 // add x1, x1, #32
+ WORD $0x04fc94e8 // lsr z8.d, z7.d, #4
+ WORD $0x042430e7 // and z7.d, z7.d, z4.d
+ WORD $0x04243108 // and z8.d, z8.d, z4.d
+ MOVD (R14), R6
+ WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85804045 // ldr z5, [x2]
+ WORD $0x85804446 // ldr z6, [x2, #1, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl z6.b, z6.b, z8.b
+ WORD $0x04a53000 // eor z0.d, z0.d, z5.d
+ WORD $0x04a63000 // eor z0.d, z0.d, z6.d
+ MOVD 24(R14), R6
+ WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3]
+ WORD $0x85804845 // ldr z5, [x2, #2, MUL VL]
+ WORD $0x85804c46 // ldr z6, [x2, #3, MUL VL]
+ WORD $0x052730a5 // tbl z5.b, z5.b, z7.b
+ WORD $0x052830c6 // tbl 
z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805045 // ldr z5, [x2, #4, MUL VL] + WORD $0x85805446 // ldr z6, [x2, #5, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805845 // ldr z5, [x2, #6, MUL VL] + WORD $0x85805c46 // ldr z6, [x2, #7, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 1 to 4 outputs + WORD $0x85804087 // ldr z7, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85814045 // ldr z5, [x2, #8, MUL VL] + WORD $0x85814446 // ldr z6, [x2, #9, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85814845 // ldr z5, [x2, #10, MUL VL] + WORD $0x85814c46 // ldr z6, [x2, #11, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85815045 // ldr z5, [x2, #12, MUL VL] + WORD $0x85815446 // ldr z6, [x2, #13, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85815845 // ldr z5, [x2, #14, MUL VL] + WORD $0x85815c46 // ldr z6, [x2, #15, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 2 to 4 outputs + WORD $0x858040a7 // ldr z7, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85824045 // ldr z5, [x2, #16, MUL VL] + WORD $0x85824446 // ldr z6, [x2, #17, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85824845 // ldr z5, [x2, #18, MUL VL] + WORD $0x85824c46 // ldr z6, [x2, #19, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85825045 // ldr z5, [x2, #20, MUL VL] + WORD $0x85825446 // ldr z6, [x2, #21, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85825845 // ldr z5, [x2, #22, MUL VL] + WORD $0x85825c46 // ldr z6, [x2, #23, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, 
z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 3 to 4 outputs + WORD $0x85804107 // ldr z7, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85834045 // ldr z5, [x2, #24, MUL VL] + WORD $0x85834446 // ldr z6, [x2, #25, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85834845 // ldr z5, [x2, #26, MUL VL] + WORD $0x85834c46 // ldr z6, [x2, #27, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85835045 // ldr z5, [x2, #28, MUL VL] + WORD $0x85835446 // ldr z6, [x2, #29, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85835845 // ldr z5, [x2, #30, MUL VL] + WORD $0x85835c46 // ldr z6, [x2, #31, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 4 to 4 outputs + WORD $0x85804127 // ldr z7, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85844045 // ldr z5, [x2, #32, MUL VL] + WORD $0x85844446 // ldr z6, [x2, #33, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85844845 // ldr z5, [x2, #34, MUL VL] + WORD $0x85844c46 // ldr z6, [x2, #35, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85845045 // ldr z5, [x2, #36, MUL VL] + WORD $0x85845446 // ldr z6, [x2, #37, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85845845 // ldr z5, [x2, #38, MUL VL] + WORD $0x85845c46 // ldr z6, [x2, #39, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 5 to 4 outputs + WORD $0x85804147 // ldr z7, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85854045 // ldr z5, [x2, #40, MUL VL] + WORD $0x85854446 // ldr z6, [x2, #41, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85854845 // ldr z5, [x2, #42, MUL VL] + WORD $0x85854c46 // ldr z6, [x2, #43, MUL 
VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85855045 // ldr z5, [x2, #44, MUL VL] + WORD $0x85855446 // ldr z6, [x2, #45, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85855845 // ldr z5, [x2, #46, MUL VL] + WORD $0x85855c46 // ldr z6, [x2, #47, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 6 to 4 outputs + WORD $0x85804167 // ldr z7, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85864045 // ldr z5, [x2, #48, MUL VL] + WORD $0x85864446 // ldr z6, [x2, #49, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85864845 // ldr z5, [x2, #50, MUL VL] + WORD $0x85864c46 // ldr z6, [x2, #51, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85865045 // ldr z5, [x2, #52, MUL VL] + WORD $0x85865446 // ldr z6, [x2, #53, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85865845 // ldr z5, [x2, #54, MUL VL] + WORD $0x85865c46 // ldr z6, [x2, #55, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 7 to 4 outputs + WORD $0x85804187 // ldr z7, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85874045 // ldr z5, [x2, #56, MUL VL] + WORD $0x85874446 // ldr z6, [x2, #57, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85874845 // ldr z5, [x2, #58, MUL VL] + WORD $0x85874c46 // ldr z6, [x2, #59, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85875045 // ldr z5, [x2, #60, MUL VL] + WORD $0x85875446 // ldr z6, [x2, #61, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85875845 // ldr z5, [x2, #62, MUL VL] + WORD $0x85875c46 // ldr z6, [x2, #63, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early 
termination + CMP $8, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 8 to 4 outputs + WORD $0x858041a7 // ldr z7, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85884045 // ldr z5, [x2, #64, MUL VL] + WORD $0x85884446 // ldr z6, [x2, #65, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85884845 // ldr z5, [x2, #66, MUL VL] + WORD $0x85884c46 // ldr z6, [x2, #67, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85885045 // ldr z5, [x2, #68, MUL VL] + WORD $0x85885446 // ldr z6, [x2, #69, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85885845 // ldr z5, [x2, #70, MUL VL] + WORD $0x85885c46 // ldr z6, [x2, #71, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x4Xor_store + + // Load and process 32 bytes from input 9 to 4 outputs + WORD $0x85804067 // ldr z7, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 + WORD $0x042430e7 // and z7.d, z7.d, z4.d + WORD $0x04243108 // and z8.d, z8.d, z4.d + WORD $0x85894045 // ldr z5, [x2, #72, MUL VL] + WORD $0x85894446 // ldr z6, [x2, #73, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53000 // eor z0.d, z0.d, z5.d + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x85894845 // ldr z5, [x2, #74, MUL VL] + WORD $0x85894c46 // ldr z6, [x2, #75, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53021 // eor z1.d, z1.d, z5.d + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x85895045 // ldr z5, [x2, #76, MUL VL] + WORD $0x85895446 // ldr z6, [x2, #77, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53042 // eor z2.d, z2.d, z5.d + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x85895845 // ldr z5, [x2, #78, MUL VL] + WORD $0x85895c46 // ldr z6, [x2, #79, MUL VL] + WORD $0x052730a5 // tbl z5.b, z5.b, z7.b + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x04a53063 // eor z3.d, z3.d, z5.d + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + +mulSve_10x4Xor_store: + // Store 4 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x4Xor_loop + +mulSve_10x4Xor_end: + RET + +// func mulSve_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // 
Full registers estimated 110 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x5_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c5 // mov z5.d, x6 + WORD $0x052120a5 // dup z5.b, z5.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + WORD $0x85804028 // ldr z8, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85804046 // ldr z6, [x2] + WORD $0x85804447 // ldr z7, [x2, #1, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e0 // eor z0.d, z7.d, z6.d + WORD $0x85804846 // ldr z6, [x2, #2, MUL VL] + WORD $0x85804c47 // ldr z7, [x2, #3, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e1 // eor z1.d, z7.d, z6.d + WORD $0x85805046 // ldr z6, [x2, #4, MUL VL] + WORD $0x85805447 // ldr z7, [x2, #5, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e2 // eor z2.d, z7.d, z6.d + WORD $0x85805846 // ldr z6, [x2, #6, MUL VL] + WORD $0x85805c47 // ldr z7, [x2, #7, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e3 // eor z3.d, z7.d, z6.d + WORD $0x85814046 // ldr z6, [x2, #8, MUL VL] + WORD $0x85814447 // ldr z7, [x2, #9, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a630e4 // eor z4.d, z7.d, z6.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 1 to 5 outputs + WORD $0x85804088 // ldr z8, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85814846 // ldr z6, [x2, #10, MUL VL] + WORD $0x85814c47 // ldr z7, [x2, #11, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85815046 // ldr z6, [x2, #12, MUL VL] + WORD $0x85815447 // ldr z7, [x2, #13, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85815846 // ldr z6, [x2, #14, MUL VL] + WORD $0x85815c47 // ldr z7, [x2, #15, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD 
$0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85824046 // ldr z6, [x2, #16, MUL VL] + WORD $0x85824447 // ldr z7, [x2, #17, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85824846 // ldr z6, [x2, #18, MUL VL] + WORD $0x85824c47 // ldr z7, [x2, #19, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 2 to 5 outputs + WORD $0x858040a8 // ldr z8, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85825046 // ldr z6, [x2, #20, MUL VL] + WORD $0x85825447 // ldr z7, [x2, #21, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85825846 // ldr z6, [x2, #22, MUL VL] + WORD $0x85825c47 // ldr z7, [x2, #23, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85834046 // ldr z6, [x2, #24, MUL VL] + WORD $0x85834447 // ldr z7, [x2, #25, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85834846 // ldr z6, [x2, #26, MUL VL] + WORD $0x85834c47 // ldr z7, [x2, #27, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85835046 // ldr z6, [x2, #28, MUL VL] + WORD $0x85835447 // ldr z7, [x2, #29, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 3 to 5 outputs + WORD $0x85804108 // ldr z8, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85835846 // ldr z6, [x2, #30, MUL VL] + WORD $0x85835c47 // ldr z7, [x2, #31, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85844046 // ldr z6, [x2, #32, MUL VL] + WORD $0x85844447 // ldr z7, [x2, #33, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85844846 // ldr z6, [x2, #34, MUL VL] + WORD $0x85844c47 // ldr z7, [x2, #35, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85845046 // ldr z6, [x2, #36, MUL VL] + WORD $0x85845447 // ldr z7, [x2, #37, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, 
z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85845846 // ldr z6, [x2, #38, MUL VL] + WORD $0x85845c47 // ldr z7, [x2, #39, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 4 to 5 outputs + WORD $0x85804128 // ldr z8, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85854046 // ldr z6, [x2, #40, MUL VL] + WORD $0x85854447 // ldr z7, [x2, #41, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85854846 // ldr z6, [x2, #42, MUL VL] + WORD $0x85854c47 // ldr z7, [x2, #43, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85855046 // ldr z6, [x2, #44, MUL VL] + WORD $0x85855447 // ldr z7, [x2, #45, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85855846 // ldr z6, [x2, #46, MUL VL] + WORD $0x85855c47 // ldr z7, [x2, #47, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85864046 // ldr z6, [x2, #48, MUL VL] + WORD $0x85864447 // ldr z7, [x2, #49, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 5 to 5 outputs + WORD $0x85804148 // ldr z8, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85864846 // ldr z6, [x2, #50, MUL VL] + WORD $0x85864c47 // ldr z7, [x2, #51, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85865046 // ldr z6, [x2, #52, MUL VL] + WORD $0x85865447 // ldr z7, [x2, #53, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85865846 // ldr z6, [x2, #54, MUL VL] + WORD $0x85865c47 // ldr z7, [x2, #55, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85874046 // ldr z6, [x2, #56, MUL VL] + WORD $0x85874447 // ldr z7, [x2, #57, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85874846 // ldr z6, [x2, #58, MUL VL] + WORD $0x85874c47 // ldr z7, [x2, #59, MUL VL] + WORD $0x052830c6 // 
tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 6 to 5 outputs + WORD $0x85804168 // ldr z8, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85875046 // ldr z6, [x2, #60, MUL VL] + WORD $0x85875447 // ldr z7, [x2, #61, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85875846 // ldr z6, [x2, #62, MUL VL] + WORD $0x85875c47 // ldr z7, [x2, #63, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85884046 // ldr z6, [x2, #64, MUL VL] + WORD $0x85884447 // ldr z7, [x2, #65, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85884846 // ldr z6, [x2, #66, MUL VL] + WORD $0x85884c47 // ldr z7, [x2, #67, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85885046 // ldr z6, [x2, #68, MUL VL] + WORD $0x85885447 // ldr z7, [x2, #69, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 7 to 5 outputs + WORD $0x85804188 // ldr z8, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85885846 // ldr z6, [x2, #70, MUL VL] + WORD $0x85885c47 // ldr z7, [x2, #71, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85894046 // ldr z6, [x2, #72, MUL VL] + WORD $0x85894447 // ldr z7, [x2, #73, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85894846 // ldr z6, [x2, #74, MUL VL] + WORD $0x85894c47 // ldr z7, [x2, #75, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85895046 // ldr z6, [x2, #76, MUL VL] + WORD $0x85895447 // ldr z7, [x2, #77, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85895846 // ldr z6, [x2, #78, MUL VL] + WORD $0x85895c47 // ldr z7, [x2, #79, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $8, R16 + BEQ 
mulSve_10x5_store + + // Load and process 32 bytes from input 8 to 5 outputs + WORD $0x858041a8 // ldr z8, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x858a4046 // ldr z6, [x2, #80, MUL VL] + WORD $0x858a4447 // ldr z7, [x2, #81, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x858a4846 // ldr z6, [x2, #82, MUL VL] + WORD $0x858a4c47 // ldr z7, [x2, #83, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x858a5046 // ldr z6, [x2, #84, MUL VL] + WORD $0x858a5447 // ldr z7, [x2, #85, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x858a5846 // ldr z6, [x2, #86, MUL VL] + WORD $0x858a5c47 // ldr z7, [x2, #87, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x858b4046 // ldr z6, [x2, #88, MUL VL] + WORD $0x858b4447 // ldr z7, [x2, #89, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x5_store + + // Load and process 32 bytes from input 9 to 5 outputs + WORD $0x85804068 // ldr z8, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x858b4846 // ldr z6, [x2, #90, MUL VL] + WORD $0x858b4c47 // ldr z7, [x2, #91, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x858b5046 // ldr z6, [x2, #92, MUL VL] + WORD $0x858b5447 // ldr z7, [x2, #93, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x858b5846 // ldr z6, [x2, #94, MUL VL] + WORD $0x858b5c47 // ldr z7, [x2, #95, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x858c4046 // ldr z6, [x2, #96, MUL VL] + WORD $0x858c4447 // ldr z7, [x2, #97, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x858c4846 // ldr z6, [x2, #98, MUL VL] + WORD $0x858c4c47 // ldr z7, [x2, #99, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + +mulSve_10x5_store: + // Store 5 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, 
x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x5_loop + +mulSve_10x5_end: + RET + +// func mulSve_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x5Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c5 // mov z5.d, x6 + WORD $0x052120a5 // dup z5.b, z5.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + WORD $0x85804028 // ldr z8, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804046 // ldr z6, [x2] + WORD $0x85804447 // ldr z7, [x2, #1, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804846 // ldr z6, [x2, #2, MUL VL] + WORD $0x85804c47 // ldr z7, [x2, #3, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805046 // ldr z6, [x2, #4, MUL VL] + WORD $0x85805447 // ldr z7, [x2, #5, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805846 // ldr z6, [x2, #6, MUL VL] + WORD $0x85805c47 // ldr z7, [x2, #7, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814046 // ldr z6, [x2, #8, MUL VL] + WORD $0x85814447 // ldr z7, [x2, #9, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, 
z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 1 to 5 outputs + WORD $0x85804088 // ldr z8, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85814846 // ldr z6, [x2, #10, MUL VL] + WORD $0x85814c47 // ldr z7, [x2, #11, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85815046 // ldr z6, [x2, #12, MUL VL] + WORD $0x85815447 // ldr z7, [x2, #13, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85815846 // ldr z6, [x2, #14, MUL VL] + WORD $0x85815c47 // ldr z7, [x2, #15, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85824046 // ldr z6, [x2, #16, MUL VL] + WORD $0x85824447 // ldr z7, [x2, #17, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85824846 // ldr z6, [x2, #18, MUL VL] + WORD $0x85824c47 // ldr z7, [x2, #19, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 2 to 5 outputs + WORD $0x858040a8 // ldr z8, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85825046 // ldr z6, [x2, #20, MUL VL] + WORD $0x85825447 // ldr z7, [x2, #21, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85825846 // ldr z6, [x2, #22, MUL VL] + WORD $0x85825c47 // ldr z7, [x2, #23, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85834046 // ldr z6, [x2, #24, MUL VL] + WORD $0x85834447 // ldr z7, [x2, #25, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85834846 // ldr z6, [x2, #26, MUL VL] + WORD $0x85834c47 // ldr z7, [x2, #27, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85835046 // ldr z6, [x2, #28, MUL VL] + WORD $0x85835447 // ldr z7, [x2, #29, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from 
input 3 to 5 outputs + WORD $0x85804108 // ldr z8, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85835846 // ldr z6, [x2, #30, MUL VL] + WORD $0x85835c47 // ldr z7, [x2, #31, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85844046 // ldr z6, [x2, #32, MUL VL] + WORD $0x85844447 // ldr z7, [x2, #33, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85844846 // ldr z6, [x2, #34, MUL VL] + WORD $0x85844c47 // ldr z7, [x2, #35, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85845046 // ldr z6, [x2, #36, MUL VL] + WORD $0x85845447 // ldr z7, [x2, #37, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85845846 // ldr z6, [x2, #38, MUL VL] + WORD $0x85845c47 // ldr z7, [x2, #39, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 4 to 5 outputs + WORD $0x85804128 // ldr z8, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85854046 // ldr z6, [x2, #40, MUL VL] + WORD $0x85854447 // ldr z7, [x2, #41, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85854846 // ldr z6, [x2, #42, MUL VL] + WORD $0x85854c47 // ldr z7, [x2, #43, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85855046 // ldr z6, [x2, #44, MUL VL] + WORD $0x85855447 // ldr z7, [x2, #45, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85855846 // ldr z6, [x2, #46, MUL VL] + WORD $0x85855c47 // ldr z7, [x2, #47, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85864046 // ldr z6, [x2, #48, MUL VL] + WORD $0x85864447 // ldr z7, [x2, #49, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 5 to 5 outputs + WORD $0x85804148 // ldr z8, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and 
z9.d, z9.d, z5.d + WORD $0x85864846 // ldr z6, [x2, #50, MUL VL] + WORD $0x85864c47 // ldr z7, [x2, #51, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85865046 // ldr z6, [x2, #52, MUL VL] + WORD $0x85865447 // ldr z7, [x2, #53, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85865846 // ldr z6, [x2, #54, MUL VL] + WORD $0x85865c47 // ldr z7, [x2, #55, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85874046 // ldr z6, [x2, #56, MUL VL] + WORD $0x85874447 // ldr z7, [x2, #57, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85874846 // ldr z6, [x2, #58, MUL VL] + WORD $0x85874c47 // ldr z7, [x2, #59, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 6 to 5 outputs + WORD $0x85804168 // ldr z8, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85875046 // ldr z6, [x2, #60, MUL VL] + WORD $0x85875447 // ldr z7, [x2, #61, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85875846 // ldr z6, [x2, #62, MUL VL] + WORD $0x85875c47 // ldr z7, [x2, #63, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85884046 // ldr z6, [x2, #64, MUL VL] + WORD $0x85884447 // ldr z7, [x2, #65, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85884846 // ldr z6, [x2, #66, MUL VL] + WORD $0x85884c47 // ldr z7, [x2, #67, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85885046 // ldr z6, [x2, #68, MUL VL] + WORD $0x85885447 // ldr z7, [x2, #69, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 7 to 5 outputs + WORD $0x85804188 // ldr z8, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x85885846 // ldr z6, [x2, #70, MUL VL] + WORD $0x85885c47 // ldr z7, [x2, #71, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + 
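+	// The pattern above implements a GF(2^8) multiply: each input byte is
+	// split into low and high nibbles ('and'/'lsr #4' against the 0x0f
+	// mask in z5), each nibble selects from a 16-entry lookup table via
+	// 'tbl', and both halves are xor-ed into the output accumulator.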
WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x85894046 // ldr z6, [x2, #72, MUL VL] + WORD $0x85894447 // ldr z7, [x2, #73, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x85894846 // ldr z6, [x2, #74, MUL VL] + WORD $0x85894c47 // ldr z7, [x2, #75, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x85895046 // ldr z6, [x2, #76, MUL VL] + WORD $0x85895447 // ldr z7, [x2, #77, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x85895846 // ldr z6, [x2, #78, MUL VL] + WORD $0x85895c47 // ldr z7, [x2, #79, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 8 to 5 outputs + WORD $0x858041a8 // ldr z8, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x858a4046 // ldr z6, [x2, #80, MUL VL] + WORD $0x858a4447 // ldr z7, [x2, #81, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x858a4846 // ldr z6, [x2, #82, MUL VL] + WORD $0x858a4c47 // ldr z7, [x2, #83, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x858a5046 // ldr z6, [x2, #84, MUL VL] + WORD $0x858a5447 // ldr z7, [x2, #85, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x858a5846 // ldr z6, [x2, #86, MUL VL] + WORD $0x858a5c47 // ldr z7, [x2, #87, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x858b4046 // ldr z6, [x2, #88, MUL VL] + WORD $0x858b4447 // ldr z7, [x2, #89, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x5Xor_store + + // Load and process 32 bytes from input 9 to 5 outputs + WORD $0x85804068 // ldr z8, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc9509 // lsr z9.d, z8.d, #4 + WORD $0x04253108 // and z8.d, z8.d, z5.d + WORD $0x04253129 // and z9.d, z9.d, z5.d + WORD $0x858b4846 // ldr z6, [x2, #90, MUL VL] + WORD $0x858b4c47 // ldr z7, [x2, #91, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63000 // eor z0.d, z0.d, z6.d + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x858b5046 // ldr z6, [x2, #92, MUL VL] + WORD $0x858b5447 // ldr z7, [x2, #93, MUL VL] + WORD $0x052830c6 // tbl 
z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63021 // eor z1.d, z1.d, z6.d + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x858b5846 // ldr z6, [x2, #94, MUL VL] + WORD $0x858b5c47 // ldr z7, [x2, #95, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63042 // eor z2.d, z2.d, z6.d + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x858c4046 // ldr z6, [x2, #96, MUL VL] + WORD $0x858c4447 // ldr z7, [x2, #97, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63063 // eor z3.d, z3.d, z6.d + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x858c4846 // ldr z6, [x2, #98, MUL VL] + WORD $0x858c4c47 // ldr z7, [x2, #99, MUL VL] + WORD $0x052830c6 // tbl z6.b, z6.b, z8.b + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x04a63084 // eor z4.d, z4.d, z6.d + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + +mulSve_10x5Xor_store: + // Store 5 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x5Xor_loop + +mulSve_10x5Xor_end: + RET + +// func mulSve_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x6(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x6_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c6 // mov z6.d, x6 + WORD $0x052120c6 // dup z6.b, z6.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + WORD $0x85804029 // ldr z9, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85804047 // ldr z7, [x2] + WORD $0x85804448 // ldr z8, [x2, #1, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73100 // eor z0.d, z8.d, z7.d + WORD $0x85804847 // ldr z7, [x2, #2, MUL VL] + WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD 
$0x04a73101 // eor z1.d, z8.d, z7.d + WORD $0x85805047 // ldr z7, [x2, #4, MUL VL] + WORD $0x85805448 // ldr z8, [x2, #5, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73102 // eor z2.d, z8.d, z7.d + WORD $0x85805847 // ldr z7, [x2, #6, MUL VL] + WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73103 // eor z3.d, z8.d, z7.d + WORD $0x85814047 // ldr z7, [x2, #8, MUL VL] + WORD $0x85814448 // ldr z8, [x2, #9, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73104 // eor z4.d, z8.d, z7.d + WORD $0x85814847 // ldr z7, [x2, #10, MUL VL] + WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73105 // eor z5.d, z8.d, z7.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 1 to 6 outputs + WORD $0x85804089 // ldr z9, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85815047 // ldr z7, [x2, #12, MUL VL] + WORD $0x85815448 // ldr z8, [x2, #13, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85815847 // ldr z7, [x2, #14, MUL VL] + WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85824047 // ldr z7, [x2, #16, MUL VL] + WORD $0x85824448 // ldr z8, [x2, #17, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85824847 // ldr z7, [x2, #18, MUL VL] + WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85825047 // ldr z7, [x2, #20, MUL VL] + WORD $0x85825448 // ldr z8, [x2, #21, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85825847 // ldr z7, [x2, #22, MUL VL] + WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 2 to 6 outputs + WORD $0x858040a9 // ldr z9, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85834047 // ldr z7, [x2, #24, MUL VL] + WORD $0x85834448 // ldr z8, [x2, #25, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85834847 // ldr z7, [x2, #26, MUL VL] + WORD $0x85834c48 // 
ldr z8, [x2, #27, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85835047 // ldr z7, [x2, #28, MUL VL] + WORD $0x85835448 // ldr z8, [x2, #29, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85835847 // ldr z7, [x2, #30, MUL VL] + WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85844047 // ldr z7, [x2, #32, MUL VL] + WORD $0x85844448 // ldr z8, [x2, #33, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85844847 // ldr z7, [x2, #34, MUL VL] + WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 3 to 6 outputs + WORD $0x85804109 // ldr z9, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85845047 // ldr z7, [x2, #36, MUL VL] + WORD $0x85845448 // ldr z8, [x2, #37, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85845847 // ldr z7, [x2, #38, MUL VL] + WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85854047 // ldr z7, [x2, #40, MUL VL] + WORD $0x85854448 // ldr z8, [x2, #41, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85854847 // ldr z7, [x2, #42, MUL VL] + WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85855047 // ldr z7, [x2, #44, MUL VL] + WORD $0x85855448 // ldr z8, [x2, #45, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85855847 // ldr z7, [x2, #46, MUL VL] + WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 4 to 6 outputs + WORD $0x85804129 // ldr z9, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d 
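+	// Scalar sketch of the lookup step that follows (illustrative names,
+	// not part of this file): for each input byte b,
+	//   out[i] ^= tblLow[b&0x0f] ^ tblHigh[b>>4]
+	// where tblLow/tblHigh are the two per-output tables loaded with
+	// consecutive 'MUL VL' offsets from the matrix pointer in x2.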
+ WORD $0x85864047 // ldr z7, [x2, #48, MUL VL] + WORD $0x85864448 // ldr z8, [x2, #49, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85864847 // ldr z7, [x2, #50, MUL VL] + WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85865047 // ldr z7, [x2, #52, MUL VL] + WORD $0x85865448 // ldr z8, [x2, #53, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85865847 // ldr z7, [x2, #54, MUL VL] + WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85874047 // ldr z7, [x2, #56, MUL VL] + WORD $0x85874448 // ldr z8, [x2, #57, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85874847 // ldr z7, [x2, #58, MUL VL] + WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 5 to 6 outputs + WORD $0x85804149 // ldr z9, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85875047 // ldr z7, [x2, #60, MUL VL] + WORD $0x85875448 // ldr z8, [x2, #61, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85875847 // ldr z7, [x2, #62, MUL VL] + WORD $0x85875c48 // ldr z8, [x2, #63, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85884047 // ldr z7, [x2, #64, MUL VL] + WORD $0x85884448 // ldr z8, [x2, #65, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85884847 // ldr z7, [x2, #66, MUL VL] + WORD $0x85884c48 // ldr z8, [x2, #67, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85885047 // ldr z7, [x2, #68, MUL VL] + WORD $0x85885448 // ldr z8, [x2, #69, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85885847 // ldr z7, [x2, #70, MUL VL] + WORD $0x85885c48 // ldr z8, [x2, #71, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, 
z5.d, z8.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 6 to 6 outputs + WORD $0x85804169 // ldr z9, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85894047 // ldr z7, [x2, #72, MUL VL] + WORD $0x85894448 // ldr z8, [x2, #73, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85894847 // ldr z7, [x2, #74, MUL VL] + WORD $0x85894c48 // ldr z8, [x2, #75, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85895047 // ldr z7, [x2, #76, MUL VL] + WORD $0x85895448 // ldr z8, [x2, #77, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85895847 // ldr z7, [x2, #78, MUL VL] + WORD $0x85895c48 // ldr z8, [x2, #79, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858a4047 // ldr z7, [x2, #80, MUL VL] + WORD $0x858a4448 // ldr z8, [x2, #81, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858a4847 // ldr z7, [x2, #82, MUL VL] + WORD $0x858a4c48 // ldr z8, [x2, #83, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 7 to 6 outputs + WORD $0x85804189 // ldr z9, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858a5047 // ldr z7, [x2, #84, MUL VL] + WORD $0x858a5448 // ldr z8, [x2, #85, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858a5847 // ldr z7, [x2, #86, MUL VL] + WORD $0x858a5c48 // ldr z8, [x2, #87, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858b4047 // ldr z7, [x2, #88, MUL VL] + WORD $0x858b4448 // ldr z8, [x2, #89, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858b4847 // ldr z7, [x2, #90, MUL VL] + WORD $0x858b4c48 // ldr z8, [x2, #91, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858b5047 // ldr z7, [x2, #92, MUL VL] + WORD $0x858b5448 // ldr z8, [x2, #93, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, 
z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858b5847 // ldr z7, [x2, #94, MUL VL] + WORD $0x858b5c48 // ldr z8, [x2, #95, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 8 to 6 outputs + WORD $0x858041a9 // ldr z9, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858c4047 // ldr z7, [x2, #96, MUL VL] + WORD $0x858c4448 // ldr z8, [x2, #97, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858c4847 // ldr z7, [x2, #98, MUL VL] + WORD $0x858c4c48 // ldr z8, [x2, #99, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858c5047 // ldr z7, [x2, #100, MUL VL] + WORD $0x858c5448 // ldr z8, [x2, #101, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858c5847 // ldr z7, [x2, #102, MUL VL] + WORD $0x858c5c48 // ldr z8, [x2, #103, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858d4047 // ldr z7, [x2, #104, MUL VL] + WORD $0x858d4448 // ldr z8, [x2, #105, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858d4847 // ldr z7, [x2, #106, MUL VL] + WORD $0x858d4c48 // ldr z8, [x2, #107, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x6_store + + // Load and process 32 bytes from input 9 to 6 outputs + WORD $0x85804069 // ldr z9, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858d5047 // ldr z7, [x2, #108, MUL VL] + WORD $0x858d5448 // ldr z8, [x2, #109, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858d5847 // ldr z7, [x2, #110, MUL VL] + WORD $0x858d5c48 // ldr z8, [x2, #111, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858e4047 // ldr z7, [x2, #112, MUL VL] + WORD $0x858e4448 // ldr z8, [x2, #113, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858e4847 // ldr z7, [x2, #114, MUL VL] + WORD $0x858e4c48 // ldr z8, [x2, #115, 
MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858e5047 // ldr z7, [x2, #116, MUL VL] + WORD $0x858e5448 // ldr z8, [x2, #117, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858e5847 // ldr z7, [x2, #118, MUL VL] + WORD $0x858e5c48 // ldr z8, [x2, #119, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + +mulSve_10x6_store: + // Store 6 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x6_loop + +mulSve_10x6_end: + RET + +// func mulSve_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x6Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c6 // mov z6.d, x6 + WORD $0x052120c6 // dup z6.b, z6.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + WORD $0x85804029 // ldr z9, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804047 // ldr z7, [x2] + WORD $0x85804448 // ldr z8, [x2, #1, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804847 // ldr z7, [x2, #2, MUL VL] + WORD $0x85804c48 // ldr z8, [x2, #3, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + 
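+	// In the Xor variant each output vector is first loaded from memory
+	// ('ld1d') and the products are xor-ed into it, accumulating on top
+	// of existing data rather than computing fresh outputs as
+	// mulSve_10x6 does.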
WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805047 // ldr z7, [x2, #4, MUL VL] + WORD $0x85805448 // ldr z8, [x2, #5, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805847 // ldr z7, [x2, #6, MUL VL] + WORD $0x85805c48 // ldr z8, [x2, #7, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814047 // ldr z7, [x2, #8, MUL VL] + WORD $0x85814448 // ldr z8, [x2, #9, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814847 // ldr z7, [x2, #10, MUL VL] + WORD $0x85814c48 // ldr z8, [x2, #11, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 1 to 6 outputs + WORD $0x85804089 // ldr z9, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85815047 // ldr z7, [x2, #12, MUL VL] + WORD $0x85815448 // ldr z8, [x2, #13, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85815847 // ldr z7, [x2, #14, MUL VL] + WORD $0x85815c48 // ldr z8, [x2, #15, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85824047 // ldr z7, [x2, #16, MUL VL] + WORD $0x85824448 // ldr z8, [x2, #17, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85824847 // ldr z7, [x2, #18, MUL VL] + WORD $0x85824c48 // ldr z8, [x2, #19, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85825047 // ldr z7, [x2, #20, MUL VL] + WORD $0x85825448 // ldr z8, [x2, #21, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85825847 // ldr z7, [x2, #22, MUL VL] + WORD $0x85825c48 // ldr z8, [x2, #23, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x6Xor_store + + // Load 
and process 32 bytes from input 2 to 6 outputs + WORD $0x858040a9 // ldr z9, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85834047 // ldr z7, [x2, #24, MUL VL] + WORD $0x85834448 // ldr z8, [x2, #25, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85834847 // ldr z7, [x2, #26, MUL VL] + WORD $0x85834c48 // ldr z8, [x2, #27, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85835047 // ldr z7, [x2, #28, MUL VL] + WORD $0x85835448 // ldr z8, [x2, #29, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85835847 // ldr z7, [x2, #30, MUL VL] + WORD $0x85835c48 // ldr z8, [x2, #31, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85844047 // ldr z7, [x2, #32, MUL VL] + WORD $0x85844448 // ldr z8, [x2, #33, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85844847 // ldr z7, [x2, #34, MUL VL] + WORD $0x85844c48 // ldr z8, [x2, #35, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 3 to 6 outputs + WORD $0x85804109 // ldr z9, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85845047 // ldr z7, [x2, #36, MUL VL] + WORD $0x85845448 // ldr z8, [x2, #37, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85845847 // ldr z7, [x2, #38, MUL VL] + WORD $0x85845c48 // ldr z8, [x2, #39, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85854047 // ldr z7, [x2, #40, MUL VL] + WORD $0x85854448 // ldr z8, [x2, #41, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85854847 // ldr z7, [x2, #42, MUL VL] + WORD $0x85854c48 // ldr z8, [x2, #43, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85855047 // ldr z7, [x2, #44, MUL VL] + WORD $0x85855448 // ldr z8, [x2, #45, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD 
$0x85855847 // ldr z7, [x2, #46, MUL VL] + WORD $0x85855c48 // ldr z8, [x2, #47, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 4 to 6 outputs + WORD $0x85804129 // ldr z9, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85864047 // ldr z7, [x2, #48, MUL VL] + WORD $0x85864448 // ldr z8, [x2, #49, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85864847 // ldr z7, [x2, #50, MUL VL] + WORD $0x85864c48 // ldr z8, [x2, #51, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85865047 // ldr z7, [x2, #52, MUL VL] + WORD $0x85865448 // ldr z8, [x2, #53, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85865847 // ldr z7, [x2, #54, MUL VL] + WORD $0x85865c48 // ldr z8, [x2, #55, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85874047 // ldr z7, [x2, #56, MUL VL] + WORD $0x85874448 // ldr z8, [x2, #57, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85874847 // ldr z7, [x2, #58, MUL VL] + WORD $0x85874c48 // ldr z8, [x2, #59, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 5 to 6 outputs + WORD $0x85804149 // ldr z9, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85875047 // ldr z7, [x2, #60, MUL VL] + WORD $0x85875448 // ldr z8, [x2, #61, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85875847 // ldr z7, [x2, #62, MUL VL] + WORD $0x85875c48 // ldr z8, [x2, #63, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85884047 // ldr z7, [x2, #64, MUL VL] + WORD $0x85884448 // ldr z8, [x2, #65, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85884847 // ldr z7, [x2, #66, MUL VL] + WORD $0x85884c48 // ldr z8, [x2, #67, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD 
$0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x85885047 // ldr z7, [x2, #68, MUL VL] + WORD $0x85885448 // ldr z8, [x2, #69, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x85885847 // ldr z7, [x2, #70, MUL VL] + WORD $0x85885c48 // ldr z8, [x2, #71, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 6 to 6 outputs + WORD $0x85804169 // ldr z9, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x85894047 // ldr z7, [x2, #72, MUL VL] + WORD $0x85894448 // ldr z8, [x2, #73, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x85894847 // ldr z7, [x2, #74, MUL VL] + WORD $0x85894c48 // ldr z8, [x2, #75, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x85895047 // ldr z7, [x2, #76, MUL VL] + WORD $0x85895448 // ldr z8, [x2, #77, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x85895847 // ldr z7, [x2, #78, MUL VL] + WORD $0x85895c48 // ldr z8, [x2, #79, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858a4047 // ldr z7, [x2, #80, MUL VL] + WORD $0x858a4448 // ldr z8, [x2, #81, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858a4847 // ldr z7, [x2, #82, MUL VL] + WORD $0x858a4c48 // ldr z8, [x2, #83, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 7 to 6 outputs + WORD $0x85804189 // ldr z9, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858a5047 // ldr z7, [x2, #84, MUL VL] + WORD $0x858a5448 // ldr z8, [x2, #85, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858a5847 // ldr z7, [x2, #86, MUL VL] + WORD $0x858a5c48 // ldr z8, [x2, #87, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858b4047 // ldr z7, [x2, #88, MUL VL] + WORD $0x858b4448 // ldr z8, [x2, #89, MUL VL] + WORD 
$0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858b4847 // ldr z7, [x2, #90, MUL VL] + WORD $0x858b4c48 // ldr z8, [x2, #91, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858b5047 // ldr z7, [x2, #92, MUL VL] + WORD $0x858b5448 // ldr z8, [x2, #93, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858b5847 // ldr z7, [x2, #94, MUL VL] + WORD $0x858b5c48 // ldr z8, [x2, #95, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 8 to 6 outputs + WORD $0x858041a9 // ldr z9, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858c4047 // ldr z7, [x2, #96, MUL VL] + WORD $0x858c4448 // ldr z8, [x2, #97, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x858c4847 // ldr z7, [x2, #98, MUL VL] + WORD $0x858c4c48 // ldr z8, [x2, #99, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858c5047 // ldr z7, [x2, #100, MUL VL] + WORD $0x858c5448 // ldr z8, [x2, #101, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858c5847 // ldr z7, [x2, #102, MUL VL] + WORD $0x858c5c48 // ldr z8, [x2, #103, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858d4047 // ldr z7, [x2, #104, MUL VL] + WORD $0x858d4448 // ldr z8, [x2, #105, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858d4847 // ldr z7, [x2, #106, MUL VL] + WORD $0x858d4c48 // ldr z8, [x2, #107, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x6Xor_store + + // Load and process 32 bytes from input 9 to 6 outputs + WORD $0x85804069 // ldr z9, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc952a // lsr z10.d, z9.d, #4 + WORD $0x04263129 // and z9.d, z9.d, z6.d + WORD $0x0426314a // and z10.d, z10.d, z6.d + WORD $0x858d5047 // ldr z7, [x2, #108, MUL VL] + WORD $0x858d5448 // ldr z8, [x2, #109, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73000 // eor z0.d, z0.d, z7.d + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD 
$0x858d5847 // ldr z7, [x2, #110, MUL VL] + WORD $0x858d5c48 // ldr z8, [x2, #111, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73021 // eor z1.d, z1.d, z7.d + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x858e4047 // ldr z7, [x2, #112, MUL VL] + WORD $0x858e4448 // ldr z8, [x2, #113, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73042 // eor z2.d, z2.d, z7.d + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x858e4847 // ldr z7, [x2, #114, MUL VL] + WORD $0x858e4c48 // ldr z8, [x2, #115, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73063 // eor z3.d, z3.d, z7.d + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x858e5047 // ldr z7, [x2, #116, MUL VL] + WORD $0x858e5448 // ldr z8, [x2, #117, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a73084 // eor z4.d, z4.d, z7.d + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x858e5847 // ldr z7, [x2, #118, MUL VL] + WORD $0x858e5c48 // ldr z8, [x2, #119, MUL VL] + WORD $0x052930e7 // tbl z7.b, z7.b, z9.b + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x04a730a5 // eor z5.d, z5.d, z7.d + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + +mulSve_10x6Xor_store: + // Store 6 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x6Xor_loop + +mulSve_10x6Xor_end: + RET + +// func mulSve_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x7_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c7 // mov z7.d, x6 + WORD $0x052120e7 // dup z7.b, z7.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + WORD $0x8580402a // ldr z10, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD 
$0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85804048 // ldr z8, [x2] + WORD $0x85804449 // ldr z9, [x2, #1, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83120 // eor z0.d, z9.d, z8.d + WORD $0x85804848 // ldr z8, [x2, #2, MUL VL] + WORD $0x85804c49 // ldr z9, [x2, #3, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83121 // eor z1.d, z9.d, z8.d + WORD $0x85805048 // ldr z8, [x2, #4, MUL VL] + WORD $0x85805449 // ldr z9, [x2, #5, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83122 // eor z2.d, z9.d, z8.d + WORD $0x85805848 // ldr z8, [x2, #6, MUL VL] + WORD $0x85805c49 // ldr z9, [x2, #7, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83123 // eor z3.d, z9.d, z8.d + WORD $0x85814048 // ldr z8, [x2, #8, MUL VL] + WORD $0x85814449 // ldr z9, [x2, #9, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83124 // eor z4.d, z9.d, z8.d + WORD $0x85814848 // ldr z8, [x2, #10, MUL VL] + WORD $0x85814c49 // ldr z9, [x2, #11, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83125 // eor z5.d, z9.d, z8.d + WORD $0x85815048 // ldr z8, [x2, #12, MUL VL] + WORD $0x85815449 // ldr z9, [x2, #13, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83126 // eor z6.d, z9.d, z8.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 1 to 7 outputs + WORD $0x8580408a // ldr z10, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85815848 // ldr z8, [x2, #14, MUL VL] + WORD $0x85815c49 // ldr z9, [x2, #15, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85824048 // ldr z8, [x2, #16, MUL VL] + WORD $0x85824449 // ldr z9, [x2, #17, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85824848 // ldr z8, [x2, #18, MUL VL] + WORD $0x85824c49 // ldr z9, [x2, #19, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85825048 // ldr z8, [x2, #20, MUL VL] + WORD $0x85825449 // ldr z9, [x2, #21, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85825848 // ldr z8, [x2, #22, MUL VL] + WORD $0x85825c49 // ldr z9, [x2, #23, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85834048 // ldr z8, [x2, #24, MUL VL] + WORD $0x85834449 // ldr z9, [x2, #25, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, 
z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85834848 // ldr z8, [x2, #26, MUL VL] + WORD $0x85834c49 // ldr z9, [x2, #27, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 2 to 7 outputs + WORD $0x858040aa // ldr z10, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85835048 // ldr z8, [x2, #28, MUL VL] + WORD $0x85835449 // ldr z9, [x2, #29, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85835848 // ldr z8, [x2, #30, MUL VL] + WORD $0x85835c49 // ldr z9, [x2, #31, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85844048 // ldr z8, [x2, #32, MUL VL] + WORD $0x85844449 // ldr z9, [x2, #33, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85844848 // ldr z8, [x2, #34, MUL VL] + WORD $0x85844c49 // ldr z9, [x2, #35, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85845048 // ldr z8, [x2, #36, MUL VL] + WORD $0x85845449 // ldr z9, [x2, #37, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85845848 // ldr z8, [x2, #38, MUL VL] + WORD $0x85845c49 // ldr z9, [x2, #39, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85854048 // ldr z8, [x2, #40, MUL VL] + WORD $0x85854449 // ldr z9, [x2, #41, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 3 to 7 outputs + WORD $0x8580410a // ldr z10, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85854848 // ldr z8, [x2, #42, MUL VL] + WORD $0x85854c49 // ldr z9, [x2, #43, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85855048 // ldr z8, [x2, #44, MUL VL] + WORD $0x85855449 // ldr z9, [x2, #45, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85855848 // ldr z8, [x2, #46, MUL VL] + WORD $0x85855c49 // ldr z9, [x2, #47, MUL VL] + WORD $0x052a3108 // tbl 
z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85864048 // ldr z8, [x2, #48, MUL VL] + WORD $0x85864449 // ldr z9, [x2, #49, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85864848 // ldr z8, [x2, #50, MUL VL] + WORD $0x85864c49 // ldr z9, [x2, #51, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85865048 // ldr z8, [x2, #52, MUL VL] + WORD $0x85865449 // ldr z9, [x2, #53, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85865848 // ldr z8, [x2, #54, MUL VL] + WORD $0x85865c49 // ldr z9, [x2, #55, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 4 to 7 outputs + WORD $0x8580412a // ldr z10, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85874048 // ldr z8, [x2, #56, MUL VL] + WORD $0x85874449 // ldr z9, [x2, #57, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85874848 // ldr z8, [x2, #58, MUL VL] + WORD $0x85874c49 // ldr z9, [x2, #59, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85875048 // ldr z8, [x2, #60, MUL VL] + WORD $0x85875449 // ldr z9, [x2, #61, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85875848 // ldr z8, [x2, #62, MUL VL] + WORD $0x85875c49 // ldr z9, [x2, #63, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85884048 // ldr z8, [x2, #64, MUL VL] + WORD $0x85884449 // ldr z9, [x2, #65, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85884848 // ldr z8, [x2, #66, MUL VL] + WORD $0x85884c49 // ldr z9, [x2, #67, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85885048 // ldr z8, [x2, #68, MUL VL] + WORD $0x85885449 // ldr z9, [x2, #69, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x7_store + + // Load and process 32 
bytes from input 5 to 7 outputs + WORD $0x8580414a // ldr z10, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85885848 // ldr z8, [x2, #70, MUL VL] + WORD $0x85885c49 // ldr z9, [x2, #71, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85894048 // ldr z8, [x2, #72, MUL VL] + WORD $0x85894449 // ldr z9, [x2, #73, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85894848 // ldr z8, [x2, #74, MUL VL] + WORD $0x85894c49 // ldr z9, [x2, #75, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85895048 // ldr z8, [x2, #76, MUL VL] + WORD $0x85895449 // ldr z9, [x2, #77, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85895848 // ldr z8, [x2, #78, MUL VL] + WORD $0x85895c49 // ldr z9, [x2, #79, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858a4048 // ldr z8, [x2, #80, MUL VL] + WORD $0x858a4449 // ldr z9, [x2, #81, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858a4848 // ldr z8, [x2, #82, MUL VL] + WORD $0x858a4c49 // ldr z9, [x2, #83, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 6 to 7 outputs + WORD $0x8580416a // ldr z10, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858a5048 // ldr z8, [x2, #84, MUL VL] + WORD $0x858a5449 // ldr z9, [x2, #85, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858a5848 // ldr z8, [x2, #86, MUL VL] + WORD $0x858a5c49 // ldr z9, [x2, #87, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858b4048 // ldr z8, [x2, #88, MUL VL] + WORD $0x858b4449 // ldr z9, [x2, #89, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858b4848 // ldr z8, [x2, #90, MUL VL] + WORD $0x858b4c49 // ldr z9, [x2, #91, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + 
WORD $0x858b5048 // ldr z8, [x2, #92, MUL VL] + WORD $0x858b5449 // ldr z9, [x2, #93, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858b5848 // ldr z8, [x2, #94, MUL VL] + WORD $0x858b5c49 // ldr z9, [x2, #95, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858c4048 // ldr z8, [x2, #96, MUL VL] + WORD $0x858c4449 // ldr z9, [x2, #97, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 7 to 7 outputs + WORD $0x8580418a // ldr z10, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858c4848 // ldr z8, [x2, #98, MUL VL] + WORD $0x858c4c49 // ldr z9, [x2, #99, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858c5048 // ldr z8, [x2, #100, MUL VL] + WORD $0x858c5449 // ldr z9, [x2, #101, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858c5848 // ldr z8, [x2, #102, MUL VL] + WORD $0x858c5c49 // ldr z9, [x2, #103, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858d4048 // ldr z8, [x2, #104, MUL VL] + WORD $0x858d4449 // ldr z9, [x2, #105, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858d4848 // ldr z8, [x2, #106, MUL VL] + WORD $0x858d4c49 // ldr z9, [x2, #107, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858d5048 // ldr z8, [x2, #108, MUL VL] + WORD $0x858d5449 // ldr z9, [x2, #109, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858d5848 // ldr z8, [x2, #110, MUL VL] + WORD $0x858d5c49 // ldr z9, [x2, #111, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 8 to 7 outputs + WORD $0x858041aa // ldr z10, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858e4048 // ldr z8, [x2, #112, MUL VL] + WORD $0x858e4449 // ldr z9, [x2, #113, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // 
tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858e4848 // ldr z8, [x2, #114, MUL VL] + WORD $0x858e4c49 // ldr z9, [x2, #115, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858e5048 // ldr z8, [x2, #116, MUL VL] + WORD $0x858e5449 // ldr z9, [x2, #117, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858e5848 // ldr z8, [x2, #118, MUL VL] + WORD $0x858e5c49 // ldr z9, [x2, #119, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858f4048 // ldr z8, [x2, #120, MUL VL] + WORD $0x858f4449 // ldr z9, [x2, #121, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858f4848 // ldr z8, [x2, #122, MUL VL] + WORD $0x858f4c49 // ldr z9, [x2, #123, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858f5048 // ldr z8, [x2, #124, MUL VL] + WORD $0x858f5449 // ldr z9, [x2, #125, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x7_store + + // Load and process 32 bytes from input 9 to 7 outputs + WORD $0x8580406a // ldr z10, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858f5848 // ldr z8, [x2, #126, MUL VL] + WORD $0x858f5c49 // ldr z9, [x2, #127, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85904048 // ldr z8, [x2, #128, MUL VL] + WORD $0x85904449 // ldr z9, [x2, #129, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85904848 // ldr z8, [x2, #130, MUL VL] + WORD $0x85904c49 // ldr z9, [x2, #131, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85905048 // ldr z8, [x2, #132, MUL VL] + WORD $0x85905449 // ldr z9, [x2, #133, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85905848 // ldr z8, [x2, #134, MUL VL] + WORD $0x85905c49 // ldr z9, [x2, #135, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85914048 // ldr z8, [x2, #136, MUL VL] + WORD $0x85914449 // ldr z9, [x2, #137, MUL VL] + WORD $0x052a3108 
// tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85914848 // ldr z8, [x2, #138, MUL VL] + WORD $0x85914c49 // ldr z9, [x2, #139, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + +mulSve_10x7_store: + // Store 7 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x7_loop + +mulSve_10x7_end: + RET + +// func mulSve_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x7Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c7 // mov z7.d, x6 + WORD $0x052120e7 // dup z7.b, z7.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + WORD $0x8580402a // ldr z10, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804048 // ldr z8, [x2] + WORD $0x85804449 // ldr z9, [x2, #1, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804848 // ldr z8, [x2, #2, MUL VL] + WORD $0x85804c49 // ldr z9, [x2, #3, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + 
WORD $0x85805048 // ldr z8, [x2, #4, MUL VL] + WORD $0x85805449 // ldr z9, [x2, #5, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805848 // ldr z8, [x2, #6, MUL VL] + WORD $0x85805c49 // ldr z9, [x2, #7, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814048 // ldr z8, [x2, #8, MUL VL] + WORD $0x85814449 // ldr z9, [x2, #9, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814848 // ldr z8, [x2, #10, MUL VL] + WORD $0x85814c49 // ldr z9, [x2, #11, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + MOVD 144(R14), R6 + WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85815048 // ldr z8, [x2, #12, MUL VL] + WORD $0x85815449 // ldr z9, [x2, #13, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 1 to 7 outputs + WORD $0x8580408a // ldr z10, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85815848 // ldr z8, [x2, #14, MUL VL] + WORD $0x85815c49 // ldr z9, [x2, #15, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85824048 // ldr z8, [x2, #16, MUL VL] + WORD $0x85824449 // ldr z9, [x2, #17, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85824848 // ldr z8, [x2, #18, MUL VL] + WORD $0x85824c49 // ldr z9, [x2, #19, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85825048 // ldr z8, [x2, #20, MUL VL] + WORD $0x85825449 // ldr z9, [x2, #21, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85825848 // ldr z8, [x2, #22, MUL VL] + WORD $0x85825c49 // ldr z9, [x2, #23, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85834048 // ldr z8, [x2, #24, MUL VL] + WORD $0x85834449 // ldr z9, [x2, #25, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD 
$0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85834848 // ldr z8, [x2, #26, MUL VL] + WORD $0x85834c49 // ldr z9, [x2, #27, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 2 to 7 outputs + WORD $0x858040aa // ldr z10, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85835048 // ldr z8, [x2, #28, MUL VL] + WORD $0x85835449 // ldr z9, [x2, #29, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85835848 // ldr z8, [x2, #30, MUL VL] + WORD $0x85835c49 // ldr z9, [x2, #31, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85844048 // ldr z8, [x2, #32, MUL VL] + WORD $0x85844449 // ldr z9, [x2, #33, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85844848 // ldr z8, [x2, #34, MUL VL] + WORD $0x85844c49 // ldr z9, [x2, #35, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85845048 // ldr z8, [x2, #36, MUL VL] + WORD $0x85845449 // ldr z9, [x2, #37, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85845848 // ldr z8, [x2, #38, MUL VL] + WORD $0x85845c49 // ldr z9, [x2, #39, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85854048 // ldr z8, [x2, #40, MUL VL] + WORD $0x85854449 // ldr z9, [x2, #41, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 3 to 7 outputs + WORD $0x8580410a // ldr z10, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85854848 // ldr z8, [x2, #42, MUL VL] + WORD $0x85854c49 // ldr z9, [x2, #43, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85855048 // ldr z8, [x2, #44, MUL VL] + WORD $0x85855449 // ldr z9, [x2, #45, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85855848 // ldr z8, [x2, #46, MUL VL] + WORD $0x85855c49 // ldr z9, [x2, #47, MUL VL] 
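+ // Note: every input/output pair around this point repeats one sequence:
+ // the 32 input bytes are split into low nibbles (AND with the 0x0f mask
+ // broadcast in z7) and high nibbles (LSR #4, then the same mask), the two
+ // 32-byte lookup tables for the matrix coefficient are fetched from x2
+ // with an LDR at a MUL VL offset, TBL translates both nibble vectors
+ // through those tables, and EOR folds the GF(2^8) partial products into
+ // the per-output accumulator registers (z0..z6 here).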
+ WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85864048 // ldr z8, [x2, #48, MUL VL] + WORD $0x85864449 // ldr z9, [x2, #49, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85864848 // ldr z8, [x2, #50, MUL VL] + WORD $0x85864c49 // ldr z9, [x2, #51, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85865048 // ldr z8, [x2, #52, MUL VL] + WORD $0x85865449 // ldr z9, [x2, #53, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85865848 // ldr z8, [x2, #54, MUL VL] + WORD $0x85865c49 // ldr z9, [x2, #55, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 4 to 7 outputs + WORD $0x8580412a // ldr z10, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85874048 // ldr z8, [x2, #56, MUL VL] + WORD $0x85874449 // ldr z9, [x2, #57, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85874848 // ldr z8, [x2, #58, MUL VL] + WORD $0x85874c49 // ldr z9, [x2, #59, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85875048 // ldr z8, [x2, #60, MUL VL] + WORD $0x85875449 // ldr z9, [x2, #61, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85875848 // ldr z8, [x2, #62, MUL VL] + WORD $0x85875c49 // ldr z9, [x2, #63, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85884048 // ldr z8, [x2, #64, MUL VL] + WORD $0x85884449 // ldr z9, [x2, #65, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85884848 // ldr z8, [x2, #66, MUL VL] + WORD $0x85884c49 // ldr z9, [x2, #67, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85885048 // ldr z8, [x2, #68, MUL VL] + WORD $0x85885449 // ldr z9, [x2, #69, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $5, R16 + BEQ 
mulSve_10x7Xor_store + + // Load and process 32 bytes from input 5 to 7 outputs + WORD $0x8580414a // ldr z10, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x85885848 // ldr z8, [x2, #70, MUL VL] + WORD $0x85885c49 // ldr z9, [x2, #71, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85894048 // ldr z8, [x2, #72, MUL VL] + WORD $0x85894449 // ldr z9, [x2, #73, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85894848 // ldr z8, [x2, #74, MUL VL] + WORD $0x85894c49 // ldr z9, [x2, #75, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85895048 // ldr z8, [x2, #76, MUL VL] + WORD $0x85895449 // ldr z9, [x2, #77, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85895848 // ldr z8, [x2, #78, MUL VL] + WORD $0x85895c49 // ldr z9, [x2, #79, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858a4048 // ldr z8, [x2, #80, MUL VL] + WORD $0x858a4449 // ldr z9, [x2, #81, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858a4848 // ldr z8, [x2, #82, MUL VL] + WORD $0x858a4c49 // ldr z9, [x2, #83, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 6 to 7 outputs + WORD $0x8580416a // ldr z10, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858a5048 // ldr z8, [x2, #84, MUL VL] + WORD $0x858a5449 // ldr z9, [x2, #85, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858a5848 // ldr z8, [x2, #86, MUL VL] + WORD $0x858a5c49 // ldr z9, [x2, #87, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858b4048 // ldr z8, [x2, #88, MUL VL] + WORD $0x858b4449 // ldr z9, [x2, #89, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858b4848 // ldr z8, [x2, #90, MUL VL] + WORD $0x858b4c49 // ldr z9, [x2, #91, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, 
z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858b5048 // ldr z8, [x2, #92, MUL VL] + WORD $0x858b5449 // ldr z9, [x2, #93, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858b5848 // ldr z8, [x2, #94, MUL VL] + WORD $0x858b5c49 // ldr z9, [x2, #95, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858c4048 // ldr z8, [x2, #96, MUL VL] + WORD $0x858c4449 // ldr z9, [x2, #97, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 7 to 7 outputs + WORD $0x8580418a // ldr z10, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858c4848 // ldr z8, [x2, #98, MUL VL] + WORD $0x858c4c49 // ldr z9, [x2, #99, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858c5048 // ldr z8, [x2, #100, MUL VL] + WORD $0x858c5449 // ldr z9, [x2, #101, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858c5848 // ldr z8, [x2, #102, MUL VL] + WORD $0x858c5c49 // ldr z9, [x2, #103, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858d4048 // ldr z8, [x2, #104, MUL VL] + WORD $0x858d4449 // ldr z9, [x2, #105, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858d4848 // ldr z8, [x2, #106, MUL VL] + WORD $0x858d4c49 // ldr z9, [x2, #107, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858d5048 // ldr z8, [x2, #108, MUL VL] + WORD $0x858d5449 // ldr z9, [x2, #109, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858d5848 // ldr z8, [x2, #110, MUL VL] + WORD $0x858d5c49 // ldr z9, [x2, #111, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 8 to 7 outputs + WORD $0x858041aa // ldr z10, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858e4048 // ldr z8, [x2, #112, MUL VL] + WORD $0x858e4449 // ldr z9, [x2, #113, MUL VL] + WORD 
$0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x858e4848 // ldr z8, [x2, #114, MUL VL] + WORD $0x858e4c49 // ldr z9, [x2, #115, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x858e5048 // ldr z8, [x2, #116, MUL VL] + WORD $0x858e5449 // ldr z9, [x2, #117, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x858e5848 // ldr z8, [x2, #118, MUL VL] + WORD $0x858e5c49 // ldr z9, [x2, #119, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x858f4048 // ldr z8, [x2, #120, MUL VL] + WORD $0x858f4449 // ldr z9, [x2, #121, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x858f4848 // ldr z8, [x2, #122, MUL VL] + WORD $0x858f4c49 // ldr z9, [x2, #123, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x858f5048 // ldr z8, [x2, #124, MUL VL] + WORD $0x858f5449 // ldr z9, [x2, #125, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x7Xor_store + + // Load and process 32 bytes from input 9 to 7 outputs + WORD $0x8580406a // ldr z10, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc954b // lsr z11.d, z10.d, #4 + WORD $0x0427314a // and z10.d, z10.d, z7.d + WORD $0x0427316b // and z11.d, z11.d, z7.d + WORD $0x858f5848 // ldr z8, [x2, #126, MUL VL] + WORD $0x858f5c49 // ldr z9, [x2, #127, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83000 // eor z0.d, z0.d, z8.d + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x85904048 // ldr z8, [x2, #128, MUL VL] + WORD $0x85904449 // ldr z9, [x2, #129, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83021 // eor z1.d, z1.d, z8.d + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x85904848 // ldr z8, [x2, #130, MUL VL] + WORD $0x85904c49 // ldr z9, [x2, #131, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83042 // eor z2.d, z2.d, z8.d + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x85905048 // ldr z8, [x2, #132, MUL VL] + WORD $0x85905449 // ldr z9, [x2, #133, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83063 // eor z3.d, z3.d, z8.d + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x85905848 // ldr z8, [x2, #134, MUL VL] + WORD $0x85905c49 // ldr z9, [x2, #135, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a83084 // eor z4.d, z4.d, z8.d + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x85914048 // ldr z8, [x2, #136, MUL VL] + WORD 
$0x85914449 // ldr z9, [x2, #137, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830a5 // eor z5.d, z5.d, z8.d + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x85914848 // ldr z8, [x2, #138, MUL VL] + WORD $0x85914c49 // ldr z9, [x2, #139, MUL VL] + WORD $0x052a3108 // tbl z8.b, z8.b, z10.b + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x04a830c6 // eor z6.d, z6.d, z8.d + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + +mulSve_10x7Xor_store: + // Store 7 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x7Xor_loop + +mulSve_10x7Xor_end: + RET + +// func mulSve_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x8_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c8 // mov z8.d, x6 + WORD $0x05212108 // dup z8.b, z8.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + WORD $0x8580402b // ldr z11, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85804049 // ldr z9, [x2] + WORD $0x8580444a // ldr z10, [x2, #1, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93140 // eor z0.d, z10.d, z9.d + WORD $0x85804849 // ldr z9, [x2, #2, MUL VL] + WORD $0x85804c4a // ldr z10, [x2, #3, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93141 // eor z1.d, z10.d, z9.d + WORD $0x85805049 // ldr z9, [x2, #4, MUL VL] + WORD $0x8580544a // ldr z10, [x2, #5, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93142 // eor z2.d, z10.d, z9.d + WORD $0x85805849 // ldr z9, 
[x2, #6, MUL VL] + WORD $0x85805c4a // ldr z10, [x2, #7, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93143 // eor z3.d, z10.d, z9.d + WORD $0x85814049 // ldr z9, [x2, #8, MUL VL] + WORD $0x8581444a // ldr z10, [x2, #9, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93144 // eor z4.d, z10.d, z9.d + WORD $0x85814849 // ldr z9, [x2, #10, MUL VL] + WORD $0x85814c4a // ldr z10, [x2, #11, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93145 // eor z5.d, z10.d, z9.d + WORD $0x85815049 // ldr z9, [x2, #12, MUL VL] + WORD $0x8581544a // ldr z10, [x2, #13, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93146 // eor z6.d, z10.d, z9.d + WORD $0x85815849 // ldr z9, [x2, #14, MUL VL] + WORD $0x85815c4a // ldr z10, [x2, #15, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93147 // eor z7.d, z10.d, z9.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 1 to 8 outputs + WORD $0x8580408b // ldr z11, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85824049 // ldr z9, [x2, #16, MUL VL] + WORD $0x8582444a // ldr z10, [x2, #17, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85824849 // ldr z9, [x2, #18, MUL VL] + WORD $0x85824c4a // ldr z10, [x2, #19, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825049 // ldr z9, [x2, #20, MUL VL] + WORD $0x8582544a // ldr z10, [x2, #21, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85825849 // ldr z9, [x2, #22, MUL VL] + WORD $0x85825c4a // ldr z10, [x2, #23, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85834049 // ldr z9, [x2, #24, MUL VL] + WORD $0x8583444a // ldr z10, [x2, #25, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85834849 // ldr z9, [x2, #26, MUL VL] + WORD $0x85834c4a // ldr z10, [x2, #27, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85835049 // ldr z9, [x2, #28, MUL VL] + WORD $0x8583544a // ldr z10, [x2, #29, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85835849 // ldr z9, [x2, #30, MUL VL] + WORD $0x85835c4a // ldr z10, [x2, #31, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, 
z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 2 to 8 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85844049 // ldr z9, [x2, #32, MUL VL] + WORD $0x8584444a // ldr z10, [x2, #33, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85844849 // ldr z9, [x2, #34, MUL VL] + WORD $0x85844c4a // ldr z10, [x2, #35, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845049 // ldr z9, [x2, #36, MUL VL] + WORD $0x8584544a // ldr z10, [x2, #37, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85845849 // ldr z9, [x2, #38, MUL VL] + WORD $0x85845c4a // ldr z10, [x2, #39, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854049 // ldr z9, [x2, #40, MUL VL] + WORD $0x8585444a // ldr z10, [x2, #41, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85854849 // ldr z9, [x2, #42, MUL VL] + WORD $0x85854c4a // ldr z10, [x2, #43, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85855049 // ldr z9, [x2, #44, MUL VL] + WORD $0x8585544a // ldr z10, [x2, #45, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85855849 // ldr z9, [x2, #46, MUL VL] + WORD $0x85855c4a // ldr z10, [x2, #47, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 3 to 8 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85864049 // ldr z9, [x2, #48, MUL VL] + WORD $0x8586444a // ldr z10, [x2, #49, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85864849 // ldr z9, [x2, #50, MUL VL] + WORD $0x85864c4a // ldr z10, [x2, #51, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85865049 // ldr z9, [x2, #52, 
MUL VL] + WORD $0x8586544a // ldr z10, [x2, #53, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85865849 // ldr z9, [x2, #54, MUL VL] + WORD $0x85865c4a // ldr z10, [x2, #55, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85874049 // ldr z9, [x2, #56, MUL VL] + WORD $0x8587444a // ldr z10, [x2, #57, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85874849 // ldr z9, [x2, #58, MUL VL] + WORD $0x85874c4a // ldr z10, [x2, #59, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85875049 // ldr z9, [x2, #60, MUL VL] + WORD $0x8587544a // ldr z10, [x2, #61, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85875849 // ldr z9, [x2, #62, MUL VL] + WORD $0x85875c4a // ldr z10, [x2, #63, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 4 to 8 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85884049 // ldr z9, [x2, #64, MUL VL] + WORD $0x8588444a // ldr z10, [x2, #65, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85884849 // ldr z9, [x2, #66, MUL VL] + WORD $0x85884c4a // ldr z10, [x2, #67, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85885049 // ldr z9, [x2, #68, MUL VL] + WORD $0x8588544a // ldr z10, [x2, #69, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85885849 // ldr z9, [x2, #70, MUL VL] + WORD $0x85885c4a // ldr z10, [x2, #71, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85894049 // ldr z9, [x2, #72, MUL VL] + WORD $0x8589444a // ldr z10, [x2, #73, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85894849 // ldr z9, [x2, #74, MUL VL] + WORD $0x85894c4a // ldr z10, [x2, #75, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD 
$0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85895049 // ldr z9, [x2, #76, MUL VL] + WORD $0x8589544a // ldr z10, [x2, #77, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85895849 // ldr z9, [x2, #78, MUL VL] + WORD $0x85895c4a // ldr z10, [x2, #79, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 5 to 8 outputs + WORD $0x8580414b // ldr z11, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858a4049 // ldr z9, [x2, #80, MUL VL] + WORD $0x858a444a // ldr z10, [x2, #81, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858a4849 // ldr z9, [x2, #82, MUL VL] + WORD $0x858a4c4a // ldr z10, [x2, #83, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858a5049 // ldr z9, [x2, #84, MUL VL] + WORD $0x858a544a // ldr z10, [x2, #85, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858a5849 // ldr z9, [x2, #86, MUL VL] + WORD $0x858a5c4a // ldr z10, [x2, #87, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858b4049 // ldr z9, [x2, #88, MUL VL] + WORD $0x858b444a // ldr z10, [x2, #89, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858b4849 // ldr z9, [x2, #90, MUL VL] + WORD $0x858b4c4a // ldr z10, [x2, #91, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858b5049 // ldr z9, [x2, #92, MUL VL] + WORD $0x858b544a // ldr z10, [x2, #93, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858b5849 // ldr z9, [x2, #94, MUL VL] + WORD $0x858b5c4a // ldr z10, [x2, #95, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 6 to 8 outputs + WORD $0x8580416b // ldr z11, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858c4049 // ldr z9, [x2, #96, MUL VL] + WORD $0x858c444a // ldr z10, [x2, #97, MUL VL] 
+ WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858c4849 // ldr z9, [x2, #98, MUL VL] + WORD $0x858c4c4a // ldr z10, [x2, #99, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858c5049 // ldr z9, [x2, #100, MUL VL] + WORD $0x858c544a // ldr z10, [x2, #101, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858c5849 // ldr z9, [x2, #102, MUL VL] + WORD $0x858c5c4a // ldr z10, [x2, #103, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858d4049 // ldr z9, [x2, #104, MUL VL] + WORD $0x858d444a // ldr z10, [x2, #105, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858d4849 // ldr z9, [x2, #106, MUL VL] + WORD $0x858d4c4a // ldr z10, [x2, #107, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858d5049 // ldr z9, [x2, #108, MUL VL] + WORD $0x858d544a // ldr z10, [x2, #109, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858d5849 // ldr z9, [x2, #110, MUL VL] + WORD $0x858d5c4a // ldr z10, [x2, #111, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 7 to 8 outputs + WORD $0x8580418b // ldr z11, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858e4049 // ldr z9, [x2, #112, MUL VL] + WORD $0x858e444a // ldr z10, [x2, #113, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858e4849 // ldr z9, [x2, #114, MUL VL] + WORD $0x858e4c4a // ldr z10, [x2, #115, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858e5049 // ldr z9, [x2, #116, MUL VL] + WORD $0x858e544a // ldr z10, [x2, #117, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858e5849 // ldr z9, [x2, #118, MUL VL] + WORD $0x858e5c4a // ldr z10, [x2, #119, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + 
WORD $0x858f4049 // ldr z9, [x2, #120, MUL VL] + WORD $0x858f444a // ldr z10, [x2, #121, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858f4849 // ldr z9, [x2, #122, MUL VL] + WORD $0x858f4c4a // ldr z10, [x2, #123, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858f5049 // ldr z9, [x2, #124, MUL VL] + WORD $0x858f544a // ldr z10, [x2, #125, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858f5849 // ldr z9, [x2, #126, MUL VL] + WORD $0x858f5c4a // ldr z10, [x2, #127, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 8 to 8 outputs + WORD $0x858041ab // ldr z11, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85904049 // ldr z9, [x2, #128, MUL VL] + WORD $0x8590444a // ldr z10, [x2, #129, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85904849 // ldr z9, [x2, #130, MUL VL] + WORD $0x85904c4a // ldr z10, [x2, #131, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85905049 // ldr z9, [x2, #132, MUL VL] + WORD $0x8590544a // ldr z10, [x2, #133, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85905849 // ldr z9, [x2, #134, MUL VL] + WORD $0x85905c4a // ldr z10, [x2, #135, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85914049 // ldr z9, [x2, #136, MUL VL] + WORD $0x8591444a // ldr z10, [x2, #137, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85914849 // ldr z9, [x2, #138, MUL VL] + WORD $0x85914c4a // ldr z10, [x2, #139, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85915049 // ldr z9, [x2, #140, MUL VL] + WORD $0x8591544a // ldr z10, [x2, #141, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85915849 // ldr z9, [x2, #142, MUL VL] + WORD $0x85915c4a // ldr z10, [x2, #143, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, 
z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x8_store + + // Load and process 32 bytes from input 9 to 8 outputs + WORD $0x8580406b // ldr z11, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85924049 // ldr z9, [x2, #144, MUL VL] + WORD $0x8592444a // ldr z10, [x2, #145, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85924849 // ldr z9, [x2, #146, MUL VL] + WORD $0x85924c4a // ldr z10, [x2, #147, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85925049 // ldr z9, [x2, #148, MUL VL] + WORD $0x8592544a // ldr z10, [x2, #149, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85925849 // ldr z9, [x2, #150, MUL VL] + WORD $0x85925c4a // ldr z10, [x2, #151, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85934049 // ldr z9, [x2, #152, MUL VL] + WORD $0x8593444a // ldr z10, [x2, #153, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85934849 // ldr z9, [x2, #154, MUL VL] + WORD $0x85934c4a // ldr z10, [x2, #155, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85935049 // ldr z9, [x2, #156, MUL VL] + WORD $0x8593544a // ldr z10, [x2, #157, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85935849 // ldr z9, [x2, #158, MUL VL] + WORD $0x85935c4a // ldr z10, [x2, #159, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + +mulSve_10x8_store: + // Store 8 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x8_loop + +mulSve_10x8_end: + RET + +// func mulSve_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) 
+// Requires: SVE +TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x8Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c8 // mov z8.d, x6 + WORD $0x05212108 // dup z8.b, z8.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + WORD $0x8580402b // ldr z11, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804049 // ldr z9, [x2] + WORD $0x8580444a // ldr z10, [x2, #1, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85804849 // ldr z9, [x2, #2, MUL VL] + WORD $0x85804c4a // ldr z10, [x2, #3, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805049 // ldr z9, [x2, #4, MUL VL] + WORD $0x8580544a // ldr z10, [x2, #5, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85805849 // ldr z9, [x2, #6, MUL VL] + WORD $0x85805c4a // ldr z10, [x2, #7, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814049 // ldr z9, [x2, #8, MUL VL] + WORD $0x8581444a // ldr z10, [x2, #9, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85814849 // ldr z9, [x2, #10, MUL VL] + WORD $0x85814c4a // ldr z10, [x2, #11, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD 
$0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + MOVD 144(R14), R6 + WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85815049 // ldr z9, [x2, #12, MUL VL] + WORD $0x8581544a // ldr z10, [x2, #13, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + MOVD 168(R14), R6 + WORD $0xa5ef40c7 // ld1d { z7.d }, p0/z, [x6, x15, lsl #3] + WORD $0x85815849 // ldr z9, [x2, #14, MUL VL] + WORD $0x85815c4a // ldr z10, [x2, #15, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 1 to 8 outputs + WORD $0x8580408b // ldr z11, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85824049 // ldr z9, [x2, #16, MUL VL] + WORD $0x8582444a // ldr z10, [x2, #17, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85824849 // ldr z9, [x2, #18, MUL VL] + WORD $0x85824c4a // ldr z10, [x2, #19, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85825049 // ldr z9, [x2, #20, MUL VL] + WORD $0x8582544a // ldr z10, [x2, #21, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85825849 // ldr z9, [x2, #22, MUL VL] + WORD $0x85825c4a // ldr z10, [x2, #23, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85834049 // ldr z9, [x2, #24, MUL VL] + WORD $0x8583444a // ldr z10, [x2, #25, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85834849 // ldr z9, [x2, #26, MUL VL] + WORD $0x85834c4a // ldr z10, [x2, #27, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85835049 // ldr z9, [x2, #28, MUL VL] + WORD $0x8583544a // ldr z10, [x2, #29, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85835849 // ldr z9, [x2, #30, MUL VL] + WORD $0x85835c4a // ldr z10, [x2, #31, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 2 to 8 outputs + WORD $0x858040ab // ldr z11, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc956c // lsr 
z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85844049 // ldr z9, [x2, #32, MUL VL] + WORD $0x8584444a // ldr z10, [x2, #33, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85844849 // ldr z9, [x2, #34, MUL VL] + WORD $0x85844c4a // ldr z10, [x2, #35, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85845049 // ldr z9, [x2, #36, MUL VL] + WORD $0x8584544a // ldr z10, [x2, #37, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85845849 // ldr z9, [x2, #38, MUL VL] + WORD $0x85845c4a // ldr z10, [x2, #39, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85854049 // ldr z9, [x2, #40, MUL VL] + WORD $0x8585444a // ldr z10, [x2, #41, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85854849 // ldr z9, [x2, #42, MUL VL] + WORD $0x85854c4a // ldr z10, [x2, #43, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85855049 // ldr z9, [x2, #44, MUL VL] + WORD $0x8585544a // ldr z10, [x2, #45, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85855849 // ldr z9, [x2, #46, MUL VL] + WORD $0x85855c4a // ldr z10, [x2, #47, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 3 to 8 outputs + WORD $0x8580410b // ldr z11, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85864049 // ldr z9, [x2, #48, MUL VL] + WORD $0x8586444a // ldr z10, [x2, #49, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85864849 // ldr z9, [x2, #50, MUL VL] + WORD $0x85864c4a // ldr z10, [x2, #51, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85865049 // ldr z9, [x2, #52, MUL VL] + WORD $0x8586544a // ldr z10, [x2, #53, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85865849 // ldr z9, [x2, #54, MUL VL] + WORD $0x85865c4a // ldr z10, [x2, #55, MUL 
VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85874049 // ldr z9, [x2, #56, MUL VL] + WORD $0x8587444a // ldr z10, [x2, #57, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85874849 // ldr z9, [x2, #58, MUL VL] + WORD $0x85874c4a // ldr z10, [x2, #59, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85875049 // ldr z9, [x2, #60, MUL VL] + WORD $0x8587544a // ldr z10, [x2, #61, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85875849 // ldr z9, [x2, #62, MUL VL] + WORD $0x85875c4a // ldr z10, [x2, #63, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 4 to 8 outputs + WORD $0x8580412b // ldr z11, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85884049 // ldr z9, [x2, #64, MUL VL] + WORD $0x8588444a // ldr z10, [x2, #65, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85884849 // ldr z9, [x2, #66, MUL VL] + WORD $0x85884c4a // ldr z10, [x2, #67, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85885049 // ldr z9, [x2, #68, MUL VL] + WORD $0x8588544a // ldr z10, [x2, #69, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85885849 // ldr z9, [x2, #70, MUL VL] + WORD $0x85885c4a // ldr z10, [x2, #71, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85894049 // ldr z9, [x2, #72, MUL VL] + WORD $0x8589444a // ldr z10, [x2, #73, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85894849 // ldr z9, [x2, #74, MUL VL] + WORD $0x85894c4a // ldr z10, [x2, #75, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85895049 // ldr z9, [x2, #76, MUL VL] + WORD $0x8589544a // ldr z10, [x2, #77, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85895849 
// ldr z9, [x2, #78, MUL VL] + WORD $0x85895c4a // ldr z10, [x2, #79, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 5 to 8 outputs + WORD $0x8580414b // ldr z11, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858a4049 // ldr z9, [x2, #80, MUL VL] + WORD $0x858a444a // ldr z10, [x2, #81, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858a4849 // ldr z9, [x2, #82, MUL VL] + WORD $0x858a4c4a // ldr z10, [x2, #83, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858a5049 // ldr z9, [x2, #84, MUL VL] + WORD $0x858a544a // ldr z10, [x2, #85, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858a5849 // ldr z9, [x2, #86, MUL VL] + WORD $0x858a5c4a // ldr z10, [x2, #87, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858b4049 // ldr z9, [x2, #88, MUL VL] + WORD $0x858b444a // ldr z10, [x2, #89, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858b4849 // ldr z9, [x2, #90, MUL VL] + WORD $0x858b4c4a // ldr z10, [x2, #91, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858b5049 // ldr z9, [x2, #92, MUL VL] + WORD $0x858b544a // ldr z10, [x2, #93, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858b5849 // ldr z9, [x2, #94, MUL VL] + WORD $0x858b5c4a // ldr z10, [x2, #95, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 6 to 8 outputs + WORD $0x8580416b // ldr z11, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858c4049 // ldr z9, [x2, #96, MUL VL] + WORD $0x858c444a // ldr z10, [x2, #97, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858c4849 // ldr z9, [x2, #98, MUL VL] + WORD $0x858c4c4a // ldr z10, [x2, #99, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD 
$0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858c5049 // ldr z9, [x2, #100, MUL VL] + WORD $0x858c544a // ldr z10, [x2, #101, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858c5849 // ldr z9, [x2, #102, MUL VL] + WORD $0x858c5c4a // ldr z10, [x2, #103, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858d4049 // ldr z9, [x2, #104, MUL VL] + WORD $0x858d444a // ldr z10, [x2, #105, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858d4849 // ldr z9, [x2, #106, MUL VL] + WORD $0x858d4c4a // ldr z10, [x2, #107, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858d5049 // ldr z9, [x2, #108, MUL VL] + WORD $0x858d544a // ldr z10, [x2, #109, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858d5849 // ldr z9, [x2, #110, MUL VL] + WORD $0x858d5c4a // ldr z10, [x2, #111, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 7 to 8 outputs + WORD $0x8580418b // ldr z11, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x858e4049 // ldr z9, [x2, #112, MUL VL] + WORD $0x858e444a // ldr z10, [x2, #113, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x858e4849 // ldr z9, [x2, #114, MUL VL] + WORD $0x858e4c4a // ldr z10, [x2, #115, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x858e5049 // ldr z9, [x2, #116, MUL VL] + WORD $0x858e544a // ldr z10, [x2, #117, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x858e5849 // ldr z9, [x2, #118, MUL VL] + WORD $0x858e5c4a // ldr z10, [x2, #119, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x858f4049 // ldr z9, [x2, #120, MUL VL] + WORD $0x858f444a // ldr z10, [x2, #121, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x858f4849 // ldr z9, [x2, #122, MUL VL] 
+ WORD $0x858f4c4a // ldr z10, [x2, #123, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x858f5049 // ldr z9, [x2, #124, MUL VL] + WORD $0x858f544a // ldr z10, [x2, #125, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x858f5849 // ldr z9, [x2, #126, MUL VL] + WORD $0x858f5c4a // ldr z10, [x2, #127, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 8 to 8 outputs + WORD $0x858041ab // ldr z11, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85904049 // ldr z9, [x2, #128, MUL VL] + WORD $0x8590444a // ldr z10, [x2, #129, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85904849 // ldr z9, [x2, #130, MUL VL] + WORD $0x85904c4a // ldr z10, [x2, #131, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85905049 // ldr z9, [x2, #132, MUL VL] + WORD $0x8590544a // ldr z10, [x2, #133, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85905849 // ldr z9, [x2, #134, MUL VL] + WORD $0x85905c4a // ldr z10, [x2, #135, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85914049 // ldr z9, [x2, #136, MUL VL] + WORD $0x8591444a // ldr z10, [x2, #137, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85914849 // ldr z9, [x2, #138, MUL VL] + WORD $0x85914c4a // ldr z10, [x2, #139, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85915049 // ldr z9, [x2, #140, MUL VL] + WORD $0x8591544a // ldr z10, [x2, #141, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85915849 // ldr z9, [x2, #142, MUL VL] + WORD $0x85915c4a // ldr z10, [x2, #143, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x8Xor_store + + // Load and process 32 bytes from input 9 to 8 outputs + WORD $0x8580406b // ldr z11, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD 
$0x04fc956c // lsr z12.d, z11.d, #4 + WORD $0x0428316b // and z11.d, z11.d, z8.d + WORD $0x0428318c // and z12.d, z12.d, z8.d + WORD $0x85924049 // ldr z9, [x2, #144, MUL VL] + WORD $0x8592444a // ldr z10, [x2, #145, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93000 // eor z0.d, z0.d, z9.d + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x85924849 // ldr z9, [x2, #146, MUL VL] + WORD $0x85924c4a // ldr z10, [x2, #147, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93021 // eor z1.d, z1.d, z9.d + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x85925049 // ldr z9, [x2, #148, MUL VL] + WORD $0x8592544a // ldr z10, [x2, #149, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93042 // eor z2.d, z2.d, z9.d + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x85925849 // ldr z9, [x2, #150, MUL VL] + WORD $0x85925c4a // ldr z10, [x2, #151, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93063 // eor z3.d, z3.d, z9.d + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x85934049 // ldr z9, [x2, #152, MUL VL] + WORD $0x8593444a // ldr z10, [x2, #153, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a93084 // eor z4.d, z4.d, z9.d + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x85934849 // ldr z9, [x2, #154, MUL VL] + WORD $0x85934c4a // ldr z10, [x2, #155, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930a5 // eor z5.d, z5.d, z9.d + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x85935049 // ldr z9, [x2, #156, MUL VL] + WORD $0x8593544a // ldr z10, [x2, #157, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930c6 // eor z6.d, z6.d, z9.d + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x85935849 // ldr z9, [x2, #158, MUL VL] + WORD $0x85935c4a // ldr z10, [x2, #159, MUL VL] + WORD $0x052b3129 // tbl z9.b, z9.b, z11.b + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x04a930e7 // eor z7.d, z7.d, z9.d + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + +mulSve_10x8Xor_store: + // Store 8 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x8Xor_loop + +mulSve_10x8Xor_end: + RET + +// func mulSve_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f 
// tst x0, x0 + BEQ mulSve_10x9_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c9 // mov z9.d, x6 + WORD $0x05212129 // dup z9.b, z9.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + WORD $0x8580402c // ldr z12, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8580404a // ldr z10, [x2] + WORD $0x8580444b // ldr z11, [x2, #1, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3160 // eor z0.d, z11.d, z10.d + WORD $0x8580484a // ldr z10, [x2, #2, MUL VL] + WORD $0x85804c4b // ldr z11, [x2, #3, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3161 // eor z1.d, z11.d, z10.d + WORD $0x8580504a // ldr z10, [x2, #4, MUL VL] + WORD $0x8580544b // ldr z11, [x2, #5, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3162 // eor z2.d, z11.d, z10.d + WORD $0x8580584a // ldr z10, [x2, #6, MUL VL] + WORD $0x85805c4b // ldr z11, [x2, #7, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3163 // eor z3.d, z11.d, z10.d + WORD $0x8581404a // ldr z10, [x2, #8, MUL VL] + WORD $0x8581444b // ldr z11, [x2, #9, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3164 // eor z4.d, z11.d, z10.d + WORD $0x8581484a // ldr z10, [x2, #10, MUL VL] + WORD $0x85814c4b // ldr z11, [x2, #11, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3165 // eor z5.d, z11.d, z10.d + WORD $0x8581504a // ldr z10, [x2, #12, MUL VL] + WORD $0x8581544b // ldr z11, [x2, #13, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3166 // eor z6.d, z11.d, z10.d + WORD $0x8581584a // ldr z10, [x2, #14, MUL VL] + WORD $0x85815c4b // ldr z11, [x2, #15, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3167 // eor z7.d, z11.d, z10.d + WORD $0x8582404a // ldr z10, [x2, #16, MUL VL] + WORD $0x8582444b // ldr z11, [x2, #17, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3168 // eor z8.d, z11.d, z10.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 1 to 9 outputs + WORD $0x8580408c // ldr z12, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, 
#4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8582484a // ldr z10, [x2, #18, MUL VL] + WORD $0x85824c4b // ldr z11, [x2, #19, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8582504a // ldr z10, [x2, #20, MUL VL] + WORD $0x8582544b // ldr z11, [x2, #21, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8582584a // ldr z10, [x2, #22, MUL VL] + WORD $0x85825c4b // ldr z11, [x2, #23, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8583404a // ldr z10, [x2, #24, MUL VL] + WORD $0x8583444b // ldr z11, [x2, #25, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8583484a // ldr z10, [x2, #26, MUL VL] + WORD $0x85834c4b // ldr z11, [x2, #27, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8583504a // ldr z10, [x2, #28, MUL VL] + WORD $0x8583544b // ldr z11, [x2, #29, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8583584a // ldr z10, [x2, #30, MUL VL] + WORD $0x85835c4b // ldr z11, [x2, #31, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8584404a // ldr z10, [x2, #32, MUL VL] + WORD $0x8584444b // ldr z11, [x2, #33, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8584484a // ldr z10, [x2, #34, MUL VL] + WORD $0x85844c4b // ldr z11, [x2, #35, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 2 to 9 outputs + WORD $0x858040ac // ldr z12, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8584504a // ldr z10, [x2, #36, MUL VL] + WORD $0x8584544b // ldr z11, [x2, #37, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8584584a // ldr z10, [x2, #38, MUL VL] + WORD $0x85845c4b // ldr z11, [x2, #39, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8585404a // ldr z10, [x2, #40, MUL VL] + WORD 
$0x8585444b // ldr z11, [x2, #41, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8585484a // ldr z10, [x2, #42, MUL VL] + WORD $0x85854c4b // ldr z11, [x2, #43, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8585504a // ldr z10, [x2, #44, MUL VL] + WORD $0x8585544b // ldr z11, [x2, #45, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8585584a // ldr z10, [x2, #46, MUL VL] + WORD $0x85855c4b // ldr z11, [x2, #47, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8586404a // ldr z10, [x2, #48, MUL VL] + WORD $0x8586444b // ldr z11, [x2, #49, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8586484a // ldr z10, [x2, #50, MUL VL] + WORD $0x85864c4b // ldr z11, [x2, #51, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8586504a // ldr z10, [x2, #52, MUL VL] + WORD $0x8586544b // ldr z11, [x2, #53, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 3 to 9 outputs + WORD $0x8580410c // ldr z12, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8586584a // ldr z10, [x2, #54, MUL VL] + WORD $0x85865c4b // ldr z11, [x2, #55, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8587404a // ldr z10, [x2, #56, MUL VL] + WORD $0x8587444b // ldr z11, [x2, #57, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8587484a // ldr z10, [x2, #58, MUL VL] + WORD $0x85874c4b // ldr z11, [x2, #59, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8587504a // ldr z10, [x2, #60, MUL VL] + WORD $0x8587544b // ldr z11, [x2, #61, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8587584a // ldr z10, [x2, #62, MUL VL] + WORD $0x85875c4b // ldr z11, [x2, #63, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor 
z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8588404a // ldr z10, [x2, #64, MUL VL] + WORD $0x8588444b // ldr z11, [x2, #65, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8588484a // ldr z10, [x2, #66, MUL VL] + WORD $0x85884c4b // ldr z11, [x2, #67, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8588504a // ldr z10, [x2, #68, MUL VL] + WORD $0x8588544b // ldr z11, [x2, #69, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8588584a // ldr z10, [x2, #70, MUL VL] + WORD $0x85885c4b // ldr z11, [x2, #71, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 4 to 9 outputs + WORD $0x8580412c // ldr z12, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8589404a // ldr z10, [x2, #72, MUL VL] + WORD $0x8589444b // ldr z11, [x2, #73, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8589484a // ldr z10, [x2, #74, MUL VL] + WORD $0x85894c4b // ldr z11, [x2, #75, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8589504a // ldr z10, [x2, #76, MUL VL] + WORD $0x8589544b // ldr z11, [x2, #77, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8589584a // ldr z10, [x2, #78, MUL VL] + WORD $0x85895c4b // ldr z11, [x2, #79, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858a404a // ldr z10, [x2, #80, MUL VL] + WORD $0x858a444b // ldr z11, [x2, #81, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858a484a // ldr z10, [x2, #82, MUL VL] + WORD $0x858a4c4b // ldr z11, [x2, #83, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858a504a // ldr z10, [x2, #84, MUL VL] + WORD $0x858a544b // ldr z11, [x2, #85, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858a584a // ldr z10, [x2, #86, MUL VL] + WORD $0x858a5c4b // ldr z11, [x2, #87, MUL VL] 
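+ // Editorial note: the two ldr's above fetch the low- and high-nibble lookup tables for this output from the matrix pointer (x2); the tbl/eor steps that follow apply them to the nibbles in z12/z13 and fold the products into the accumulator.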
+ WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858b404a // ldr z10, [x2, #88, MUL VL] + WORD $0x858b444b // ldr z11, [x2, #89, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 5 to 9 outputs + WORD $0x8580414c // ldr z12, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858b484a // ldr z10, [x2, #90, MUL VL] + WORD $0x858b4c4b // ldr z11, [x2, #91, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x858b504a // ldr z10, [x2, #92, MUL VL] + WORD $0x858b544b // ldr z11, [x2, #93, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x858b584a // ldr z10, [x2, #94, MUL VL] + WORD $0x858b5c4b // ldr z11, [x2, #95, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x858c404a // ldr z10, [x2, #96, MUL VL] + WORD $0x858c444b // ldr z11, [x2, #97, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858c484a // ldr z10, [x2, #98, MUL VL] + WORD $0x858c4c4b // ldr z11, [x2, #99, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858c504a // ldr z10, [x2, #100, MUL VL] + WORD $0x858c544b // ldr z11, [x2, #101, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858c584a // ldr z10, [x2, #102, MUL VL] + WORD $0x858c5c4b // ldr z11, [x2, #103, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858d404a // ldr z10, [x2, #104, MUL VL] + WORD $0x858d444b // ldr z11, [x2, #105, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858d484a // ldr z10, [x2, #106, MUL VL] + WORD $0x858d4c4b // ldr z11, [x2, #107, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 6 to 9 outputs + WORD $0x8580416c // ldr z12, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, 
#4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858d504a // ldr z10, [x2, #108, MUL VL] + WORD $0x858d544b // ldr z11, [x2, #109, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x858d584a // ldr z10, [x2, #110, MUL VL] + WORD $0x858d5c4b // ldr z11, [x2, #111, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x858e404a // ldr z10, [x2, #112, MUL VL] + WORD $0x858e444b // ldr z11, [x2, #113, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x858e484a // ldr z10, [x2, #114, MUL VL] + WORD $0x858e4c4b // ldr z11, [x2, #115, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858e504a // ldr z10, [x2, #116, MUL VL] + WORD $0x858e544b // ldr z11, [x2, #117, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858e584a // ldr z10, [x2, #118, MUL VL] + WORD $0x858e5c4b // ldr z11, [x2, #119, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858f404a // ldr z10, [x2, #120, MUL VL] + WORD $0x858f444b // ldr z11, [x2, #121, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858f484a // ldr z10, [x2, #122, MUL VL] + WORD $0x858f4c4b // ldr z11, [x2, #123, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858f504a // ldr z10, [x2, #124, MUL VL] + WORD $0x858f544b // ldr z11, [x2, #125, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 7 to 9 outputs + WORD $0x8580418c // ldr z12, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858f584a // ldr z10, [x2, #126, MUL VL] + WORD $0x858f5c4b // ldr z11, [x2, #127, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8590404a // ldr z10, [x2, #128, MUL VL] + WORD $0x8590444b // ldr z11, [x2, #129, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8590484a // ldr z10, [x2, #130, 
MUL VL] + WORD $0x85904c4b // ldr z11, [x2, #131, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8590504a // ldr z10, [x2, #132, MUL VL] + WORD $0x8590544b // ldr z11, [x2, #133, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8590584a // ldr z10, [x2, #134, MUL VL] + WORD $0x85905c4b // ldr z11, [x2, #135, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8591404a // ldr z10, [x2, #136, MUL VL] + WORD $0x8591444b // ldr z11, [x2, #137, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8591484a // ldr z10, [x2, #138, MUL VL] + WORD $0x85914c4b // ldr z11, [x2, #139, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8591504a // ldr z10, [x2, #140, MUL VL] + WORD $0x8591544b // ldr z11, [x2, #141, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8591584a // ldr z10, [x2, #142, MUL VL] + WORD $0x85915c4b // ldr z11, [x2, #143, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 8 to 9 outputs + WORD $0x858041ac // ldr z12, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8592404a // ldr z10, [x2, #144, MUL VL] + WORD $0x8592444b // ldr z11, [x2, #145, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8592484a // ldr z10, [x2, #146, MUL VL] + WORD $0x85924c4b // ldr z11, [x2, #147, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8592504a // ldr z10, [x2, #148, MUL VL] + WORD $0x8592544b // ldr z11, [x2, #149, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8592584a // ldr z10, [x2, #150, MUL VL] + WORD $0x85925c4b // ldr z11, [x2, #151, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8593404a // ldr z10, [x2, #152, MUL VL] + WORD $0x8593444b // ldr z11, [x2, #153, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl 
z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8593484a // ldr z10, [x2, #154, MUL VL] + WORD $0x85934c4b // ldr z11, [x2, #155, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8593504a // ldr z10, [x2, #156, MUL VL] + WORD $0x8593544b // ldr z11, [x2, #157, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8593584a // ldr z10, [x2, #158, MUL VL] + WORD $0x85935c4b // ldr z11, [x2, #159, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8594404a // ldr z10, [x2, #160, MUL VL] + WORD $0x8594444b // ldr z11, [x2, #161, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x9_store + + // Load and process 32 bytes from input 9 to 9 outputs + WORD $0x8580406c // ldr z12, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8594484a // ldr z10, [x2, #162, MUL VL] + WORD $0x85944c4b // ldr z11, [x2, #163, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8594504a // ldr z10, [x2, #164, MUL VL] + WORD $0x8594544b // ldr z11, [x2, #165, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8594584a // ldr z10, [x2, #166, MUL VL] + WORD $0x85945c4b // ldr z11, [x2, #167, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8595404a // ldr z10, [x2, #168, MUL VL] + WORD $0x8595444b // ldr z11, [x2, #169, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8595484a // ldr z10, [x2, #170, MUL VL] + WORD $0x85954c4b // ldr z11, [x2, #171, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8595504a // ldr z10, [x2, #172, MUL VL] + WORD $0x8595544b // ldr z11, [x2, #173, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8595584a // ldr z10, [x2, #174, MUL VL] + WORD $0x85955c4b // ldr z11, [x2, #175, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8596404a // ldr 
z10, [x2, #176, MUL VL] + WORD $0x8596444b // ldr z11, [x2, #177, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8596484a // ldr z10, [x2, #178, MUL VL] + WORD $0x85964c4b // ldr z11, [x2, #179, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + +mulSve_10x9_store: + // Store 9 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x9_loop + +mulSve_10x9_end: + RET + +// func mulSve_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x9Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038c9 // mov z9.d, x6 + WORD $0x05212129 // dup z9.b, z9.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + WORD $0x8580402c // ldr z12, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580404a // ldr z10, [x2] + WORD $0x8580444b // ldr z11, [x2, #1, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580484a // ldr z10, [x2, #2, MUL VL] + WORD $0x85804c4b // ldr z11, 
[x2, #3, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580504a // ldr z10, [x2, #4, MUL VL] + WORD $0x8580544b // ldr z11, [x2, #5, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580584a // ldr z10, [x2, #6, MUL VL] + WORD $0x85805c4b // ldr z11, [x2, #7, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581404a // ldr z10, [x2, #8, MUL VL] + WORD $0x8581444b // ldr z11, [x2, #9, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581484a // ldr z10, [x2, #10, MUL VL] + WORD $0x85814c4b // ldr z11, [x2, #11, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + MOVD 144(R14), R6 + WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581504a // ldr z10, [x2, #12, MUL VL] + WORD $0x8581544b // ldr z11, [x2, #13, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + MOVD 168(R14), R6 + WORD $0xa5ef40c7 // ld1d { z7.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581584a // ldr z10, [x2, #14, MUL VL] + WORD $0x85815c4b // ldr z11, [x2, #15, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + MOVD 192(R14), R6 + WORD $0xa5ef40c8 // ld1d { z8.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8582404a // ldr z10, [x2, #16, MUL VL] + WORD $0x8582444b // ldr z11, [x2, #17, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 1 to 9 outputs + WORD $0x8580408c // ldr z12, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8582484a // ldr z10, [x2, #18, MUL VL] + WORD $0x85824c4b // ldr z11, [x2, #19, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8582504a // ldr z10, [x2, #20, MUL VL] + WORD $0x8582544b // ldr z11, [x2, #21, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + 
WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8582584a // ldr z10, [x2, #22, MUL VL] + WORD $0x85825c4b // ldr z11, [x2, #23, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8583404a // ldr z10, [x2, #24, MUL VL] + WORD $0x8583444b // ldr z11, [x2, #25, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8583484a // ldr z10, [x2, #26, MUL VL] + WORD $0x85834c4b // ldr z11, [x2, #27, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8583504a // ldr z10, [x2, #28, MUL VL] + WORD $0x8583544b // ldr z11, [x2, #29, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8583584a // ldr z10, [x2, #30, MUL VL] + WORD $0x85835c4b // ldr z11, [x2, #31, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8584404a // ldr z10, [x2, #32, MUL VL] + WORD $0x8584444b // ldr z11, [x2, #33, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8584484a // ldr z10, [x2, #34, MUL VL] + WORD $0x85844c4b // ldr z11, [x2, #35, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 2 to 9 outputs + WORD $0x858040ac // ldr z12, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8584504a // ldr z10, [x2, #36, MUL VL] + WORD $0x8584544b // ldr z11, [x2, #37, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8584584a // ldr z10, [x2, #38, MUL VL] + WORD $0x85845c4b // ldr z11, [x2, #39, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8585404a // ldr z10, [x2, #40, MUL VL] + WORD $0x8585444b // ldr z11, [x2, #41, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8585484a // ldr z10, [x2, #42, MUL VL] + WORD $0x85854c4b // ldr z11, [x2, #43, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8585504a // ldr z10, [x2, #44, MUL VL] + WORD $0x8585544b // ldr z11, [x2, #45, MUL VL] + WORD 
$0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8585584a // ldr z10, [x2, #46, MUL VL] + WORD $0x85855c4b // ldr z11, [x2, #47, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8586404a // ldr z10, [x2, #48, MUL VL] + WORD $0x8586444b // ldr z11, [x2, #49, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8586484a // ldr z10, [x2, #50, MUL VL] + WORD $0x85864c4b // ldr z11, [x2, #51, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8586504a // ldr z10, [x2, #52, MUL VL] + WORD $0x8586544b // ldr z11, [x2, #53, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 3 to 9 outputs + WORD $0x8580410c // ldr z12, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8586584a // ldr z10, [x2, #54, MUL VL] + WORD $0x85865c4b // ldr z11, [x2, #55, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8587404a // ldr z10, [x2, #56, MUL VL] + WORD $0x8587444b // ldr z11, [x2, #57, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8587484a // ldr z10, [x2, #58, MUL VL] + WORD $0x85874c4b // ldr z11, [x2, #59, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8587504a // ldr z10, [x2, #60, MUL VL] + WORD $0x8587544b // ldr z11, [x2, #61, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8587584a // ldr z10, [x2, #62, MUL VL] + WORD $0x85875c4b // ldr z11, [x2, #63, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8588404a // ldr z10, [x2, #64, MUL VL] + WORD $0x8588444b // ldr z11, [x2, #65, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8588484a // ldr z10, [x2, #66, MUL VL] + WORD $0x85884c4b // ldr z11, [x2, #67, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor 
z6.d, z6.d, z11.d + WORD $0x8588504a // ldr z10, [x2, #68, MUL VL] + WORD $0x8588544b // ldr z11, [x2, #69, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8588584a // ldr z10, [x2, #70, MUL VL] + WORD $0x85885c4b // ldr z11, [x2, #71, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 4 to 9 outputs + WORD $0x8580412c // ldr z12, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8589404a // ldr z10, [x2, #72, MUL VL] + WORD $0x8589444b // ldr z11, [x2, #73, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8589484a // ldr z10, [x2, #74, MUL VL] + WORD $0x85894c4b // ldr z11, [x2, #75, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8589504a // ldr z10, [x2, #76, MUL VL] + WORD $0x8589544b // ldr z11, [x2, #77, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8589584a // ldr z10, [x2, #78, MUL VL] + WORD $0x85895c4b // ldr z11, [x2, #79, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858a404a // ldr z10, [x2, #80, MUL VL] + WORD $0x858a444b // ldr z11, [x2, #81, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858a484a // ldr z10, [x2, #82, MUL VL] + WORD $0x858a4c4b // ldr z11, [x2, #83, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858a504a // ldr z10, [x2, #84, MUL VL] + WORD $0x858a544b // ldr z11, [x2, #85, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858a584a // ldr z10, [x2, #86, MUL VL] + WORD $0x858a5c4b // ldr z11, [x2, #87, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858b404a // ldr z10, [x2, #88, MUL VL] + WORD $0x858b444b // ldr z11, [x2, #89, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 5 to 9 outputs + 
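// Editorial note: every input block repeats the same GF(2^8) multiply kernel: the 32 loaded bytes are split into low and high nibbles (and/lsr against the 0x0f mask kept in z9), each nibble indexes a 16-byte table via tbl, and both results are xor-ed into the output accumulators z0-z8. + 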
WORD $0x8580414c // ldr z12, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858b484a // ldr z10, [x2, #90, MUL VL] + WORD $0x858b4c4b // ldr z11, [x2, #91, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x858b504a // ldr z10, [x2, #92, MUL VL] + WORD $0x858b544b // ldr z11, [x2, #93, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x858b584a // ldr z10, [x2, #94, MUL VL] + WORD $0x858b5c4b // ldr z11, [x2, #95, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x858c404a // ldr z10, [x2, #96, MUL VL] + WORD $0x858c444b // ldr z11, [x2, #97, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858c484a // ldr z10, [x2, #98, MUL VL] + WORD $0x858c4c4b // ldr z11, [x2, #99, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858c504a // ldr z10, [x2, #100, MUL VL] + WORD $0x858c544b // ldr z11, [x2, #101, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858c584a // ldr z10, [x2, #102, MUL VL] + WORD $0x858c5c4b // ldr z11, [x2, #103, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858d404a // ldr z10, [x2, #104, MUL VL] + WORD $0x858d444b // ldr z11, [x2, #105, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858d484a // ldr z10, [x2, #106, MUL VL] + WORD $0x858d4c4b // ldr z11, [x2, #107, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 6 to 9 outputs + WORD $0x8580416c // ldr z12, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858d504a // ldr z10, [x2, #108, MUL VL] + WORD $0x858d544b // ldr z11, [x2, #109, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x858d584a // ldr z10, [x2, #110, MUL VL] + WORD $0x858d5c4b // ldr z11, [x2, #111, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // 
eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x858e404a // ldr z10, [x2, #112, MUL VL] + WORD $0x858e444b // ldr z11, [x2, #113, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x858e484a // ldr z10, [x2, #114, MUL VL] + WORD $0x858e4c4b // ldr z11, [x2, #115, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x858e504a // ldr z10, [x2, #116, MUL VL] + WORD $0x858e544b // ldr z11, [x2, #117, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x858e584a // ldr z10, [x2, #118, MUL VL] + WORD $0x858e5c4b // ldr z11, [x2, #119, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x858f404a // ldr z10, [x2, #120, MUL VL] + WORD $0x858f444b // ldr z11, [x2, #121, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x858f484a // ldr z10, [x2, #122, MUL VL] + WORD $0x858f4c4b // ldr z11, [x2, #123, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x858f504a // ldr z10, [x2, #124, MUL VL] + WORD $0x858f544b // ldr z11, [x2, #125, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 7 to 9 outputs + WORD $0x8580418c // ldr z12, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x858f584a // ldr z10, [x2, #126, MUL VL] + WORD $0x858f5c4b // ldr z11, [x2, #127, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8590404a // ldr z10, [x2, #128, MUL VL] + WORD $0x8590444b // ldr z11, [x2, #129, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8590484a // ldr z10, [x2, #130, MUL VL] + WORD $0x85904c4b // ldr z11, [x2, #131, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8590504a // ldr z10, [x2, #132, MUL VL] + WORD $0x8590544b // ldr z11, [x2, #133, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8590584a // ldr z10, [x2, #134, MUL VL] + WORD 
$0x85905c4b // ldr z11, [x2, #135, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8591404a // ldr z10, [x2, #136, MUL VL] + WORD $0x8591444b // ldr z11, [x2, #137, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8591484a // ldr z10, [x2, #138, MUL VL] + WORD $0x85914c4b // ldr z11, [x2, #139, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8591504a // ldr z10, [x2, #140, MUL VL] + WORD $0x8591544b // ldr z11, [x2, #141, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8591584a // ldr z10, [x2, #142, MUL VL] + WORD $0x85915c4b // ldr z11, [x2, #143, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 8 to 9 outputs + WORD $0x858041ac // ldr z12, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8592404a // ldr z10, [x2, #144, MUL VL] + WORD $0x8592444b // ldr z11, [x2, #145, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8592484a // ldr z10, [x2, #146, MUL VL] + WORD $0x85924c4b // ldr z11, [x2, #147, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8592504a // ldr z10, [x2, #148, MUL VL] + WORD $0x8592544b // ldr z11, [x2, #149, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8592584a // ldr z10, [x2, #150, MUL VL] + WORD $0x85925c4b // ldr z11, [x2, #151, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8593404a // ldr z10, [x2, #152, MUL VL] + WORD $0x8593444b // ldr z11, [x2, #153, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8593484a // ldr z10, [x2, #154, MUL VL] + WORD $0x85934c4b // ldr z11, [x2, #155, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8593504a // ldr z10, [x2, #156, MUL VL] + WORD $0x8593544b // ldr z11, [x2, #157, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, 
z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8593584a // ldr z10, [x2, #158, MUL VL] + WORD $0x85935c4b // ldr z11, [x2, #159, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8594404a // ldr z10, [x2, #160, MUL VL] + WORD $0x8594444b // ldr z11, [x2, #161, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x9Xor_store + + // Load and process 32 bytes from input 9 to 9 outputs + WORD $0x8580406c // ldr z12, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc958d // lsr z13.d, z12.d, #4 + WORD $0x0429318c // and z12.d, z12.d, z9.d + WORD $0x042931ad // and z13.d, z13.d, z9.d + WORD $0x8594484a // ldr z10, [x2, #162, MUL VL] + WORD $0x85944c4b // ldr z11, [x2, #163, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3000 // eor z0.d, z0.d, z10.d + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x8594504a // ldr z10, [x2, #164, MUL VL] + WORD $0x8594544b // ldr z11, [x2, #165, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3021 // eor z1.d, z1.d, z10.d + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x8594584a // ldr z10, [x2, #166, MUL VL] + WORD $0x85945c4b // ldr z11, [x2, #167, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3042 // eor z2.d, z2.d, z10.d + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x8595404a // ldr z10, [x2, #168, MUL VL] + WORD $0x8595444b // ldr z11, [x2, #169, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3063 // eor z3.d, z3.d, z10.d + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x8595484a // ldr z10, [x2, #170, MUL VL] + WORD $0x85954c4b // ldr z11, [x2, #171, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3084 // eor z4.d, z4.d, z10.d + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x8595504a // ldr z10, [x2, #172, MUL VL] + WORD $0x8595544b // ldr z11, [x2, #173, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30a5 // eor z5.d, z5.d, z10.d + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x8595584a // ldr z10, [x2, #174, MUL VL] + WORD $0x85955c4b // ldr z11, [x2, #175, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30c6 // eor z6.d, z6.d, z10.d + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x8596404a // ldr z10, [x2, #176, MUL VL] + WORD $0x8596444b // ldr z11, [x2, #177, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa30e7 // eor z7.d, z7.d, z10.d + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x8596484a // ldr z10, [x2, #178, MUL VL] + WORD $0x85964c4b // ldr z11, [x2, #179, MUL VL] + WORD $0x052c314a // tbl z10.b, z10.b, z12.b + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x04aa3108 // eor z8.d, z8.d, z10.d + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + +mulSve_10x9Xor_store: + // Store 9 
outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x9Xor_loop + +mulSve_10x9Xor_end: + RET + +// func mulSve_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x10_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038ca // mov z10.d, x6 + WORD $0x0521214a // dup z10.b, z10.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + WORD $0x8580402d // ldr z13, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8580404b // ldr z11, [x2] + WORD $0x8580444c // ldr z12, [x2, #1, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3180 // eor z0.d, z12.d, z11.d + WORD $0x8580484b // ldr z11, [x2, #2, MUL VL] + WORD $0x85804c4c // ldr z12, [x2, #3, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3181 // eor z1.d, z12.d, z11.d + WORD $0x8580504b // ldr z11, [x2, #4, MUL VL] + WORD $0x8580544c // ldr z12, [x2, #5, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3182 // eor z2.d, z12.d, z11.d + WORD $0x8580584b // ldr z11, [x2, #6, MUL VL] + WORD $0x85805c4c // ldr z12, [x2, #7, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3183 // eor z3.d, z12.d, z11.d + WORD $0x8581404b // ldr z11, [x2, #8, MUL VL] + WORD $0x8581444c // ldr z12, [x2, #9, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + 
WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3184 // eor z4.d, z12.d, z11.d + WORD $0x8581484b // ldr z11, [x2, #10, MUL VL] + WORD $0x85814c4c // ldr z12, [x2, #11, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3185 // eor z5.d, z12.d, z11.d + WORD $0x8581504b // ldr z11, [x2, #12, MUL VL] + WORD $0x8581544c // ldr z12, [x2, #13, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3186 // eor z6.d, z12.d, z11.d + WORD $0x8581584b // ldr z11, [x2, #14, MUL VL] + WORD $0x85815c4c // ldr z12, [x2, #15, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3187 // eor z7.d, z12.d, z11.d + WORD $0x8582404b // ldr z11, [x2, #16, MUL VL] + WORD $0x8582444c // ldr z12, [x2, #17, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3188 // eor z8.d, z12.d, z11.d + WORD $0x8582484b // ldr z11, [x2, #18, MUL VL] + WORD $0x85824c4c // ldr z12, [x2, #19, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3189 // eor z9.d, z12.d, z11.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 1 to 10 outputs + WORD $0x8580408d // ldr z13, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8582504b // ldr z11, [x2, #20, MUL VL] + WORD $0x8582544c // ldr z12, [x2, #21, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8582584b // ldr z11, [x2, #22, MUL VL] + WORD $0x85825c4c // ldr z12, [x2, #23, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8583404b // ldr z11, [x2, #24, MUL VL] + WORD $0x8583444c // ldr z12, [x2, #25, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8583484b // ldr z11, [x2, #26, MUL VL] + WORD $0x85834c4c // ldr z12, [x2, #27, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8583504b // ldr z11, [x2, #28, MUL VL] + WORD $0x8583544c // ldr z12, [x2, #29, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8583584b // ldr z11, [x2, #30, MUL VL] + WORD $0x85835c4c // ldr z12, [x2, #31, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8584404b // ldr z11, [x2, #32, MUL VL] + WORD $0x8584444c // ldr z12, [x2, #33, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + 
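// Editorial note: from the second input on, products are xor-accumulated into the existing accumulators (eor zN.d, zN.d, ...), whereas the first input initialized them directly (eor zN.d, z12.d, z11.d). This 10x10 kernel keeps the 0x0f mask in z10 and uses z11-z14 as scratch, leaving z0-z9 for the ten outputs. + 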
WORD $0x8584484b // ldr z11, [x2, #34, MUL VL] + WORD $0x85844c4c // ldr z12, [x2, #35, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8584504b // ldr z11, [x2, #36, MUL VL] + WORD $0x8584544c // ldr z12, [x2, #37, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8584584b // ldr z11, [x2, #38, MUL VL] + WORD $0x85845c4c // ldr z12, [x2, #39, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 2 to 10 outputs + WORD $0x858040ad // ldr z13, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8585404b // ldr z11, [x2, #40, MUL VL] + WORD $0x8585444c // ldr z12, [x2, #41, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8585484b // ldr z11, [x2, #42, MUL VL] + WORD $0x85854c4c // ldr z12, [x2, #43, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8585504b // ldr z11, [x2, #44, MUL VL] + WORD $0x8585544c // ldr z12, [x2, #45, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8585584b // ldr z11, [x2, #46, MUL VL] + WORD $0x85855c4c // ldr z12, [x2, #47, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8586404b // ldr z11, [x2, #48, MUL VL] + WORD $0x8586444c // ldr z12, [x2, #49, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8586484b // ldr z11, [x2, #50, MUL VL] + WORD $0x85864c4c // ldr z12, [x2, #51, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8586504b // ldr z11, [x2, #52, MUL VL] + WORD $0x8586544c // ldr z12, [x2, #53, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8586584b // ldr z11, [x2, #54, MUL VL] + WORD $0x85865c4c // ldr z12, [x2, #55, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8587404b // ldr z11, [x2, #56, MUL VL] + WORD $0x8587444c // ldr z12, [x2, #57, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD 
$0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8587484b // ldr z11, [x2, #58, MUL VL] + WORD $0x85874c4c // ldr z12, [x2, #59, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 3 to 10 outputs + WORD $0x8580410d // ldr z13, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8587504b // ldr z11, [x2, #60, MUL VL] + WORD $0x8587544c // ldr z12, [x2, #61, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8587584b // ldr z11, [x2, #62, MUL VL] + WORD $0x85875c4c // ldr z12, [x2, #63, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8588404b // ldr z11, [x2, #64, MUL VL] + WORD $0x8588444c // ldr z12, [x2, #65, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8588484b // ldr z11, [x2, #66, MUL VL] + WORD $0x85884c4c // ldr z12, [x2, #67, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8588504b // ldr z11, [x2, #68, MUL VL] + WORD $0x8588544c // ldr z12, [x2, #69, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8588584b // ldr z11, [x2, #70, MUL VL] + WORD $0x85885c4c // ldr z12, [x2, #71, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8589404b // ldr z11, [x2, #72, MUL VL] + WORD $0x8589444c // ldr z12, [x2, #73, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8589484b // ldr z11, [x2, #74, MUL VL] + WORD $0x85894c4c // ldr z12, [x2, #75, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8589504b // ldr z11, [x2, #76, MUL VL] + WORD $0x8589544c // ldr z12, [x2, #77, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8589584b // ldr z11, [x2, #78, MUL VL] + WORD $0x85895c4c // ldr z12, [x2, #79, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early 
termination + CMP $4, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 4 to 10 outputs + WORD $0x8580412d // ldr z13, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858a404b // ldr z11, [x2, #80, MUL VL] + WORD $0x858a444c // ldr z12, [x2, #81, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858a484b // ldr z11, [x2, #82, MUL VL] + WORD $0x858a4c4c // ldr z12, [x2, #83, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858a504b // ldr z11, [x2, #84, MUL VL] + WORD $0x858a544c // ldr z12, [x2, #85, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858a584b // ldr z11, [x2, #86, MUL VL] + WORD $0x858a5c4c // ldr z12, [x2, #87, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x858b404b // ldr z11, [x2, #88, MUL VL] + WORD $0x858b444c // ldr z12, [x2, #89, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x858b484b // ldr z11, [x2, #90, MUL VL] + WORD $0x858b4c4c // ldr z12, [x2, #91, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x858b504b // ldr z11, [x2, #92, MUL VL] + WORD $0x858b544c // ldr z12, [x2, #93, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x858b584b // ldr z11, [x2, #94, MUL VL] + WORD $0x858b5c4c // ldr z12, [x2, #95, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x858c404b // ldr z11, [x2, #96, MUL VL] + WORD $0x858c444c // ldr z12, [x2, #97, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x858c484b // ldr z11, [x2, #98, MUL VL] + WORD $0x858c4c4c // ldr z12, [x2, #99, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 5 to 10 outputs + WORD $0x8580414d // ldr z13, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858c504b // ldr z11, [x2, #100, MUL VL] + WORD $0x858c544c // ldr z12, [x2, #101, MUL VL] + WORD 
$0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858c584b // ldr z11, [x2, #102, MUL VL] + WORD $0x858c5c4c // ldr z12, [x2, #103, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858d404b // ldr z11, [x2, #104, MUL VL] + WORD $0x858d444c // ldr z12, [x2, #105, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858d484b // ldr z11, [x2, #106, MUL VL] + WORD $0x858d4c4c // ldr z12, [x2, #107, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x858d504b // ldr z11, [x2, #108, MUL VL] + WORD $0x858d544c // ldr z12, [x2, #109, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x858d584b // ldr z11, [x2, #110, MUL VL] + WORD $0x858d5c4c // ldr z12, [x2, #111, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x858e404b // ldr z11, [x2, #112, MUL VL] + WORD $0x858e444c // ldr z12, [x2, #113, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x858e484b // ldr z11, [x2, #114, MUL VL] + WORD $0x858e4c4c // ldr z12, [x2, #115, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x858e504b // ldr z11, [x2, #116, MUL VL] + WORD $0x858e544c // ldr z12, [x2, #117, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x858e584b // ldr z11, [x2, #118, MUL VL] + WORD $0x858e5c4c // ldr z12, [x2, #119, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 6 to 10 outputs + WORD $0x8580416d // ldr z13, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858f404b // ldr z11, [x2, #120, MUL VL] + WORD $0x858f444c // ldr z12, [x2, #121, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858f484b // ldr z11, [x2, #122, MUL VL] + WORD $0x858f4c4c // ldr z12, [x2, #123, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + 
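// the matching eor below adds the high-nibble partial product into the same output register +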
WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858f504b // ldr z11, [x2, #124, MUL VL] + WORD $0x858f544c // ldr z12, [x2, #125, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858f584b // ldr z11, [x2, #126, MUL VL] + WORD $0x858f5c4c // ldr z12, [x2, #127, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8590404b // ldr z11, [x2, #128, MUL VL] + WORD $0x8590444c // ldr z12, [x2, #129, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8590484b // ldr z11, [x2, #130, MUL VL] + WORD $0x85904c4c // ldr z12, [x2, #131, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8590504b // ldr z11, [x2, #132, MUL VL] + WORD $0x8590544c // ldr z12, [x2, #133, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8590584b // ldr z11, [x2, #134, MUL VL] + WORD $0x85905c4c // ldr z12, [x2, #135, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8591404b // ldr z11, [x2, #136, MUL VL] + WORD $0x8591444c // ldr z12, [x2, #137, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8591484b // ldr z11, [x2, #138, MUL VL] + WORD $0x85914c4c // ldr z12, [x2, #139, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 7 to 10 outputs + WORD $0x8580418d // ldr z13, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8591504b // ldr z11, [x2, #140, MUL VL] + WORD $0x8591544c // ldr z12, [x2, #141, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8591584b // ldr z11, [x2, #142, MUL VL] + WORD $0x85915c4c // ldr z12, [x2, #143, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8592404b // ldr z11, [x2, #144, MUL VL] + WORD $0x8592444c // ldr z12, [x2, #145, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8592484b // ldr z11, [x2, #146, MUL VL] + WORD $0x85924c4c // ldr z12, [x2, 
#147, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8592504b // ldr z11, [x2, #148, MUL VL] + WORD $0x8592544c // ldr z12, [x2, #149, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8592584b // ldr z11, [x2, #150, MUL VL] + WORD $0x85925c4c // ldr z12, [x2, #151, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8593404b // ldr z11, [x2, #152, MUL VL] + WORD $0x8593444c // ldr z12, [x2, #153, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8593484b // ldr z11, [x2, #154, MUL VL] + WORD $0x85934c4c // ldr z12, [x2, #155, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8593504b // ldr z11, [x2, #156, MUL VL] + WORD $0x8593544c // ldr z12, [x2, #157, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8593584b // ldr z11, [x2, #158, MUL VL] + WORD $0x85935c4c // ldr z12, [x2, #159, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 8 to 10 outputs + WORD $0x858041ad // ldr z13, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8594404b // ldr z11, [x2, #160, MUL VL] + WORD $0x8594444c // ldr z12, [x2, #161, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8594484b // ldr z11, [x2, #162, MUL VL] + WORD $0x85944c4c // ldr z12, [x2, #163, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8594504b // ldr z11, [x2, #164, MUL VL] + WORD $0x8594544c // ldr z12, [x2, #165, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8594584b // ldr z11, [x2, #166, MUL VL] + WORD $0x85945c4c // ldr z12, [x2, #167, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8595404b // ldr z11, [x2, #168, MUL VL] + WORD $0x8595444c // ldr z12, [x2, #169, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // 
eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8595484b // ldr z11, [x2, #170, MUL VL] + WORD $0x85954c4c // ldr z12, [x2, #171, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8595504b // ldr z11, [x2, #172, MUL VL] + WORD $0x8595544c // ldr z12, [x2, #173, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8595584b // ldr z11, [x2, #174, MUL VL] + WORD $0x85955c4c // ldr z12, [x2, #175, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8596404b // ldr z11, [x2, #176, MUL VL] + WORD $0x8596444c // ldr z12, [x2, #177, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8596484b // ldr z11, [x2, #178, MUL VL] + WORD $0x85964c4c // ldr z12, [x2, #179, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x10_store + + // Load and process 32 bytes from input 9 to 10 outputs + WORD $0x8580406d // ldr z13, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8596504b // ldr z11, [x2, #180, MUL VL] + WORD $0x8596544c // ldr z12, [x2, #181, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8596584b // ldr z11, [x2, #182, MUL VL] + WORD $0x85965c4c // ldr z12, [x2, #183, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8597404b // ldr z11, [x2, #184, MUL VL] + WORD $0x8597444c // ldr z12, [x2, #185, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8597484b // ldr z11, [x2, #186, MUL VL] + WORD $0x85974c4c // ldr z12, [x2, #187, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8597504b // ldr z11, [x2, #188, MUL VL] + WORD $0x8597544c // ldr z12, [x2, #189, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8597584b // ldr z11, [x2, #190, MUL VL] + WORD $0x85975c4c // ldr z12, [x2, #191, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8598404b // ldr z11, [x2, #192, MUL VL] + WORD 
$0x8598444c // ldr z12, [x2, #193, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8598484b // ldr z11, [x2, #194, MUL VL] + WORD $0x85984c4c // ldr z12, [x2, #195, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8598504b // ldr z11, [x2, #196, MUL VL] + WORD $0x8598544c // ldr z12, [x2, #197, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8598584b // ldr z11, [x2, #198, MUL VL] + WORD $0x85985c4c // ldr z12, [x2, #199, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + +mulSve_10x10_store: + // Store 10 outputs + MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + MOVD 216(R14), R6 + WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x10_loop + +mulSve_10x10_end: + RET + +// func mulSve_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: SVE +TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88 + WORD $0x25d8e3e0 // ptrue p0.d + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVD n+80(FP), R0 + MOVD matrix_base+0(FP), R2 + WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xea00001f // tst x0, x0 + BEQ mulSve_10x10Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + WORD $0x8b0f0021 // add x1, x1, x15 + WORD $0x8b0f0084 // add x4, x4, x15 + WORD $0x8b0f00a5 // add x5, x5, x15 + WORD $0x8b0f0108 // add x8, x8, x15 + WORD $0x8b0f0129 // add x9, x9, x15 + WORD $0x8b0f014a // add x10, x10, x15 + WORD $0x8b0f016b // add x11, x11, x15 + WORD $0x8b0f018c // add x12, x12, x15 + WORD $0x8b0f01ad // add x13, x13, x15 + WORD $0x8b0f0063 // add x3, x3, x15 + WORD $0xd343fdef // lsr x15, x15, #3 + WORD $0xd28001e6 // mov x6, #15 + WORD $0x05e038ca // mov z10.d, x6 + WORD $0x0521214a // dup z10.b, z10.b[0] + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulSve_10x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + WORD $0x8580402d // ldr z13, [x1] + WORD $0x91008021 // add x1, x1, #32 + WORD 
$0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + MOVD (R14), R6 + WORD $0xa5ef40c0 // ld1d { z0.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580404b // ldr z11, [x2] + WORD $0x8580444c // ldr z12, [x2, #1, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + MOVD 24(R14), R6 + WORD $0xa5ef40c1 // ld1d { z1.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580484b // ldr z11, [x2, #2, MUL VL] + WORD $0x85804c4c // ldr z12, [x2, #3, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + MOVD 48(R14), R6 + WORD $0xa5ef40c2 // ld1d { z2.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580504b // ldr z11, [x2, #4, MUL VL] + WORD $0x8580544c // ldr z12, [x2, #5, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + MOVD 72(R14), R6 + WORD $0xa5ef40c3 // ld1d { z3.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8580584b // ldr z11, [x2, #6, MUL VL] + WORD $0x85805c4c // ldr z12, [x2, #7, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + MOVD 96(R14), R6 + WORD $0xa5ef40c4 // ld1d { z4.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581404b // ldr z11, [x2, #8, MUL VL] + WORD $0x8581444c // ldr z12, [x2, #9, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + MOVD 120(R14), R6 + WORD $0xa5ef40c5 // ld1d { z5.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581484b // ldr z11, [x2, #10, MUL VL] + WORD $0x85814c4c // ldr z12, [x2, #11, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + MOVD 144(R14), R6 + WORD $0xa5ef40c6 // ld1d { z6.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581504b // ldr z11, [x2, #12, MUL VL] + WORD $0x8581544c // ldr z12, [x2, #13, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + MOVD 168(R14), R6 + WORD $0xa5ef40c7 // ld1d { z7.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8581584b // ldr z11, [x2, #14, MUL VL] + WORD $0x85815c4c // ldr z12, [x2, #15, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + MOVD 192(R14), R6 + WORD $0xa5ef40c8 // ld1d { z8.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8582404b // ldr z11, [x2, #16, MUL VL] + WORD $0x8582444c // ldr z12, [x2, #17, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + MOVD 216(R14), R6 + WORD $0xa5ef40c9 // ld1d { z9.d }, p0/z, [x6, x15, lsl #3] + WORD $0x8582484b // ldr z11, [x2, #18, MUL VL] + WORD $0x85824c4c // ldr z12, [x2, #19, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b 
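+ // Xor variant: z0-z9 were seeded from the existing output shards via ld1d above, so new products accumulate on top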
+ WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $1, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 1 to 10 outputs + WORD $0x8580408d // ldr z13, [x4] + WORD $0x91008084 // add x4, x4, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8582504b // ldr z11, [x2, #20, MUL VL] + WORD $0x8582544c // ldr z12, [x2, #21, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8582584b // ldr z11, [x2, #22, MUL VL] + WORD $0x85825c4c // ldr z12, [x2, #23, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8583404b // ldr z11, [x2, #24, MUL VL] + WORD $0x8583444c // ldr z12, [x2, #25, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8583484b // ldr z11, [x2, #26, MUL VL] + WORD $0x85834c4c // ldr z12, [x2, #27, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8583504b // ldr z11, [x2, #28, MUL VL] + WORD $0x8583544c // ldr z12, [x2, #29, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8583584b // ldr z11, [x2, #30, MUL VL] + WORD $0x85835c4c // ldr z12, [x2, #31, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8584404b // ldr z11, [x2, #32, MUL VL] + WORD $0x8584444c // ldr z12, [x2, #33, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8584484b // ldr z11, [x2, #34, MUL VL] + WORD $0x85844c4c // ldr z12, [x2, #35, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8584504b // ldr z11, [x2, #36, MUL VL] + WORD $0x8584544c // ldr z12, [x2, #37, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8584584b // ldr z11, [x2, #38, MUL VL] + WORD $0x85845c4c // ldr z12, [x2, #39, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $2, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 2 to 10 outputs + WORD $0x858040ad // ldr z13, [x5] + WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, 
z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8585404b // ldr z11, [x2, #40, MUL VL] + WORD $0x8585444c // ldr z12, [x2, #41, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8585484b // ldr z11, [x2, #42, MUL VL] + WORD $0x85854c4c // ldr z12, [x2, #43, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8585504b // ldr z11, [x2, #44, MUL VL] + WORD $0x8585544c // ldr z12, [x2, #45, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8585584b // ldr z11, [x2, #46, MUL VL] + WORD $0x85855c4c // ldr z12, [x2, #47, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8586404b // ldr z11, [x2, #48, MUL VL] + WORD $0x8586444c // ldr z12, [x2, #49, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8586484b // ldr z11, [x2, #50, MUL VL] + WORD $0x85864c4c // ldr z12, [x2, #51, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8586504b // ldr z11, [x2, #52, MUL VL] + WORD $0x8586544c // ldr z12, [x2, #53, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8586584b // ldr z11, [x2, #54, MUL VL] + WORD $0x85865c4c // ldr z12, [x2, #55, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8587404b // ldr z11, [x2, #56, MUL VL] + WORD $0x8587444c // ldr z12, [x2, #57, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8587484b // ldr z11, [x2, #58, MUL VL] + WORD $0x85874c4c // ldr z12, [x2, #59, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $3, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 3 to 10 outputs + WORD $0x8580410d // ldr z13, [x8] + WORD $0x91008108 // add x8, x8, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8587504b // ldr z11, [x2, #60, MUL VL] + WORD $0x8587544c // ldr z12, [x2, #61, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8587584b // ldr z11, [x2, #62, MUL VL] + WORD $0x85875c4c // ldr z12, [x2, #63, MUL VL] + 
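// the matrix at x2 holds two nibble tables per input/output pair, addressed by #k, MUL VL offsets (VL assumed to be 32 bytes here) +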
WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8588404b // ldr z11, [x2, #64, MUL VL] + WORD $0x8588444c // ldr z12, [x2, #65, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8588484b // ldr z11, [x2, #66, MUL VL] + WORD $0x85884c4c // ldr z12, [x2, #67, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8588504b // ldr z11, [x2, #68, MUL VL] + WORD $0x8588544c // ldr z12, [x2, #69, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8588584b // ldr z11, [x2, #70, MUL VL] + WORD $0x85885c4c // ldr z12, [x2, #71, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8589404b // ldr z11, [x2, #72, MUL VL] + WORD $0x8589444c // ldr z12, [x2, #73, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8589484b // ldr z11, [x2, #74, MUL VL] + WORD $0x85894c4c // ldr z12, [x2, #75, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8589504b // ldr z11, [x2, #76, MUL VL] + WORD $0x8589544c // ldr z12, [x2, #77, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8589584b // ldr z11, [x2, #78, MUL VL] + WORD $0x85895c4c // ldr z12, [x2, #79, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $4, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 4 to 10 outputs + WORD $0x8580412d // ldr z13, [x9] + WORD $0x91008129 // add x9, x9, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858a404b // ldr z11, [x2, #80, MUL VL] + WORD $0x858a444c // ldr z12, [x2, #81, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858a484b // ldr z11, [x2, #82, MUL VL] + WORD $0x858a4c4c // ldr z12, [x2, #83, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858a504b // ldr z11, [x2, #84, MUL VL] + WORD $0x858a544c // ldr z12, [x2, #85, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 
// eor z2.d, z2.d, z12.d + WORD $0x858a584b // ldr z11, [x2, #86, MUL VL] + WORD $0x858a5c4c // ldr z12, [x2, #87, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x858b404b // ldr z11, [x2, #88, MUL VL] + WORD $0x858b444c // ldr z12, [x2, #89, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x858b484b // ldr z11, [x2, #90, MUL VL] + WORD $0x858b4c4c // ldr z12, [x2, #91, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x858b504b // ldr z11, [x2, #92, MUL VL] + WORD $0x858b544c // ldr z12, [x2, #93, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x858b584b // ldr z11, [x2, #94, MUL VL] + WORD $0x858b5c4c // ldr z12, [x2, #95, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x858c404b // ldr z11, [x2, #96, MUL VL] + WORD $0x858c444c // ldr z12, [x2, #97, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x858c484b // ldr z11, [x2, #98, MUL VL] + WORD $0x858c4c4c // ldr z12, [x2, #99, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $5, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 5 to 10 outputs + WORD $0x8580414d // ldr z13, [x10] + WORD $0x9100814a // add x10, x10, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858c504b // ldr z11, [x2, #100, MUL VL] + WORD $0x858c544c // ldr z12, [x2, #101, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858c584b // ldr z11, [x2, #102, MUL VL] + WORD $0x858c5c4c // ldr z12, [x2, #103, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858d404b // ldr z11, [x2, #104, MUL VL] + WORD $0x858d444c // ldr z12, [x2, #105, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858d484b // ldr z11, [x2, #106, MUL VL] + WORD $0x858d4c4c // ldr z12, [x2, #107, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x858d504b // ldr z11, [x2, #108, MUL VL] + WORD $0x858d544c // ldr z12, [x2, #109, MUL VL] + WORD 
$0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x858d584b // ldr z11, [x2, #110, MUL VL] + WORD $0x858d5c4c // ldr z12, [x2, #111, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x858e404b // ldr z11, [x2, #112, MUL VL] + WORD $0x858e444c // ldr z12, [x2, #113, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x858e484b // ldr z11, [x2, #114, MUL VL] + WORD $0x858e4c4c // ldr z12, [x2, #115, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x858e504b // ldr z11, [x2, #116, MUL VL] + WORD $0x858e544c // ldr z12, [x2, #117, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x858e584b // ldr z11, [x2, #118, MUL VL] + WORD $0x858e5c4c // ldr z12, [x2, #119, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $6, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 6 to 10 outputs + WORD $0x8580416d // ldr z13, [x11] + WORD $0x9100816b // add x11, x11, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x858f404b // ldr z11, [x2, #120, MUL VL] + WORD $0x858f444c // ldr z12, [x2, #121, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x858f484b // ldr z11, [x2, #122, MUL VL] + WORD $0x858f4c4c // ldr z12, [x2, #123, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x858f504b // ldr z11, [x2, #124, MUL VL] + WORD $0x858f544c // ldr z12, [x2, #125, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x858f584b // ldr z11, [x2, #126, MUL VL] + WORD $0x858f5c4c // ldr z12, [x2, #127, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8590404b // ldr z11, [x2, #128, MUL VL] + WORD $0x8590444c // ldr z12, [x2, #129, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8590484b // ldr z11, [x2, #130, MUL VL] + WORD $0x85904c4c // ldr z12, [x2, #131, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, 
z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8590504b // ldr z11, [x2, #132, MUL VL] + WORD $0x8590544c // ldr z12, [x2, #133, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8590584b // ldr z11, [x2, #134, MUL VL] + WORD $0x85905c4c // ldr z12, [x2, #135, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8591404b // ldr z11, [x2, #136, MUL VL] + WORD $0x8591444c // ldr z12, [x2, #137, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8591484b // ldr z11, [x2, #138, MUL VL] + WORD $0x85914c4c // ldr z12, [x2, #139, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $7, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 7 to 10 outputs + WORD $0x8580418d // ldr z13, [x12] + WORD $0x9100818c // add x12, x12, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8591504b // ldr z11, [x2, #140, MUL VL] + WORD $0x8591544c // ldr z12, [x2, #141, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8591584b // ldr z11, [x2, #142, MUL VL] + WORD $0x85915c4c // ldr z12, [x2, #143, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8592404b // ldr z11, [x2, #144, MUL VL] + WORD $0x8592444c // ldr z12, [x2, #145, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8592484b // ldr z11, [x2, #146, MUL VL] + WORD $0x85924c4c // ldr z12, [x2, #147, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8592504b // ldr z11, [x2, #148, MUL VL] + WORD $0x8592544c // ldr z12, [x2, #149, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8592584b // ldr z11, [x2, #150, MUL VL] + WORD $0x85925c4c // ldr z12, [x2, #151, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8593404b // ldr z11, [x2, #152, MUL VL] + WORD $0x8593444c // ldr z12, [x2, #153, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8593484b // ldr z11, [x2, #154, MUL VL] + WORD $0x85934c4c // ldr 
z12, [x2, #155, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8593504b // ldr z11, [x2, #156, MUL VL] + WORD $0x8593544c // ldr z12, [x2, #157, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8593584b // ldr z11, [x2, #158, MUL VL] + WORD $0x85935c4c // ldr z12, [x2, #159, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $8, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 8 to 10 outputs + WORD $0x858041ad // ldr z13, [x13] + WORD $0x910081ad // add x13, x13, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8594404b // ldr z11, [x2, #160, MUL VL] + WORD $0x8594444c // ldr z12, [x2, #161, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8594484b // ldr z11, [x2, #162, MUL VL] + WORD $0x85944c4c // ldr z12, [x2, #163, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8594504b // ldr z11, [x2, #164, MUL VL] + WORD $0x8594544c // ldr z12, [x2, #165, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8594584b // ldr z11, [x2, #166, MUL VL] + WORD $0x85945c4c // ldr z12, [x2, #167, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8595404b // ldr z11, [x2, #168, MUL VL] + WORD $0x8595444c // ldr z12, [x2, #169, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8595484b // ldr z11, [x2, #170, MUL VL] + WORD $0x85954c4c // ldr z12, [x2, #171, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8595504b // ldr z11, [x2, #172, MUL VL] + WORD $0x8595544c // ldr z12, [x2, #173, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8595584b // ldr z11, [x2, #174, MUL VL] + WORD $0x85955c4c // ldr z12, [x2, #175, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8596404b // ldr z11, [x2, #176, MUL VL] + WORD $0x8596444c // ldr z12, [x2, #177, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD 
$0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8596484b // ldr z11, [x2, #178, MUL VL] + WORD $0x85964c4c // ldr z12, [x2, #179, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + // Check for early termination + CMP $9, R16 + BEQ mulSve_10x10Xor_store + + // Load and process 32 bytes from input 9 to 10 outputs + WORD $0x8580406d // ldr z13, [x3] + WORD $0x91008063 // add x3, x3, #32 + WORD $0x04fc95ae // lsr z14.d, z13.d, #4 + WORD $0x042a31ad // and z13.d, z13.d, z10.d + WORD $0x042a31ce // and z14.d, z14.d, z10.d + WORD $0x8596504b // ldr z11, [x2, #180, MUL VL] + WORD $0x8596544c // ldr z12, [x2, #181, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3000 // eor z0.d, z0.d, z11.d + WORD $0x04ac3000 // eor z0.d, z0.d, z12.d + WORD $0x8596584b // ldr z11, [x2, #182, MUL VL] + WORD $0x85965c4c // ldr z12, [x2, #183, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3021 // eor z1.d, z1.d, z11.d + WORD $0x04ac3021 // eor z1.d, z1.d, z12.d + WORD $0x8597404b // ldr z11, [x2, #184, MUL VL] + WORD $0x8597444c // ldr z12, [x2, #185, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3042 // eor z2.d, z2.d, z11.d + WORD $0x04ac3042 // eor z2.d, z2.d, z12.d + WORD $0x8597484b // ldr z11, [x2, #186, MUL VL] + WORD $0x85974c4c // ldr z12, [x2, #187, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3063 // eor z3.d, z3.d, z11.d + WORD $0x04ac3063 // eor z3.d, z3.d, z12.d + WORD $0x8597504b // ldr z11, [x2, #188, MUL VL] + WORD $0x8597544c // ldr z12, [x2, #189, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3084 // eor z4.d, z4.d, z11.d + WORD $0x04ac3084 // eor z4.d, z4.d, z12.d + WORD $0x8597584b // ldr z11, [x2, #190, MUL VL] + WORD $0x85975c4c // ldr z12, [x2, #191, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30a5 // eor z5.d, z5.d, z11.d + WORD $0x04ac30a5 // eor z5.d, z5.d, z12.d + WORD $0x8598404b // ldr z11, [x2, #192, MUL VL] + WORD $0x8598444c // ldr z12, [x2, #193, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30c6 // eor z6.d, z6.d, z11.d + WORD $0x04ac30c6 // eor z6.d, z6.d, z12.d + WORD $0x8598484b // ldr z11, [x2, #194, MUL VL] + WORD $0x85984c4c // ldr z12, [x2, #195, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab30e7 // eor z7.d, z7.d, z11.d + WORD $0x04ac30e7 // eor z7.d, z7.d, z12.d + WORD $0x8598504b // ldr z11, [x2, #196, MUL VL] + WORD $0x8598544c // ldr z12, [x2, #197, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3108 // eor z8.d, z8.d, z11.d + WORD $0x04ac3108 // eor z8.d, z8.d, z12.d + WORD $0x8598584b // ldr z11, [x2, #198, MUL VL] + WORD $0x85985c4c // ldr z12, [x2, #199, MUL VL] + WORD $0x052d316b // tbl z11.b, z11.b, z13.b + WORD $0x052e318c // tbl z12.b, z12.b, z14.b + WORD $0x04ab3129 // eor z9.d, z9.d, z11.d + WORD $0x04ac3129 // eor z9.d, z9.d, z12.d + +mulSve_10x10Xor_store: + // Store 10 outputs + 
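// x15 holds the start offset pre-shifted right by 3, so the lsl #3 in st1d restores the original byte offset into each output shard +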
MOVD (R14), R6 + WORD $0xe5ef40c0 // st1d { z0.d }, p0, [x6, x15, lsl #3] + MOVD 24(R14), R6 + WORD $0xe5ef40c1 // st1d { z1.d }, p0, [x6, x15, lsl #3] + MOVD 48(R14), R6 + WORD $0xe5ef40c2 // st1d { z2.d }, p0, [x6, x15, lsl #3] + MOVD 72(R14), R6 + WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] + MOVD 96(R14), R6 + WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] + MOVD 120(R14), R6 + WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] + MOVD 144(R14), R6 + WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] + MOVD 168(R14), R6 + WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] + MOVD 192(R14), R6 + WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] + MOVD 216(R14), R6 + WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3] + + // Prepare for next loop + WORD $0x910011ef // add x15, x15, #4 + WORD $0xf1000400 // subs x0, x0, #1 + BNE mulSve_10x10Xor_loop + +mulSve_10x10Xor_end: + RET + +// func mulNeon_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x1_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to output + ADD R15, R14 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + MOVD $15, R15 + VMOV R15, V4.B[0] + VDUP V4.B[0], V4.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x1_64_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 64 bytes from input 0 to 1 outputs + VLD1.P 32(R1), [V12.B16, V13.B16] + VLD1.P 32(R1), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V8.B16, V0.B16 + VEOR V7.B16, V9.B16, V1.B16 + VEOR V10.B16, V12.B16, V2.B16 + VEOR V11.B16, V13.B16, V3.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 1 to 1 outputs + VLD1.P 32(R4), [V12.B16, V13.B16] + VLD1.P 32(R4), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, 
V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 2 to 1 outputs + VLD1.P 32(R5), [V12.B16, V13.B16] + VLD1.P 32(R5), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 3 to 1 outputs + VLD1.P 32(R8), [V12.B16, V13.B16] + VLD1.P 32(R8), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 4 to 1 outputs + VLD1.P 32(R9), [V12.B16, V13.B16] + VLD1.P 32(R9), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + 
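+	// Table lookups: the masked low/high nibbles index the 16-byte GF(2^8)
+	// product tables in V6-V9; the results are XOR-folded into V0-V3 below.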
VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 5 to 1 outputs + VLD1.P 32(R10), [V12.B16, V13.B16] + VLD1.P 32(R10), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 6 to 1 outputs + VLD1.P 32(R11), [V12.B16, V13.B16] + VLD1.P 32(R11), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 7 to 1 outputs + VLD1.P 32(R12), [V12.B16, V13.B16] + VLD1.P 32(R12), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL 
V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 8 to 1 outputs + VLD1.P 32(R13), [V12.B16, V13.B16] + VLD1.P 32(R13), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x1_64_store + + // Load and process 64 bytes from input 9 to 1 outputs + VLD1.P 32(R3), [V12.B16, V13.B16] + VLD1.P 32(R3), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + +mulNeon_10x1_64_store: + // Store 1 outputs + VST1.P [V0.D2, V1.D2], 32(R14) + VST1.P [V2.D2, V3.D2], 32(R14) + + // Prepare for next loop + SUBS $1, R0 + BNE mulNeon_10x1_64_loop + +mulNeon_10x1_64_end: + RET + +// func mulNeon_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x1_64Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R14 + MOVD start+72(FP), R15 + + // Add start offset to 
output + ADD R15, R14 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + MOVD $15, R15 + VMOV R15, V4.B[0] + VDUP V4.B[0], V4.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x1_64Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load 1 outputs + VLD1.P 32(R14), [V0.B16, V1.B16] + VLD1.P 32(R14), [V2.B16, V3.B16] + + // Load and process 64 bytes from input 0 to 1 outputs + VLD1.P 32(R1), [V12.B16, V13.B16] + VLD1.P 32(R1), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 1 to 1 outputs + VLD1.P 32(R4), [V12.B16, V13.B16] + VLD1.P 32(R4), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 2 to 1 outputs + VLD1.P 32(R5), [V12.B16, V13.B16] + VLD1.P 32(R5), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], 
V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 3 to 1 outputs + VLD1.P 32(R8), [V12.B16, V13.B16] + VLD1.P 32(R8), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 4 to 1 outputs + VLD1.P 32(R9), [V12.B16, V13.B16] + VLD1.P 32(R9), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 5 to 1 outputs + VLD1.P 32(R10), [V12.B16, V13.B16] + VLD1.P 32(R10), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], 
V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 6 to 1 outputs + VLD1.P 32(R11), [V12.B16, V13.B16] + VLD1.P 32(R11), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 7 to 1 outputs + VLD1.P 32(R12), [V12.B16, V13.B16] + VLD1.P 32(R12), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 8 to 1 outputs + VLD1.P 32(R13), [V12.B16, V13.B16] + VLD1.P 32(R13), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, 
V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x1_64Xor_store + + // Load and process 64 bytes from input 9 to 1 outputs + VLD1.P 32(R3), [V12.B16, V13.B16] + VLD1.P 32(R3), [V10.B16, V11.B16] + VUSHR $4, V12.B16, V14.B16 + VUSHR $4, V13.B16, V15.B16 + VUSHR $4, V10.B16, V16.B16 + VUSHR $4, V11.B16, V17.B16 + VAND V4.B16, V12.B16, V12.B16 + VAND V4.B16, V13.B16, V13.B16 + VAND V4.B16, V10.B16, V10.B16 + VAND V4.B16, V11.B16, V11.B16 + VAND V4.B16, V14.B16, V14.B16 + VAND V4.B16, V15.B16, V15.B16 + VAND V4.B16, V16.B16, V16.B16 + VAND V4.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V6.B16, V7.B16] + VLD1.P 32(R2), [V8.B16, V9.B16] + VTBL V10.B16, [V6.B16], V10.B16 + VTBL V11.B16, [V7.B16], V11.B16 + VTBL V12.B16, [V6.B16], V6.B16 + VTBL V13.B16, [V7.B16], V7.B16 + VTBL V16.B16, [V8.B16], V12.B16 + VTBL V17.B16, [V9.B16], V13.B16 + VTBL V14.B16, [V8.B16], V8.B16 + VTBL V15.B16, [V9.B16], V9.B16 + VEOR V6.B16, V0.B16, V0.B16 + VEOR V7.B16, V1.B16, V1.B16 + VEOR V8.B16, V0.B16, V0.B16 + VEOR V9.B16, V1.B16, V1.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + +mulNeon_10x1_64Xor_store: + // Store 1 outputs + SUB $64, R14 + VST1.P [V0.D2, V1.D2], 32(R14) + VST1.P [V2.D2, V3.D2], 32(R14) + + // Prepare for next loop + SUBS $1, R0 + BNE mulNeon_10x1_64Xor_loop + +mulNeon_10x1_64Xor_end: + RET + +// func mulNeon_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x2_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x2_64_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R15 + MOVD 24(R14), R14 + MOVD start+72(FP), R6 + + // Add start offset to output + ADD R6, R15 + ADD R6, R14 + + // Add start offset to input + ADD R6, R1 + ADD R6, R4 + ADD R6, R5 + ADD R6, R8 + ADD R6, R9 + ADD R6, R10 + ADD R6, R11 + ADD R6, R12 + ADD R6, R13 + ADD R6, R3 + MOVD $15, R6 + VMOV R6, V8.B[0] + VDUP V8.B[0], V8.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x2_64_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 64 bytes from input 0 to 2 outputs + VLD1.P 32(R1), [V18.B16, V19.B16] + VLD1.P 32(R1), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V0.B16 + VEOR V11.B16, V13.B16, V1.B16 + VEOR V14.B16, 
V16.B16, V2.B16 + VEOR V15.B16, V17.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V4.B16 + VEOR V11.B16, V13.B16, V5.B16 + VEOR V14.B16, V16.B16, V6.B16 + VEOR V15.B16, V17.B16, V7.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 1 to 2 outputs + VLD1.P 32(R4), [V18.B16, V19.B16] + VLD1.P 32(R4), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 2 to 2 outputs + VLD1.P 32(R5), [V18.B16, V19.B16] + VLD1.P 32(R5), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, 
V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 3 to 2 outputs + VLD1.P 32(R8), [V18.B16, V19.B16] + VLD1.P 32(R8), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 4 to 2 outputs + VLD1.P 32(R9), [V18.B16, V19.B16] + VLD1.P 32(R9), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), 
[V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 5 to 2 outputs + VLD1.P 32(R10), [V18.B16, V19.B16] + VLD1.P 32(R10), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 6 to 2 outputs + VLD1.P 32(R11), [V18.B16, V19.B16] + VLD1.P 32(R11), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 
32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 7 to 2 outputs + VLD1.P 32(R12), [V18.B16, V19.B16] + VLD1.P 32(R12), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 8 to 2 outputs + VLD1.P 32(R13), [V18.B16, V19.B16] + VLD1.P 32(R13), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + 
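+	// Tables for the second output: V10/V11 hold the low-nibble lookups,
+	// V12/V13 (loaded next) the matching high-nibble lookups.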
VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x2_64_store + + // Load and process 64 bytes from input 9 to 2 outputs + VLD1.P 32(R3), [V18.B16, V19.B16] + VLD1.P 32(R3), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + +mulNeon_10x2_64_store: + // Store 2 outputs + VST1.P [V0.D2, V1.D2], 32(R15) + VST1.P [V2.D2, V3.D2], 32(R15) + VST1.P [V4.D2, V5.D2], 32(R14) + VST1.P [V6.D2, V7.D2], 32(R14) + + // Prepare for next loop + SUBS $1, R0 + BNE mulNeon_10x2_64_loop + +mulNeon_10x2_64_end: + RET + +// func mulNeon_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x2_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x2_64Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD (R14), R15 + MOVD 24(R14), R14 + MOVD start+72(FP), R6 + + // Add start offset to output + ADD R6, R15 + ADD R6, R14 + + // Add start offset to input + ADD R6, R1 + ADD R6, R4 + ADD R6, R5 + ADD R6, R8 + ADD R6, R9 + ADD R6, R10 + ADD R6, R11 + ADD R6, R12 + ADD R6, R13 + ADD R6, R3 + MOVD $15, R6 + VMOV R6, V8.B[0] + VDUP V8.B[0], V8.B16 + + // 
Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x2_64Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load 2 outputs + VLD1.P 32(R15), [V0.B16, V1.B16] + VLD1.P 32(R15), [V2.B16, V3.B16] + VLD1.P 32(R14), [V4.B16, V5.B16] + VLD1.P 32(R14), [V6.B16, V7.B16] + + // Load and process 64 bytes from input 0 to 2 outputs + VLD1.P 32(R1), [V18.B16, V19.B16] + VLD1.P 32(R1), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 1 to 2 outputs + VLD1.P 32(R4), [V18.B16, V19.B16] + VLD1.P 32(R4), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, 
V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 2 to 2 outputs + VLD1.P 32(R5), [V18.B16, V19.B16] + VLD1.P 32(R5), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 3 to 2 outputs + VLD1.P 32(R8), [V18.B16, V19.B16] + VLD1.P 32(R8), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR 
V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 4 to 2 outputs + VLD1.P 32(R9), [V18.B16, V19.B16] + VLD1.P 32(R9), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 5 to 2 outputs + VLD1.P 32(R10), [V18.B16, V19.B16] + VLD1.P 32(R10), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, 
V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 6 to 2 outputs + VLD1.P 32(R11), [V18.B16, V19.B16] + VLD1.P 32(R11), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 7 to 2 outputs + VLD1.P 32(R12), [V18.B16, V19.B16] + VLD1.P 32(R12), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR 
V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 8 to 2 outputs + VLD1.P 32(R13), [V18.B16, V19.B16] + VLD1.P 32(R13), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x2_64Xor_store + + // Load and process 64 bytes from input 9 to 2 outputs + VLD1.P 32(R3), [V18.B16, V19.B16] + VLD1.P 32(R3), [V22.B16, V23.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V8.B16, V18.B16, V18.B16 + VAND V8.B16, V19.B16, V19.B16 + VAND V8.B16, V22.B16, V22.B16 + VAND V8.B16, V23.B16, V23.B16 + VAND V8.B16, V20.B16, V20.B16 + VAND V8.B16, V21.B16, V21.B16 + VAND V8.B16, V24.B16, V24.B16 + VAND V8.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V22.B16, [V10.B16], V14.B16 + VTBL V23.B16, [V11.B16], V15.B16 + VTBL V18.B16, [V10.B16], V10.B16 + VTBL V19.B16, [V11.B16], V11.B16 + VTBL V24.B16, [V12.B16], V16.B16 + VTBL V25.B16, [V13.B16], V17.B16 + VTBL V20.B16, [V12.B16], V12.B16 + VTBL V21.B16, [V13.B16], 
V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + +mulNeon_10x2_64Xor_store: + // Store 2 outputs + SUB $64, R15 + VST1.P [V0.D2, V1.D2], 32(R15) + VST1.P [V2.D2, V3.D2], 32(R15) + SUB $64, R14 + VST1.P [V4.D2, V5.D2], 32(R14) + VST1.P [V6.D2, V7.D2], 32(R14) + + // Prepare for next loop + SUBS $1, R0 + BNE mulNeon_10x2_64Xor_loop + +mulNeon_10x2_64Xor_end: + RET + +// func mulNeon_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x3_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x3_64_end + MOVD in_base+24(FP), R0 + MOVD (R0), R3 + MOVD 24(R0), R1 + MOVD 48(R0), R4 + MOVD 72(R0), R5 + MOVD 96(R0), R8 + MOVD 120(R0), R9 + MOVD 144(R0), R10 + MOVD 168(R0), R11 + MOVD 192(R0), R12 + MOVD 216(R0), R0 + MOVD out_base+48(FP), R13 + MOVD (R13), R14 + MOVD 24(R13), R15 + MOVD 48(R13), R13 + MOVD start+72(FP), R6 + + // Add start offset to output + ADD R6, R14 + ADD R6, R15 + ADD R6, R13 + + // Add start offset to input + ADD R6, R3 + ADD R6, R1 + ADD R6, R4 + ADD R6, R5 + ADD R6, R8 + ADD R6, R9 + ADD R6, R10 + ADD R6, R11 + ADD R6, R12 + ADD R6, R0 + MOVD $15, R6 + VMOV R6, V12.B[0] + VDUP V12.B[0], V12.B16 + + // Reload length to save a register + MOVD n+80(FP), R6 + LSR $6, R6 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x3_64_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 64 bytes from input 0 to 3 outputs + VLD1.P 32(R3), [V22.B16, V23.B16] + VLD1.P 32(R3), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V0.B16 + VEOR V15.B16, V17.B16, V1.B16 + VEOR V18.B16, V20.B16, V2.B16 + VEOR V19.B16, V21.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V4.B16 + VEOR V15.B16, V17.B16, V5.B16 + VEOR V18.B16, V20.B16, V6.B16 + VEOR V19.B16, V21.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, 
V16.B16, V8.B16 + VEOR V15.B16, V17.B16, V9.B16 + VEOR V18.B16, V20.B16, V10.B16 + VEOR V19.B16, V21.B16, V11.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 1 to 3 outputs + VLD1.P 32(R1), [V22.B16, V23.B16] + VLD1.P 32(R1), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 2 to 3 outputs + VLD1.P 32(R4), [V22.B16, V23.B16] + VLD1.P 32(R4), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, 
V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 3 to 3 outputs + VLD1.P 32(R5), [V22.B16, V23.B16] + VLD1.P 32(R5), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR 
V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 4 to 3 outputs + VLD1.P 32(R8), [V22.B16, V23.B16] + VLD1.P 32(R8), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 5 to 3 outputs + VLD1.P 32(R9), [V22.B16, V23.B16] + VLD1.P 32(R9), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + 
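+ // Fold input 5's low- and high-nibble partial products into the output 0 accumulators (V0-V3)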
VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 6 to 3 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], 
V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 7 to 3 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 8 to 3 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, 
[V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x3_64_store + + // Load and process 64 bytes from input 9 to 3 outputs + VLD1.P 32(R0), [V22.B16, V23.B16] + VLD1.P 32(R0), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL 
V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + +mulNeon_10x3_64_store: + // Store 3 outputs + VST1.P [V0.D2, V1.D2], 32(R14) + VST1.P [V2.D2, V3.D2], 32(R14) + VST1.P [V4.D2, V5.D2], 32(R15) + VST1.P [V6.D2, V7.D2], 32(R15) + VST1.P [V8.D2, V9.D2], 32(R13) + VST1.P [V10.D2, V11.D2], 32(R13) + + // Prepare for next loop + SUBS $1, R6 + BNE mulNeon_10x3_64_loop + +mulNeon_10x3_64_end: + RET + +// func mulNeon_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x3_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVD n+80(FP), R0 + LSR $6, R0 + TST R0, R0 + BEQ mulNeon_10x3_64Xor_end + MOVD in_base+24(FP), R0 + MOVD (R0), R3 + MOVD 24(R0), R1 + MOVD 48(R0), R4 + MOVD 72(R0), R5 + MOVD 96(R0), R8 + MOVD 120(R0), R9 + MOVD 144(R0), R10 + MOVD 168(R0), R11 + MOVD 192(R0), R12 + MOVD 216(R0), R0 + MOVD out_base+48(FP), R13 + MOVD (R13), R14 + MOVD 24(R13), R15 + MOVD 48(R13), R13 + MOVD start+72(FP), R6 + + // Add start offset to output + ADD R6, R14 + ADD R6, R15 + ADD R6, R13 + + // Add start offset to input + ADD R6, R3 + ADD R6, R1 + ADD R6, R4 + ADD R6, R5 + ADD R6, R8 + ADD R6, R9 + ADD R6, R10 + ADD R6, R11 + ADD R6, R12 + ADD R6, R0 + MOVD $15, R6 + VMOV R6, V12.B[0] + VDUP V12.B[0], V12.B16 + + // Reload length to save a register + MOVD n+80(FP), R6 + LSR $6, R6 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x3_64Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load 3 outputs + VLD1.P 32(R14), [V0.B16, V1.B16] + VLD1.P 32(R14), [V2.B16, V3.B16] + VLD1.P 32(R15), [V4.B16, V5.B16] + VLD1.P 32(R15), [V6.B16, V7.B16] + VLD1.P 32(R13), [V8.B16, V9.B16] + VLD1.P 32(R13), [V10.B16, V11.B16] + + // Load and process 64 bytes from input 0 to 3 outputs + VLD1.P 32(R3), [V22.B16, V23.B16] + VLD1.P 32(R3), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + 
VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 1 to 3 outputs + VLD1.P 32(R1), [V22.B16, V23.B16] + VLD1.P 32(R1), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 2 to 3 outputs + VLD1.P 32(R4), [V22.B16, V23.B16] + VLD1.P 32(R4), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR 
$4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 3 to 3 outputs + VLD1.P 32(R5), [V22.B16, V23.B16] + VLD1.P 32(R5), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, 
[V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 4 to 3 outputs + VLD1.P 32(R8), [V22.B16, V23.B16] + VLD1.P 32(R8), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 5 to 3 outputs + VLD1.P 32(R9), [V22.B16, V23.B16] + VLD1.P 32(R9), [V26.B16, V27.B16] + 
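+ // Split the 64 input bytes into low and high nibbles (V12 holds the 0x0f mask)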
VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 6 to 3 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 
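+ // Each VTBL performs a 16-entry table lookup, yielding one nibble's GF(2^8) partial product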
+ VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 7 to 3 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 8 
to 3 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x3_64Xor_store + + // Load and process 64 bytes from input 9 to 3 outputs + VLD1.P 32(R0), [V22.B16, V23.B16] + VLD1.P 32(R0), [V26.B16, V27.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V12.B16, V22.B16, V22.B16 + VAND V12.B16, V23.B16, V23.B16 + VAND V12.B16, V26.B16, V26.B16 + VAND V12.B16, V27.B16, V27.B16 + VAND V12.B16, V24.B16, V24.B16 + VAND V12.B16, V25.B16, V25.B16 + VAND V12.B16, V28.B16, V28.B16 + VAND V12.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, 
[V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V26.B16, [V14.B16], V18.B16 + VTBL V27.B16, [V15.B16], V19.B16 + VTBL V22.B16, [V14.B16], V14.B16 + VTBL V23.B16, [V15.B16], V15.B16 + VTBL V28.B16, [V16.B16], V20.B16 + VTBL V29.B16, [V17.B16], V21.B16 + VTBL V24.B16, [V16.B16], V16.B16 + VTBL V25.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + +mulNeon_10x3_64Xor_store: + // Store 3 outputs + SUB $64, R14 + VST1.P [V0.D2, V1.D2], 32(R14) + VST1.P [V2.D2, V3.D2], 32(R14) + SUB $64, R15 + VST1.P [V4.D2, V5.D2], 32(R15) + VST1.P [V6.D2, V7.D2], 32(R15) + SUB $64, R13 + VST1.P [V8.D2, V9.D2], 32(R13) + VST1.P [V10.D2, V11.D2], 32(R13) + + // Prepare for next loop + SUBS $1, R6 + BNE mulNeon_10x3_64Xor_loop + +mulNeon_10x3_64Xor_end: + RET + +// func mulNeon_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x4(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 89 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x4_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V8.B[0] + VDUP V8.B[0], V8.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x4_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 4 outputs + VLD1.P 32(R1), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V0.B16 + VEOR V11.B16, V13.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V2.B16 + VEOR V11.B16, V13.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V4.B16 + VEOR 
V11.B16, V13.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V12.B16, V6.B16 + VEOR V11.B16, V13.B16, V7.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 1 to 4 outputs + VLD1.P 32(R4), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 2 to 4 outputs + VLD1.P 32(R5), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, 
[V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 3 to 4 outputs + VLD1.P 32(R8), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 4 to 4 outputs + VLD1.P 32(R9), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for 
early termination + CMP $5, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 5 to 4 outputs + VLD1.P 32(R10), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 6 to 4 outputs + VLD1.P 32(R11), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 7 to 4 outputs + VLD1.P 32(R12), [V14.B16, V15.B16] + 
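+	// Editor's note (added comment, not generator output): the VUSHR/VAND
+	// pair below is the nibble split used for every input block in this
+	// file. VUSHR $4 shifts every byte right by four, moving the high
+	// nibble into a second register pair, while VAND against V8 (filled
+	// with 0x0F in the prologue) keeps the low nibble, so each input byte
+	// b yields the two 4-bit table indexes b>>4 and b&15 consumed by the
+	// VTBL lookups that follow.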
VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 8 to 4 outputs + VLD1.P 32(R13), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x4_store + + // Load and process 32 bytes from input 9 to 4 outputs + VLD1.P 32(R3), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND 
V8.B16, V17.B16, V17.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V14.B16, [V10.B16], V10.B16
+	VTBL V15.B16, [V11.B16], V11.B16
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V14.B16, [V10.B16], V10.B16
+	VTBL V15.B16, [V11.B16], V11.B16
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V2.B16, V2.B16
+	VEOR V11.B16, V3.B16, V3.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V14.B16, [V10.B16], V10.B16
+	VTBL V15.B16, [V11.B16], V11.B16
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V4.B16, V4.B16
+	VEOR V11.B16, V5.B16, V5.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V14.B16, [V10.B16], V10.B16
+	VTBL V15.B16, [V11.B16], V11.B16
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V6.B16, V6.B16
+	VEOR V11.B16, V7.B16, V7.B16
+	VEOR V12.B16, V6.B16, V6.B16
+	VEOR V13.B16, V7.B16, V7.B16
+
+mulNeon_10x4_store:
+	// Store 4 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x4_loop
+
+mulNeon_10x4_end:
+	RET
+
+// func mulNeon_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x4Xor(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 89 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x4Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V8.B[0]
+	VDUP V8.B[0], V8.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x4Xor_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 4 outputs
+	VLD1.P 32(R1), [V14.B16, V15.B16]
+	VUSHR $4, V14.B16, V16.B16
+	VUSHR $4, V15.B16, V17.B16
+	VAND V8.B16, V14.B16, V14.B16
+	VAND V8.B16, V15.B16, V15.B16
+	VAND V8.B16, V16.B16, V16.B16
+	VAND V8.B16, V17.B16, V17.B16
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V0.B16, V1.B16]
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VTBL V14.B16, [V10.B16], V10.B16
+	VTBL V15.B16, [V11.B16], V11.B16
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VEOR V10.B16, V0.B16, V0.B16
+	VEOR V11.B16, V1.B16, V1.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V2.B16, V3.B16]
+	VLD1.P 32(R2), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+
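+	// Editor's note (added comment, not generator output): the Xor
+	// variants differ from the plain mulNeon_10x4 kernel only in this
+	// step: each output pair (V0/V1, V2/V3, ...) is first loaded from the
+	// existing output shard through the out_base pointers in R14
+	// (MOVD/ADD/VLD1 above), so the nibble-table products are
+	// XOR-accumulated onto prior contents instead of overwriting them,
+	// which is what allows outputs to be built up incrementally.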
VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 1 to 4 outputs + VLD1.P 32(R4), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 2 to 4 outputs + VLD1.P 32(R5), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, 
[V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 3 to 4 outputs + VLD1.P 32(R8), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 4 to 4 outputs + VLD1.P 32(R9), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 
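+	// Editor's note (added comment, not generator output): every output
+	// register receives two VEORs per input block, one with the
+	// low-nibble products (V10/V11) and one with the high-nibble products
+	// (V12/V13), because multiplication by a matrix constant in GF(2^8)
+	// is computed as tblLo[b&15] ^ tblHi[b>>4], with the two 16-byte
+	// tables streamed from matrix_base via R2. A scalar Go sketch of one
+	// 16-byte lookup step (hypothetical names, illustration only):
+	//
+	//	for i := 0; i < 16; i++ {
+	//		out[i] ^= tblLo[in[i]&0x0F] ^ tblHi[in[i]>>4]
+	//	}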
+ VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 5 to 4 outputs + VLD1.P 32(R10), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 6 to 4 outputs + VLD1.P 32(R11), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), 
[V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 7 to 4 outputs + VLD1.P 32(R12), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 8 to 4 outputs + VLD1.P 32(R13), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], 
V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x4Xor_store + + // Load and process 32 bytes from input 9 to 4 outputs + VLD1.P 32(R3), [V14.B16, V15.B16] + VUSHR $4, V14.B16, V16.B16 + VUSHR $4, V15.B16, V17.B16 + VAND V8.B16, V14.B16, V14.B16 + VAND V8.B16, V15.B16, V15.B16 + VAND V8.B16, V16.B16, V16.B16 + VAND V8.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V0.B16, V0.B16 + VEOR V11.B16, V1.B16, V1.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V2.B16, V2.B16 + VEOR V11.B16, V3.B16, V3.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V4.B16, V4.B16 + VEOR V11.B16, V5.B16, V5.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V10.B16, V11.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VTBL V14.B16, [V10.B16], V10.B16 + VTBL V15.B16, [V11.B16], V11.B16 + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VEOR V10.B16, V6.B16, V6.B16 + VEOR V11.B16, V7.B16, V7.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + +mulNeon_10x4Xor_store: + // Store 4 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x4Xor_loop + +mulNeon_10x4Xor_end: + RET + +// func mulNeon_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x5(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x5_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V10.B[0] + VDUP V10.B[0], V10.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x5_loop: + MOVD matrix_base+0(FP), R2 + // Load and 
process 32 bytes from input 0 to 5 outputs + VLD1.P 32(R1), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V14.B16, V0.B16 + VEOR V13.B16, V15.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V14.B16, V2.B16 + VEOR V13.B16, V15.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V14.B16, V4.B16 + VEOR V13.B16, V15.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V14.B16, V6.B16 + VEOR V13.B16, V15.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V14.B16, V8.B16 + VEOR V13.B16, V15.B16, V9.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 1 to 5 outputs + VLD1.P 32(R4), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], 
V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 2 to 5 outputs + VLD1.P 32(R5), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 3 to 5 outputs + VLD1.P 32(R8), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), 
[V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 4 to 5 outputs + VLD1.P 32(R9), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 5 to 5 outputs + VLD1.P 32(R10), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], 
V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 6 to 5 outputs + VLD1.P 32(R11), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 7 to 5 outputs + VLD1.P 32(R12), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, 
V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x5_store + + // Load and process 32 bytes from input 8 to 5 outputs + VLD1.P 32(R13), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL 
V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V8.B16, V8.B16
+	VEOR V13.B16, V9.B16, V9.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x5_store
+
+	// Load and process 32 bytes from input 9 to 5 outputs
+	VLD1.P 32(R3), [V16.B16, V17.B16]
+	VUSHR $4, V16.B16, V18.B16
+	VUSHR $4, V17.B16, V19.B16
+	VAND V10.B16, V16.B16, V16.B16
+	VAND V10.B16, V17.B16, V17.B16
+	VAND V10.B16, V18.B16, V18.B16
+	VAND V10.B16, V19.B16, V19.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V0.B16, V0.B16
+	VEOR V13.B16, V1.B16, V1.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V2.B16, V2.B16
+	VEOR V13.B16, V3.B16, V3.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V4.B16, V4.B16
+	VEOR V13.B16, V5.B16, V5.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V6.B16, V6.B16
+	VEOR V13.B16, V7.B16, V7.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VTBL V16.B16, [V12.B16], V12.B16
+	VTBL V17.B16, [V13.B16], V13.B16
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VEOR V12.B16, V8.B16, V8.B16
+	VEOR V13.B16, V9.B16, V9.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+
+mulNeon_10x5_store:
+	// Store 5 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x5_loop
+
+mulNeon_10x5_end:
+	RET
+
+// func mulNeon_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x5Xor(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 110 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x5Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V10.B[0]
+	VDUP V10.B[0], V10.B16
+
+	// Load
number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x5Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 5 outputs + VLD1.P 32(R1), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 1 to 5 outputs + VLD1.P 32(R4), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, 
[V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 2 to 5 outputs + VLD1.P 32(R5), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 3 to 5 outputs + VLD1.P 32(R8), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, 
V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 4 to 5 outputs + VLD1.P 32(R9), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 
bytes from input 5 to 5 outputs + VLD1.P 32(R10), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 6 to 5 outputs + VLD1.P 32(R11), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + 
VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 7 to 5 outputs + VLD1.P 32(R12), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 8 to 5 outputs + VLD1.P 32(R13), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), 
[V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x5Xor_store + + // Load and process 32 bytes from input 9 to 5 outputs + VLD1.P 32(R3), [V16.B16, V17.B16] + VUSHR $4, V16.B16, V18.B16 + VUSHR $4, V17.B16, V19.B16 + VAND V10.B16, V16.B16, V16.B16 + VAND V10.B16, V17.B16, V17.B16 + VAND V10.B16, V18.B16, V18.B16 + VAND V10.B16, V19.B16, V19.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V0.B16, V0.B16 + VEOR V13.B16, V1.B16, V1.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V2.B16, V2.B16 + VEOR V13.B16, V3.B16, V3.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V4.B16, V4.B16 + VEOR V13.B16, V5.B16, V5.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V6.B16, V6.B16 + VEOR V13.B16, V7.B16, V7.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V12.B16, V13.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VTBL V16.B16, [V12.B16], V12.B16 + VTBL V17.B16, [V13.B16], V13.B16 + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VEOR V12.B16, V8.B16, V8.B16 + VEOR V13.B16, V9.B16, V9.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + +mulNeon_10x5Xor_store: + // Store 5 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x5Xor_loop + +mulNeon_10x5Xor_end: + RET + +// func mulNeon_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n 
int) +// Requires: NEON +TEXT ·mulNeon_10x6(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x6_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V12.B[0] + VDUP V12.B[0], V12.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x6_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 6 outputs + VLD1.P 32(R1), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V0.B16 + VEOR V15.B16, V17.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V2.B16 + VEOR V15.B16, V17.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V4.B16 + VEOR V15.B16, V17.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V6.B16 + VEOR V15.B16, V17.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V8.B16 + VEOR V15.B16, V17.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V16.B16, V10.B16 + VEOR V15.B16, V17.B16, V11.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 1 to 6 outputs + VLD1.P 32(R4), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 
32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 2 to 6 outputs + VLD1.P 32(R5), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], 
V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 3 to 6 outputs + VLD1.P 32(R8), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 4 to 6 outputs + VLD1.P 32(R9), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + 
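+    // Next output pair: R2 advances through two more post-incremented 32-byte table loads (V14/V15 hold the low-nibble tables, V16/V17 the high-nibble tables)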
VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 5 to 6 outputs + VLD1.P 32(R10), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, 
[V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 6 to 6 outputs + VLD1.P 32(R11), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 7 to 6 outputs + VLD1.P 32(R12), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, 
V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 8 to 6 outputs + VLD1.P 32(R13), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL 
V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x6_store + + // Load and process 32 bytes from input 9 to 6 outputs + VLD1.P 32(R3), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + +mulNeon_10x6_store: + // Store 6 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x6_loop + +mulNeon_10x6_end: + RET + +// func mulNeon_10x6Xor(matrix []byte, in [][]byte, out [][]byte, 
start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x6Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x6Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V12.B[0] + VDUP V12.B[0], V12.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x6Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 6 outputs + VLD1.P 32(R1), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, 
V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 1 to 6 outputs + VLD1.P 32(R4), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 2 to 6 outputs + VLD1.P 32(R5), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, 
[V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 3 to 6 outputs + VLD1.P 32(R8), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + 
+    VEOR V15.B16, V11.B16, V11.B16
+    VEOR V16.B16, V10.B16, V10.B16
+    VEOR V17.B16, V11.B16, V11.B16
+    // Check for early termination
+    CMP $4, R16
+    BEQ mulNeon_10x6Xor_store
+
+    // Load and process 32 bytes from input 4 to 6 outputs
+    VLD1.P 32(R9), [V18.B16, V19.B16]
+    VUSHR $4, V18.B16, V20.B16
+    VUSHR $4, V19.B16, V21.B16
+    VAND V12.B16, V18.B16, V18.B16
+    VAND V12.B16, V19.B16, V19.B16
+    VAND V12.B16, V20.B16, V20.B16
+    VAND V12.B16, V21.B16, V21.B16
+    VLD1.P 32(R2), [V14.B16, V15.B16]
+    VLD1.P 32(R2), [V16.B16, V17.B16]
+    VTBL V18.B16, [V14.B16], V14.B16
+    VTBL V19.B16, [V15.B16], V15.B16
+    VTBL V20.B16, [V16.B16], V16.B16
+    VTBL V21.B16, [V17.B16], V17.B16
+    VEOR V14.B16, V0.B16, V0.B16
+    VEOR V15.B16, V1.B16, V1.B16
+    VEOR V16.B16, V0.B16, V0.B16
+    VEOR V17.B16, V1.B16, V1.B16
+    VLD1.P 32(R2), [V14.B16, V15.B16]
+    VLD1.P 32(R2), [V16.B16, V17.B16]
+    VTBL V18.B16, [V14.B16], V14.B16
+    VTBL V19.B16, [V15.B16], V15.B16
+    VTBL V20.B16, [V16.B16], V16.B16
+    VTBL V21.B16, [V17.B16], V17.B16
+    VEOR V14.B16, V2.B16, V2.B16
+    VEOR V15.B16, V3.B16, V3.B16
+    VEOR V16.B16, V2.B16, V2.B16
+    VEOR V17.B16, V3.B16, V3.B16
+    VLD1.P 32(R2), [V14.B16, V15.B16]
+    VLD1.P 32(R2), [V16.B16, V17.B16]
+    VTBL V18.B16, [V14.B16], V14.B16
+    VTBL V19.B16, [V15.B16], V15.B16
+    VTBL V20.B16, [V16.B16], V16.B16
+    VTBL V21.B16, [V17.B16], V17.B16
+    VEOR V14.B16, V4.B16, V4.B16
+    VEOR V15.B16, V5.B16, V5.B16
+    VEOR V16.B16, V4.B16, V4.B16
+    VEOR V17.B16, V5.B16, V5.B16
+    VLD1.P 32(R2), [V14.B16, V15.B16]
+    VLD1.P 32(R2), [V16.B16, V17.B16]
+    VTBL V18.B16, [V14.B16], V14.B16
+    VTBL V19.B16, [V15.B16], V15.B16
+    VTBL V20.B16, [V16.B16], V16.B16
+    VTBL V21.B16, [V17.B16], V17.B16
+    VEOR V14.B16, V6.B16, V6.B16
+    VEOR V15.B16, V7.B16, V7.B16
+    VEOR V16.B16, V6.B16, V6.B16
+    VEOR V17.B16, V7.B16, V7.B16
+    VLD1.P 32(R2), [V14.B16, V15.B16]
+    VLD1.P 32(R2), [V16.B16, V17.B16]
+    VTBL V18.B16, [V14.B16], V14.B16
+    VTBL V19.B16, [V15.B16], V15.B16
+    VTBL V20.B16, [V16.B16], V16.B16
+    VTBL V21.B16, [V17.B16], V17.B16
+    VEOR V14.B16, V8.B16, V8.B16
+    VEOR V15.B16, V9.B16, V9.B16
+    VEOR V16.B16, V8.B16, V8.B16
+    VEOR V17.B16, V9.B16, V9.B16
+    VLD1.P 32(R2), [V14.B16, V15.B16]
+    VLD1.P 32(R2), [V16.B16, V17.B16]
+    VTBL V18.B16, [V14.B16], V14.B16
+    VTBL V19.B16, [V15.B16], V15.B16
+    VTBL V20.B16, [V16.B16], V16.B16
+    VTBL V21.B16, [V17.B16], V17.B16
+    VEOR V14.B16, V10.B16, V10.B16
+    VEOR V15.B16, V11.B16, V11.B16
+    VEOR V16.B16, V10.B16, V10.B16
+    VEOR V17.B16, V11.B16, V11.B16
+    // Check for early termination
+    CMP $5, R16
+    BEQ mulNeon_10x6Xor_store
+
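+    // Per-input pattern: VAND with the $15 mask in V12 keeps the low nibbles,
+    // VUSHR $4 extracts the high nibbles, VTBL maps each nibble through a
+    // 16-byte lookup table streamed from the matrix via R2, and VEOR folds
+    // both partial products into the six output accumulators V0-V11.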
VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 6 to 6 outputs + VLD1.P 32(R11), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, 
[V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 7 to 6 outputs + VLD1.P 32(R12), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V4.B16, V4.B16 + VEOR V15.B16, V5.B16, V5.B16 + VEOR V16.B16, V4.B16, V4.B16 + VEOR V17.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V6.B16, V6.B16 + VEOR V15.B16, V7.B16, V7.B16 + VEOR V16.B16, V6.B16, V6.B16 + VEOR V17.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V8.B16, V8.B16 + VEOR V15.B16, V9.B16, V9.B16 + VEOR V16.B16, V8.B16, V8.B16 + VEOR V17.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V10.B16, V10.B16 + VEOR V15.B16, V11.B16, V11.B16 + VEOR V16.B16, V10.B16, V10.B16 + VEOR V17.B16, V11.B16, V11.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x6Xor_store + + // Load and process 32 bytes from input 8 to 6 outputs + VLD1.P 32(R13), [V18.B16, V19.B16] + VUSHR $4, V18.B16, V20.B16 + VUSHR $4, V19.B16, V21.B16 + VAND V12.B16, V18.B16, V18.B16 + VAND V12.B16, V19.B16, V19.B16 + VAND V12.B16, V20.B16, V20.B16 + VAND V12.B16, V21.B16, V21.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V0.B16, V0.B16 + VEOR V15.B16, V1.B16, V1.B16 + VEOR V16.B16, V0.B16, V0.B16 + VEOR V17.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V14.B16, V15.B16] + VLD1.P 32(R2), [V16.B16, V17.B16] + VTBL V18.B16, [V14.B16], V14.B16 + VTBL V19.B16, [V15.B16], V15.B16 + VTBL V20.B16, [V16.B16], V16.B16 + VTBL V21.B16, [V17.B16], V17.B16 + VEOR V14.B16, V2.B16, V2.B16 + VEOR V15.B16, V3.B16, V3.B16 + VEOR V16.B16, V2.B16, V2.B16 + VEOR V17.B16, 
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x6Xor_store
+
+	// Load and process 32 bytes from input 9 to 6 outputs
+	VLD1.P 32(R3), [V18.B16, V19.B16]
+	VUSHR $4, V18.B16, V20.B16
+	VUSHR $4, V19.B16, V21.B16
+	VAND V12.B16, V18.B16, V18.B16
+	VAND V12.B16, V19.B16, V19.B16
+	VAND V12.B16, V20.B16, V20.B16
+	VAND V12.B16, V21.B16, V21.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V0.B16, V0.B16
+	VEOR V15.B16, V1.B16, V1.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V2.B16, V2.B16
+	VEOR V15.B16, V3.B16, V3.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V4.B16, V4.B16
+	VEOR V15.B16, V5.B16, V5.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V6.B16, V6.B16
+	VEOR V15.B16, V7.B16, V7.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V8.B16, V8.B16
+	VEOR V15.B16, V9.B16, V9.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V14.B16, V15.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VTBL V18.B16, [V14.B16], V14.B16
+	VTBL V19.B16, [V15.B16], V15.B16
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VEOR V14.B16, V10.B16, V10.B16
+	VEOR V15.B16, V11.B16, V11.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+
+mulNeon_10x6Xor_store:
+	// Store 6 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x6Xor_loop
+
+mulNeon_10x6Xor_end:
+	RET
+
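+// Editorial note (annotation, not part of the generated output): each
+// mulNeon_* kernel below implements GF(2^8) multiply-and-accumulate with
+// the same 4-bit table-lookup scheme as the AVX2 code it was generated
+// from. A 32-byte chunk of the input shard is split into low nibbles
+// (VAND with the 0x0f mask splatted into a vector register) and high
+// nibbles (VUSHR $4), two 16-byte lookup tables per output are streamed
+// from the flattened matrix via VLD1.P 32(R2), VTBL performs the
+// per-nibble partial-product lookups, and VEOR folds both partial
+// products into the output accumulators.
+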
+// func mulNeon_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x7(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 152 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x7_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V14.B[0]
+	VDUP V14.B[0], V14.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x7_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 7 outputs
+	VLD1.P 32(R1), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V0.B16
+	VEOR V17.B16, V19.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V2.B16
+	VEOR V17.B16, V19.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V4.B16
+	VEOR V17.B16, V19.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V6.B16
+	VEOR V17.B16, V19.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V8.B16
+	VEOR V17.B16, V19.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V10.B16
+	VEOR V17.B16, V19.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V18.B16, V12.B16
+	VEOR V17.B16, V19.B16, V13.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 1 to 7 outputs
+	VLD1.P 32(R4), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 2 to 7 outputs
+	VLD1.P 32(R5), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 3 to 7 outputs
+	VLD1.P 32(R8), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 4 to 7 outputs
+	VLD1.P 32(R9), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 5 to 7 outputs
+	VLD1.P 32(R10), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 6 to 7 outputs
+	VLD1.P 32(R11), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 7 to 7 outputs
+	VLD1.P 32(R12), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 8 to 7 outputs
+	VLD1.P 32(R13), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x7_store
+
+	// Load and process 32 bytes from input 9 to 7 outputs
+	VLD1.P 32(R3), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+
+mulNeon_10x7_store:
+	// Store 7 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VST1 [V12.D2, V13.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x7_loop
+
+mulNeon_10x7_end:
+	RET
+
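+// Editorial note (annotation, not part of the generated output): the Xor
+// variants differ from the plain kernels only in how the output
+// accumulators are seeded. While processing the first input they load the
+// current contents of each output shard (the MOVD/ADD/VLD1 sequences at
+// the top of the loop) and XOR the new products into them, so results can
+// be accumulated over multiple passes instead of overwriting the
+// destination.
+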
+// func mulNeon_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x7Xor(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 152 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x7Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V14.B[0]
+	VDUP V14.B[0], V14.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x7Xor_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 7 outputs
+	VLD1.P 32(R1), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V0.B16, V1.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V2.B16, V3.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V4.B16, V5.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V6.B16, V7.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V8.B16, V9.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V10.B16, V11.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V12.B16, V13.B16]
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x7Xor_store
+
+	// Load and process 32 bytes from input 1 to 7 outputs
+	VLD1.P 32(R4), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $2, R16
+	BEQ mulNeon_10x7Xor_store
+
+	// Load and process 32 bytes from input 2 to 7 outputs
+	VLD1.P 32(R5), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $3, R16
+	BEQ mulNeon_10x7Xor_store
+
+	// Load and process 32 bytes from input 3 to 7 outputs
+	VLD1.P 32(R8), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $4, R16
+	BEQ mulNeon_10x7Xor_store
+
+	// Load and process 32 bytes from input 4 to 7 outputs
+	VLD1.P 32(R9), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $5, R16
+	BEQ mulNeon_10x7Xor_store
+
+	// Load and process 32 bytes from input 5 to 7 outputs
+	VLD1.P 32(R10), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $6, R16
+	BEQ mulNeon_10x7Xor_store
+
+	// Load and process 32 bytes from input 6 to 7 outputs
+	VLD1.P 32(R11), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $7, R16
+	BEQ mulNeon_10x7Xor_store
+
+	// Load and process 32 bytes from input 7 to 7 outputs
+	VLD1.P 32(R12), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $8, R16
+	BEQ mulNeon_10x7Xor_store
+
+	// Load and process 32 bytes from input 8 to 7 outputs
+	VLD1.P 32(R13), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	// Check for early termination
+	CMP $9, R16
+	BEQ mulNeon_10x7Xor_store
+
+	// Load and process 32 bytes from input 9 to 7 outputs
+	VLD1.P 32(R3), [V20.B16, V21.B16]
+	VUSHR $4, V20.B16, V22.B16
+	VUSHR $4, V21.B16, V23.B16
+	VAND V14.B16, V20.B16, V20.B16
+	VAND V14.B16, V21.B16, V21.B16
+	VAND V14.B16, V22.B16, V22.B16
+	VAND V14.B16, V23.B16, V23.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V0.B16, V0.B16
+	VEOR V17.B16, V1.B16, V1.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V2.B16, V2.B16
+	VEOR V17.B16, V3.B16, V3.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V4.B16, V4.B16
+	VEOR V17.B16, V5.B16, V5.B16
+	VEOR V18.B16, V4.B16, V4.B16
+	VEOR V19.B16, V5.B16, V5.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V6.B16, V6.B16
+	VEOR V17.B16, V7.B16, V7.B16
+	VEOR V18.B16, V6.B16, V6.B16
+	VEOR V19.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V8.B16, V8.B16
+	VEOR V17.B16, V9.B16, V9.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V10.B16, V10.B16
+	VEOR V17.B16, V11.B16, V11.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V16.B16, V17.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VTBL V20.B16, [V16.B16], V16.B16
+	VTBL V21.B16, [V17.B16], V17.B16
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VEOR V16.B16, V12.B16, V12.B16
+	VEOR V17.B16, V13.B16, V13.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+
+mulNeon_10x7Xor_store:
+	// Store 7 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VST1 [V12.D2, V13.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x7Xor_loop
+
+mulNeon_10x7Xor_end:
+	RET
+
+// func mulNeon_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x8(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 173 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x8_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V16.B[0]
+	VDUP V16.B[0], V16.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x8_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 8 outputs
+	VLD1.P 32(R1), [V22.B16, V23.B16]
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V16.B16, V22.B16, V22.B16
+	VAND V16.B16, V23.B16, V23.B16
+	VAND V16.B16, V24.B16, V24.B16
+	VAND V16.B16, V25.B16, V25.B16
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V20.B16, V0.B16
+	VEOR V19.B16, V21.B16, V1.B16
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V20.B16, V2.B16
+	VEOR V19.B16, V21.B16, V3.B16
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V20.B16, V4.B16
+	VEOR V19.B16, V21.B16, V5.B16
+	VLD1.P 32(R2), [V18.B16, 
V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V6.B16 + VEOR V19.B16, V21.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V8.B16 + VEOR V19.B16, V21.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V10.B16 + VEOR V19.B16, V21.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V12.B16 + VEOR V19.B16, V21.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V20.B16, V14.B16 + VEOR V19.B16, V21.B16, V15.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 1 to 8 outputs + VLD1.P 32(R4), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, 
[V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 2 to 8 outputs + VLD1.P 32(R5), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, 
V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 3 to 8 outputs + VLD1.P 32(R8), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x8_store + + // Load and 
process 32 bytes from input 4 to 8 outputs + VLD1.P 32(R9), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 5 to 8 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + 
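+	// As a rough scalar sketch of what one (input, output) table pair
+	// computes per byte (illustrative Go only; mulAdd, lowTbl and highTbl
+	// are stand-in names for the two 16-byte halves of the matrix tables
+	// streamed in from R2, not identifiers from this package):
+	//
+	//	func mulAdd(dst, src []byte, lowTbl, highTbl *[16]byte) {
+	//		for k, b := range src {
+	//			// product of the matrix coefficient with each data byte
+	//			dst[k] ^= lowTbl[b&0x0f] ^ highTbl[b>>4]
+	//		}
+	//	}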
VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 6 to 8 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, 
V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 7 to 8 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], 
V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 8 to 8 outputs + VLD1.P 32(R13), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR 
V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x8_store + + // Load and process 32 bytes from input 9 to 8 outputs + VLD1.P 32(R3), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL 
V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V14.B16, V14.B16
+	VEOR V19.B16, V15.B16, V15.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+
+mulNeon_10x8_store:
+	// Store 8 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VST1 [V12.D2, V13.D2], (R6)
+	MOVD 168(R14), R6
+	ADD R15<<3, R6
+	VST1 [V14.D2, V15.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x8_loop
+
+mulNeon_10x8_end:
+	RET
+
+// func mulNeon_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x8Xor(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 173 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x8Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V16.B[0]
+	VDUP V16.B[0], V16.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x8Xor_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 8 outputs
+	VLD1.P 32(R1), [V22.B16, V23.B16]
+	VUSHR $4, V22.B16, V24.B16
+	VUSHR $4, V23.B16, V25.B16
+	VAND V16.B16, V22.B16, V22.B16
+	VAND V16.B16, V23.B16, V23.B16
+	VAND V16.B16, V24.B16, V24.B16
+	VAND V16.B16, V25.B16, V25.B16
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V0.B16, V1.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V0.B16, V0.B16
+	VEOR V19.B16, V1.B16, V1.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V2.B16, V3.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V2.B16, V2.B16
+	VEOR V19.B16, V3.B16, V3.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V4.B16, V5.B16]
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, 
[V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + MOVD 144(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V12.B16, V13.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + MOVD 168(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V14.B16, V15.B16] + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 1 to 8 outputs + VLD1.P 32(R4), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 
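+	// The Xor variant differs from mulNeon_10x8 only in its first pass:
+	// before accumulating input 0, each output pair (V0/V1 through
+	// V14/V15) is seeded from the destination through the out slice
+	// headers at (R14), 24(R14), ..., 168(R14) (24-byte stride, one
+	// []byte header per output shard), so the computed parity is XORed
+	// onto the bytes the output shards already hold.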
+ VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 2 to 8 outputs + VLD1.P 32(R5), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + 
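+	// Loop bookkeeping shared by all of these kernels: R0 counts 32-byte
+	// blocks (n>>5, stepped down by the SUBS/BNE pair), R15 carries the
+	// running offset divided by 8, so destination addresses are rebuilt
+	// with ADD R15<<3 and ADD $4, R15 advances the offset by 32 bytes,
+	// and R16 holds in_len so the CMP/BEQ after every input block can
+	// branch straight to the store section once all live input shards
+	// have been consumed.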
VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 3 to 8 outputs + VLD1.P 32(R8), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR 
V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 4 to 8 outputs + VLD1.P 32(R9), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL 
V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 5 to 8 outputs + VLD1.P 32(R10), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 6 to 8 outputs + VLD1.P 32(R11), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, 
V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 7 to 8 outputs + VLD1.P 32(R12), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, 
V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 8 to 8 outputs + VLD1.P 32(R13), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], 
V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V8.B16, V8.B16 + VEOR V19.B16, V9.B16, V9.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V10.B16, V10.B16 + VEOR V19.B16, V11.B16, V11.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V12.B16, V12.B16 + VEOR V19.B16, V13.B16, V13.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V14.B16, V14.B16 + VEOR V19.B16, V15.B16, V15.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x8Xor_store + + // Load and process 32 bytes from input 9 to 8 outputs + VLD1.P 32(R3), [V22.B16, V23.B16] + VUSHR $4, V22.B16, V24.B16 + VUSHR $4, V23.B16, V25.B16 + VAND V16.B16, V22.B16, V22.B16 + VAND V16.B16, V23.B16, V23.B16 + VAND V16.B16, V24.B16, V24.B16 + VAND V16.B16, V25.B16, V25.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V0.B16, V0.B16 + VEOR V19.B16, V1.B16, V1.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V2.B16, V2.B16 + VEOR V19.B16, V3.B16, V3.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V4.B16, V4.B16 + VEOR V19.B16, V5.B16, V5.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V18.B16, V19.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VTBL V22.B16, [V18.B16], V18.B16 + VTBL V23.B16, [V19.B16], V19.B16 + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VEOR V18.B16, V6.B16, V6.B16 + VEOR V19.B16, V7.B16, V7.B16 + VEOR 
V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V8.B16, V8.B16
+	VEOR V19.B16, V9.B16, V9.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V10.B16, V10.B16
+	VEOR V19.B16, V11.B16, V11.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V12.B16, V12.B16
+	VEOR V19.B16, V13.B16, V13.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V18.B16, V19.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VTBL V22.B16, [V18.B16], V18.B16
+	VTBL V23.B16, [V19.B16], V19.B16
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VEOR V18.B16, V14.B16, V14.B16
+	VEOR V19.B16, V15.B16, V15.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+
+mulNeon_10x8Xor_store:
+	// Store 8 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VST1 [V12.D2, V13.D2], (R6)
+	MOVD 168(R14), R6
+	ADD R15<<3, R6
+	VST1 [V14.D2, V15.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x8Xor_loop
+
+mulNeon_10x8Xor_end:
+	RET
+
+// func mulNeon_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x9(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 194 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x9_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V18.B[0]
+	VDUP V18.B[0], V18.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x9_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 9 outputs
+	VLD1.P 32(R1), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
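+	// Editor's note (illustrative sketch, not generated output): the kernel
+	// above is the NEON form of the GF(2^8) nibble-split table lookup. For
+	// each input byte b it computes tblLo[b&0x0f] ^ tblHi[b>>4], with tblLo
+	// and tblHi streamed in from the matrix area via VLD1.P 32(R2). A rough
+	// Go equivalent for one input/output pair (all names hypothetical):
+	//
+	//	func mulAdd(out, in, tblLo, tblHi []byte) {
+	//		for x, b := range in {
+	//			out[x] ^= tblLo[b&0x0f] ^ tblHi[b>>4] // VAND/VUSHR, VTBL, VEOR
+	//		}
+	//	}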
+	VEOR V20.B16, V22.B16, V0.B16
+	VEOR V21.B16, V23.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V22.B16, V2.B16
+	VEOR V21.B16, V23.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V22.B16, V4.B16
+	VEOR V21.B16, V23.B16, V5.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V22.B16, V6.B16
+	VEOR V21.B16, V23.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V22.B16, V8.B16
+	VEOR V21.B16, V23.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V22.B16, V10.B16
+	VEOR V21.B16, V23.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V22.B16, V12.B16
+	VEOR V21.B16, V23.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V22.B16, V14.B16
+	VEOR V21.B16, V23.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V22.B16, V16.B16
+	VEOR V21.B16, V23.B16, V17.B16
+	// Check for early termination
+	CMP $1, R16
+	BEQ mulNeon_10x9_store
+
+	// Load and process 32 bytes from input 1 to 9 outputs
+	VLD1.P 32(R4), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V2.B16, V2.B16
+	VEOR V21.B16, V3.B16, V3.B16
+	VEOR V22.B16, V2.B16, V2.B16
+	VEOR V23.B16, V3.B16, V3.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V4.B16, V4.B16
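+	// Editor's note: input 0 initializes the accumulators (VEOR of the two
+	// fresh lookups straight into V0..V17, i.e. out = lo ^ hi), whereas
+	// input 1 onwards, as here, VEORs each lookup pair into the existing
+	// accumulators, i.e. out ^= lo ^ hi.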
VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 2 to 9 outputs + VLD1.P 32(R5), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 
32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 3 to 9 outputs + VLD1.P 32(R8), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL 
V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 4 to 9 outputs + VLD1.P 32(R9), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, 
V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 5 to 9 outputs + VLD1.P 32(R10), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, 
[V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 6 to 9 outputs + VLD1.P 32(R11), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, 
V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 7 to 9 outputs + VLD1.P 32(R12), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, 
V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 8 to 9 outputs + VLD1.P 32(R13), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], 
V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x9_store + + // Load and process 32 bytes from input 9 to 9 outputs + VLD1.P 32(R3), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + 
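+	// Editor's note: pointer bookkeeping in these kernels -- the input
+	// pointers were fetched from the in [][]byte backing array at 24-byte
+	// steps (one arm64 slice header each), R15 was pre-shifted down by
+	// LSR $3, so the ADD R15<<3 in the store block below recovers the byte
+	// offset into each output, and ADD $4, R15 per pass advances it by
+	// 4*8 = 32 bytes.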
VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+
+mulNeon_10x9_store:
+	// Store 9 outputs
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+	MOVD 144(R14), R6
+	ADD R15<<3, R6
+	VST1 [V12.D2, V13.D2], (R6)
+	MOVD 168(R14), R6
+	ADD R15<<3, R6
+	VST1 [V14.D2, V15.D2], (R6)
+	MOVD 192(R14), R6
+	ADD R15<<3, R6
+	VST1 [V16.D2, V17.D2], (R6)
+
+	// Prepare for next loop
+	ADD $4, R15
+	SUBS $1, R0
+	BNE mulNeon_10x9_loop
+
+mulNeon_10x9_end:
+	RET
+
+// func mulNeon_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x9Xor(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 194 YMM used
+	MOVD n+80(FP), R0
+	LSR $5, R0
+	TST R0, R0
+	BEQ mulNeon_10x9Xor_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR $3, R15
+	MOVD $15, R6
+	VMOV R6, V18.B[0]
+	VDUP V18.B[0], V18.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
+
+mulNeon_10x9Xor_loop:
+	MOVD matrix_base+0(FP), R2
+	// Load and process 32 bytes from input 0 to 9 outputs
+	VLD1.P 32(R1), [V24.B16, V25.B16]
+	VUSHR $4, V24.B16, V26.B16
+	VUSHR $4, V25.B16, V27.B16
+	VAND V18.B16, V24.B16, V24.B16
+	VAND V18.B16, V25.B16, V25.B16
+	VAND V18.B16, V26.B16, V26.B16
+	VAND V18.B16, V27.B16, V27.B16
+	MOVD (R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V0.B16, V1.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V0.B16, V0.B16
+	VEOR V21.B16, V1.B16, V1.B16
+	VEOR V22.B16, V0.B16, V0.B16
+	VEOR V23.B16, V1.B16, V1.B16
+	MOVD 24(R14), R6
+	ADD R15<<3, R6
+	VLD1 (R6), [V2.B16, V3.B16]
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
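+	// Editor's note (sketch, not generated output): the Xor variant differs
+	// from mulNeon_10x9 only in that every output block is first loaded
+	// back from memory (the MOVD/ADD/VLD1 triplets above), so products are
+	// folded into existing data; roughly, for each output j:
+	//
+	//	acc := out[j][off : off+32] // VLD1 (R6)
+	//	mulAdd(acc, in[i][off:off+32], tblLo, tblHi) // accumulate in place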
VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + MOVD 144(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V12.B16, V13.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + MOVD 168(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V14.B16, V15.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + MOVD 192(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V16.B16, V17.B16] + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 1 to 9 outputs + VLD1.P 32(R4), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + 
VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 2 to 9 outputs + VLD1.P 32(R5), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 
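+	// Editor's note: R16 holds len(in) (loaded from in_len+32(FP)); each
+	// unrolled input block ends in CMP $k, R16 / BEQ ..._store, so shorter
+	// shard sets branch straight to the store. In Go terms (sketch, with a
+	// hypothetical processInput helper):
+	//
+	//	for i := range in {
+	//		processInput(i) // one 32-byte block of input i into all outputs
+	//		if i+1 == len(in) {
+	//			break // CMP $(i+1), R16 / BEQ mulNeon_10x9Xor_store
+	//		}
+	//	}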
+ VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 3 to 9 outputs + VLD1.P 32(R8), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 
32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 4 to 9 outputs + VLD1.P 32(R9), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + 
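+	// Editor's note: R2 is rewound to matrix_base at the top of every outer
+	// pass, and the post-incremented VLD1.P 32(R2) loads walk the flattened
+	// table area in order -- 64 bytes (one 32-byte low-nibble plus one
+	// 32-byte high-nibble table) per input/output pair, so 10*9*64 bytes in
+	// total for this 10x9 kernel.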
VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 5 to 9 outputs + VLD1.P 32(R10), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR 
V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 6 to 9 outputs + VLD1.P 32(R11), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL 
V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 7 to 9 outputs + VLD1.P 32(R12), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + 
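// XOR the low- and high-nibble lookup results into the output register pair: addition in GF(2^8) is XOR +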
VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 8 to 9 outputs + VLD1.P 32(R13), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 
32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V6.B16, V6.B16 + VEOR V21.B16, V7.B16, V7.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V8.B16, V8.B16 + VEOR V21.B16, V9.B16, V9.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V10.B16, V10.B16 + VEOR V21.B16, V11.B16, V11.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V12.B16, V12.B16 + VEOR V21.B16, V13.B16, V13.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V14.B16, V14.B16 + VEOR V21.B16, V15.B16, V15.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V16.B16, V16.B16 + VEOR V21.B16, V17.B16, V17.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x9Xor_store + + // Load and process 32 bytes from input 9 to 9 outputs + VLD1.P 32(R3), [V24.B16, V25.B16] + VUSHR $4, V24.B16, V26.B16 + VUSHR $4, V25.B16, V27.B16 + VAND V18.B16, V24.B16, V24.B16 + VAND V18.B16, V25.B16, V25.B16 + VAND V18.B16, V26.B16, V26.B16 + VAND V18.B16, V27.B16, V27.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V0.B16, V0.B16 + VEOR V21.B16, V1.B16, V1.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V2.B16, V2.B16 + VEOR V21.B16, V3.B16, V3.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V20.B16, V21.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VTBL V24.B16, [V20.B16], V20.B16 + VTBL V25.B16, [V21.B16], V21.B16 + 
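// (Xor variant: the outputs were pre-loaded from memory at input 0, so new products accumulate into existing data) +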
VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V6.B16, V6.B16
+	VEOR V21.B16, V7.B16, V7.B16
+	VEOR V22.B16, V6.B16, V6.B16
+	VEOR V23.B16, V7.B16, V7.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V8.B16, V8.B16
+	VEOR V21.B16, V9.B16, V9.B16
+	VEOR V22.B16, V8.B16, V8.B16
+	VEOR V23.B16, V9.B16, V9.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V10.B16, V10.B16
+	VEOR V21.B16, V11.B16, V11.B16
+	VEOR V22.B16, V10.B16, V10.B16
+	VEOR V23.B16, V11.B16, V11.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V12.B16, V12.B16
+	VEOR V21.B16, V13.B16, V13.B16
+	VEOR V22.B16, V12.B16, V12.B16
+	VEOR V23.B16, V13.B16, V13.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V14.B16, V14.B16
+	VEOR V21.B16, V15.B16, V15.B16
+	VEOR V22.B16, V14.B16, V14.B16
+	VEOR V23.B16, V15.B16, V15.B16
+	VLD1.P 32(R2), [V20.B16, V21.B16]
+	VLD1.P 32(R2), [V22.B16, V23.B16]
+	VTBL V24.B16, [V20.B16], V20.B16
+	VTBL V25.B16, [V21.B16], V21.B16
+	VTBL V26.B16, [V22.B16], V22.B16
+	VTBL V27.B16, [V23.B16], V23.B16
+	VEOR V20.B16, V16.B16, V16.B16
+	VEOR V21.B16, V17.B16, V17.B16
+	VEOR V22.B16, V16.B16, V16.B16
+	VEOR V23.B16, V17.B16, V17.B16
+
+mulNeon_10x9Xor_store:
+	// Store 9 outputs
+	MOVD (R14), R6
+	ADD  R15<<3, R6
+	VST1 [V0.D2, V1.D2], (R6)
+	MOVD 24(R14), R6
+	ADD  R15<<3, R6
+	VST1 [V2.D2, V3.D2], (R6)
+	MOVD 48(R14), R6
+	ADD  R15<<3, R6
+	VST1 [V4.D2, V5.D2], (R6)
+	MOVD 72(R14), R6
+	ADD  R15<<3, R6
+	VST1 [V6.D2, V7.D2], (R6)
+	MOVD 96(R14), R6
+	ADD  R15<<3, R6
+	VST1 [V8.D2, V9.D2], (R6)
+	MOVD 120(R14), R6
+	ADD  R15<<3, R6
+	VST1 [V10.D2, V11.D2], (R6)
+	MOVD 144(R14), R6
+	ADD  R15<<3, R6
+	VST1 [V12.D2, V13.D2], (R6)
+	MOVD 168(R14), R6
+	ADD  R15<<3, R6
+	VST1 [V14.D2, V15.D2], (R6)
+	MOVD 192(R14), R6
+	ADD  R15<<3, R6
+	VST1 [V16.D2, V17.D2], (R6)
+
+	// Prepare for next loop
+	ADD  $4, R15
+	SUBS $1, R0
+	BNE  mulNeon_10x9Xor_loop
+
+mulNeon_10x9Xor_end:
+	RET
+
+// func mulNeon_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
+// Requires: NEON
+TEXT ·mulNeon_10x10(SB), NOSPLIT, $8-88
+	// Loading no tables to registers
+	// Destination kept on stack
+	// Full registers estimated 215 YMM used
+	MOVD n+80(FP), R0
+	LSR  $5, R0
+	TST  R0, R0
+	BEQ  mulNeon_10x10_end
+	MOVD in_base+24(FP), R3
+	MOVD (R3), R1
+	MOVD 24(R3), R4
+	MOVD 48(R3), R5
+	MOVD 72(R3), R8
+	MOVD 96(R3), R9
+	MOVD 120(R3), R10
+	MOVD 144(R3), R11
+	MOVD 168(R3), R12
+	MOVD 192(R3), R13
+	MOVD 216(R3), R3
+	MOVD out_base+48(FP), R14
+	MOVD start+72(FP), R15
+
+	// Add start offset to input
+	ADD R15, R1
+	ADD R15, R4
+	ADD R15, R5
+	ADD R15, R8
+	ADD R15, R9
+	ADD R15, R10
+	ADD R15, R11
+	ADD R15, R12
+	ADD R15, R13
+	ADD R15, R3
+	LSR  $3, R15
+	MOVD $15, R6
+	VMOV R6, V20.B[0]
+	VDUP V20.B[0], V20.B16
+
+	// Load number of input shards
+	MOVD in_len+32(FP), R16
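+	// Main loop: each iteration consumes 32 bytes from every input shard
+	// and produces 32 bytes for each of the 10 outputs (pairs V0-V19).
+	// Every GF(2^8) multiply uses the nibble-table method carried over
+	// from the AVX2 code: VAND keeps the low 4 bits of each byte, VUSHR
+	// $4 extracts the high 4 bits, each nibble indexes a 16-byte table
+	// via VTBL (the NEON analogue of VPSHUFB), and VEOR folds the two
+	// results together, since addition in GF(2^8) is XOR. In scalar
+	// terms this is roughly, per output o and input i:
+	//   out[o][x] ^= lowTbl[i][o][in[i][x]&0xf] ^ highTbl[i][o][in[i][x]>>4]
+	// The CMP/BEQ pairs branch to the store once the last of the in_len
+	// input shards has been folded in. R15 holds the byte offset >> 3:
+	// the store rebuilds it with R15<<3, and ADD $4 advances it by 32
+	// bytes per iteration.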
+ +mulNeon_10x10_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 10 outputs + VLD1.P 32(R1), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V0.B16 + VEOR V23.B16, V25.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V2.B16 + VEOR V23.B16, V25.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V4.B16 + VEOR V23.B16, V25.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V6.B16 + VEOR V23.B16, V25.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V8.B16 + VEOR V23.B16, V25.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V10.B16 + VEOR V23.B16, V25.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V12.B16 + VEOR V23.B16, V25.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V14.B16 + VEOR V23.B16, V25.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V16.B16 + VEOR V23.B16, V25.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V24.B16, V18.B16 + VEOR V23.B16, V25.B16, V19.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 1 to 10 outputs + VLD1.P 32(R4), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, 
[V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 2 to 10 outputs + VLD1.P 32(R5), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, 
V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP 
$3, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 3 to 10 outputs + VLD1.P 32(R8), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + 
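// VTBL performs 16 parallel byte-table lookups, the NEON counterpart of the VPSHUFB used by the AVX2 original +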
VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 4 to 10 outputs + VLD1.P 32(R9), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR 
V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 5 to 10 outputs + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + 
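// each VLD1.P table load post-increments R2 by 32 bytes, streaming this input/output pair's nibble tables from the matrix buffer +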
VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 6 to 10 outputs + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + 
VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 7 to 10 outputs + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + 
VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 8 to 10 outputs + VLD1.P 32(R13), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 
+ VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x10_store + + // Load and process 32 bytes from input 9 to 10 outputs + VLD1.P 32(R3), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + 
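// four VEORs per output: both 16-byte halves receive the low- and the high-nibble contribution +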
VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + +mulNeon_10x10_store: + // Store 10 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + MOVD 192(R14), R6 + ADD R15<<3, R6 + VST1 [V16.D2, V17.D2], (R6) + MOVD 216(R14), R6 + ADD R15<<3, R6 + VST1 [V18.D2, V19.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x10_loop + +mulNeon_10x10_end: + RET + +// func mulNeon_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: NEON +TEXT ·mulNeon_10x10Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVD n+80(FP), R0 + LSR $5, R0 + TST R0, R0 + BEQ mulNeon_10x10Xor_end + MOVD in_base+24(FP), R3 + MOVD (R3), R1 + MOVD 24(R3), R4 + MOVD 48(R3), R5 + MOVD 72(R3), R8 + MOVD 96(R3), R9 + MOVD 120(R3), R10 + MOVD 144(R3), R11 + MOVD 168(R3), R12 + MOVD 192(R3), R13 + MOVD 216(R3), R3 + MOVD out_base+48(FP), R14 + MOVD start+72(FP), R15 + + // Add start offset to input + ADD R15, R1 + ADD R15, R4 + ADD R15, R5 + ADD R15, R8 + ADD R15, R9 + ADD R15, R10 + ADD R15, R11 + ADD R15, R12 + ADD R15, R13 + ADD R15, R3 + LSR $3, R15 + MOVD $15, R6 + VMOV R6, V20.B[0] + VDUP V20.B[0], V20.B16 + + // Load number of input shards + MOVD in_len+32(FP), R16 + +mulNeon_10x10Xor_loop: + MOVD matrix_base+0(FP), R2 + // Load and process 32 bytes from input 0 to 10 outputs + VLD1.P 32(R1), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + MOVD (R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V0.B16, V1.B16] + VLD1.P 32(R2), 
[V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + MOVD 24(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V2.B16, V3.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + MOVD 48(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V4.B16, V5.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + MOVD 72(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V6.B16, V7.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + MOVD 96(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V8.B16, V9.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + MOVD 120(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V10.B16, V11.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + MOVD 144(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V12.B16, V13.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + MOVD 168(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V14.B16, V15.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + MOVD 192(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V16.B16, V17.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR 
V25.B16, V17.B16, V17.B16 + MOVD 216(R14), R6 + ADD R15<<3, R6 + VLD1 (R6), [V18.B16, V19.B16] + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $1, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 1 to 10 outputs + VLD1.P 32(R4), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + 
VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $2, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 2 to 10 outputs + VLD1.P 32(R5), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], 
V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $3, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 3 to 10 outputs + VLD1.P 32(R8), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, 
V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $4, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 4 to 10 outputs + VLD1.P 32(R9), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, 
[V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $5, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 5 to 10 outputs + VLD1.P 32(R10), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR 
V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $6, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 6 to 10 outputs + VLD1.P 32(R11), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 
32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $7, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 7 to 10 outputs + VLD1.P 32(R12), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], 
V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $8, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 8 to 10 outputs + VLD1.P 32(R13), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, 
V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + // Check for early termination + CMP $9, R16 + BEQ mulNeon_10x10Xor_store + + // Load and process 32 bytes from input 9 to 10 outputs + VLD1.P 32(R3), [V26.B16, V27.B16] + VUSHR $4, V26.B16, V28.B16 + VUSHR $4, V27.B16, V29.B16 + VAND V20.B16, V26.B16, V26.B16 + VAND V20.B16, V27.B16, V27.B16 + VAND V20.B16, V28.B16, V28.B16 + VAND V20.B16, V29.B16, V29.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V0.B16, V0.B16 + VEOR V23.B16, V1.B16, V1.B16 + VEOR V24.B16, V0.B16, V0.B16 + VEOR V25.B16, V1.B16, V1.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V2.B16, V2.B16 + VEOR V23.B16, V3.B16, V3.B16 + VEOR V24.B16, V2.B16, V2.B16 + VEOR V25.B16, V3.B16, V3.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V4.B16, V4.B16 + VEOR V23.B16, V5.B16, V5.B16 + VEOR V24.B16, V4.B16, V4.B16 + VEOR V25.B16, V5.B16, V5.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 
32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V6.B16, V6.B16 + VEOR V23.B16, V7.B16, V7.B16 + VEOR V24.B16, V6.B16, V6.B16 + VEOR V25.B16, V7.B16, V7.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V8.B16, V8.B16 + VEOR V23.B16, V9.B16, V9.B16 + VEOR V24.B16, V8.B16, V8.B16 + VEOR V25.B16, V9.B16, V9.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V10.B16, V10.B16 + VEOR V23.B16, V11.B16, V11.B16 + VEOR V24.B16, V10.B16, V10.B16 + VEOR V25.B16, V11.B16, V11.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V12.B16, V12.B16 + VEOR V23.B16, V13.B16, V13.B16 + VEOR V24.B16, V12.B16, V12.B16 + VEOR V25.B16, V13.B16, V13.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V14.B16, V14.B16 + VEOR V23.B16, V15.B16, V15.B16 + VEOR V24.B16, V14.B16, V14.B16 + VEOR V25.B16, V15.B16, V15.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V16.B16, V16.B16 + VEOR V23.B16, V17.B16, V17.B16 + VEOR V24.B16, V16.B16, V16.B16 + VEOR V25.B16, V17.B16, V17.B16 + VLD1.P 32(R2), [V22.B16, V23.B16] + VLD1.P 32(R2), [V24.B16, V25.B16] + VTBL V26.B16, [V22.B16], V22.B16 + VTBL V27.B16, [V23.B16], V23.B16 + VTBL V28.B16, [V24.B16], V24.B16 + VTBL V29.B16, [V25.B16], V25.B16 + VEOR V22.B16, V18.B16, V18.B16 + VEOR V23.B16, V19.B16, V19.B16 + VEOR V24.B16, V18.B16, V18.B16 + VEOR V25.B16, V19.B16, V19.B16 + +mulNeon_10x10Xor_store: + // Store 10 outputs + MOVD (R14), R6 + ADD R15<<3, R6 + VST1 [V0.D2, V1.D2], (R6) + MOVD 24(R14), R6 + ADD R15<<3, R6 + VST1 [V2.D2, V3.D2], (R6) + MOVD 48(R14), R6 + ADD R15<<3, R6 + VST1 [V4.D2, V5.D2], (R6) + MOVD 72(R14), R6 + ADD R15<<3, R6 + VST1 [V6.D2, V7.D2], (R6) + MOVD 96(R14), R6 + ADD R15<<3, R6 + VST1 [V8.D2, V9.D2], (R6) + MOVD 120(R14), R6 + ADD R15<<3, R6 + VST1 [V10.D2, V11.D2], (R6) + MOVD 144(R14), R6 + ADD R15<<3, R6 + VST1 [V12.D2, V13.D2], (R6) + MOVD 168(R14), R6 + ADD R15<<3, R6 + VST1 [V14.D2, V15.D2], (R6) + MOVD 192(R14), R6 + ADD R15<<3, R6 + VST1 [V16.D2, V17.D2], (R6) + MOVD 216(R14), R6 + ADD R15<<3, R6 + VST1 [V18.D2, V19.D2], (R6) + + // Prepare for next loop + ADD $4, R15 + SUBS $1, R0 + BNE mulNeon_10x10Xor_loop + +mulNeon_10x10Xor_end: + RET + diff --git a/galois_gen_none.go b/galois_gen_none.go index 1bb268a3..3e258986 100644 --- a/galois_gen_none.go +++ b/galois_gen_none.go @@ -1,33 +1,19 @@ -//go:build !amd64 || noasm || appengine || gccgo || nogen +//go:build !(amd64 || arm64) || noasm || appengine || gccgo || nogen package reedsolomon -const maxAvx2Inputs = 1 -const maxAvx2Outputs = 1 -const minAvx2Size = 1 -const 
avxSizeMask = 0 -const avx2CodeGen = false +const ( + codeGen = false + codeGenMaxGoroutines = 8 + codeGenMaxInputs = 1 + codeGenMaxOutputs = 1 + minCodeGenSize = 1 +) -func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - panic("codegen not available") +func (r *reedSolomon) hasCodeGen(int, int, int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false } -func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") -} - -func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - panic("codegen not available") +func (r *reedSolomon) canGFNI(int, int, int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false } diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go index 429e2c20..d4f46ea2 100644 --- a/galois_gen_switch_amd64.go +++ b/galois_gen_switch_amd64.go @@ -10,12 +10,39 @@ import ( ) const ( - avx2CodeGen = true - maxAvx2Inputs = 10 - maxAvx2Outputs = 10 - minAvx2Size = 64 + codeGen = true + codeGenMaxGoroutines = 8 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 ) +var ( + fAvx2 = galMulSlicesAvx2 + fAvx2Xor = galMulSlicesAvx2Xor + fGFNI = galMulSlicesGFNI + fGFNIXor = galMulSlicesGFNIXor + fAvxGFNI = galMulSlicesAvxGFNI + fAvxGFNIXor = galMulSlicesAvxGFNIXor +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return &fAvx2, &fAvx2Xor, codeGen && pshufb && r.o.useAVX2 && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useAvx512GFNI { + return &fGFNI, &fGFNIXor, codeGen && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fAvxGFNI, &fAvxGFNIXor, codeGen && r.o.useAvxGNFI && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { n := stop - start diff --git a/galois_gen_switch_arm64.go b/galois_gen_switch_arm64.go new file mode 100644 index 00000000..ff2541b8 --- /dev/null +++ b/galois_gen_switch_arm64.go @@ -0,0 +1,195 @@ +//go:build !appengine && !noasm && gc && !nogen && !nopshufb +// +build !appengine,!noasm,gc,!nogen,!nopshufb + +package reedsolomon + +import ( + "fmt" +) + +const ( + codeGen = true + codeGenMaxGoroutines = 16 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 +) + +var ( + fSve = galMulSlicesSve + fSveXor = galMulSlicesSveXor + fNeon = galMulSlicesNeon + fNeonXor = galMulSlicesNeonXor +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useSVE { 
+ return &fSve, &fSveXor, codeGen && pshufb && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fNeon, &fNeonXor, codeGen && pshufb && r.o.useNEON && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} + +// galMulSlicesSve +func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int { + n := stop - start + + // fmt.Println(len(in), len(out)) + switch len(out) { + case 1: + mulSve_10x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulSve_10x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulSve_10x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulSve_10x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulSve_10x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulSve_10x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulSve_10x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulSve_10x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulSve_10x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulSve_10x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out))) +} + +// galMulSlicesSveXor +func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int { + n := (stop - start) + + switch len(out) { + case 1: + mulSve_10x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulSve_10x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulSve_10x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulSve_10x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulSve_10x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulSve_10x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulSve_10x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulSve_10x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulSve_10x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulSve_10x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM SVE: unhandled size: %dx%d", len(in), len(out))) +} + +// galMulSlicesNeon +func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int { + n := stop - start + + switch len(out) { + case 1: + mulNeon_10x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulNeon_10x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulNeon_10x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulNeon_10x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulNeon_10x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulNeon_10x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulNeon_10x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulNeon_10x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulNeon_10x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + 
mulNeon_10x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out))) +} + +// galMulSlicesNeonXor +func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int { + n := (stop - start) + + switch len(out) { + case 1: + mulNeon_10x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulNeon_10x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulNeon_10x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulNeon_10x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulNeon_10x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulNeon_10x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulNeon_10x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulNeon_10x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulNeon_10x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulNeon_10x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + panic(fmt.Sprintf("ARM NEON: unhandled size: %dx%d", len(in), len(out))) +} diff --git a/galois_gen_switch_nopshufb_amd64.go b/galois_gen_switch_nopshufb_amd64.go index 1ba08b5e..66bab8a0 100644 --- a/galois_gen_switch_nopshufb_amd64.go +++ b/galois_gen_switch_nopshufb_amd64.go @@ -10,12 +10,35 @@ import ( ) const ( - avx2CodeGen = true - maxAvx2Inputs = 10 - maxAvx2Outputs = 10 - minAvx2Size = 64 + codeGen = true + codeGenMaxGoroutines = 8 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 ) +var ( + fGFNI = galMulSlicesGFNI + fGFNIXor = galMulSlicesGFNIXor + fAvxGFNI = galMulSlicesAvxGFNI + fAvxGFNIXor = galMulSlicesAvxGFNIXor +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false // no code generation for generic case (only GFNI cases) +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + if r.o.useAvx512GFNI { + return &fGFNI, &fGFNIXor, codeGen && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs + } + return &fAvxGFNI, &fAvxGFNIXor, codeGen && r.o.useAvxGNFI && + byteCount >= codeGenMinSize && inputs+outputs >= codeGenMinShards && + inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs +} + func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } diff --git a/galois_gen_switch_nopshufb_arm64.go b/galois_gen_switch_nopshufb_arm64.go new file mode 100644 index 00000000..db2aaa61 --- /dev/null +++ b/galois_gen_switch_nopshufb_arm64.go @@ -0,0 +1,22 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. 
+ +//go:build !appengine && !noasm && gc && !nogen && nopshufb +// +build !appengine,!noasm,gc,!nogen,nopshufb + +package reedsolomon + +const ( + codeGen = false + codeGenMaxGoroutines = 16 + codeGenMaxInputs = 10 + codeGenMaxOutputs = 10 + minCodeGenSize = 64 +) + +func (r *reedSolomon) hasCodeGen(byteCount int, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(matrix []uint64, in, out [][]byte, start, stop int) int, ok bool) { + return nil, nil, false +} diff --git a/galois_notamd64.go b/galois_notamd64.go deleted file mode 100644 index f98bfed1..00000000 --- a/galois_notamd64.go +++ /dev/null @@ -1,13 +0,0 @@ -//go:build !amd64 || noasm || appengine || gccgo || pshufb - -// Copyright 2020, Klaus Post, see LICENSE for details. - -package reedsolomon - -func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) { - panic("codeSomeShardsAvx512 should not be called if built without asm") -} - -func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) { - panic("codeSomeShardsAvx512P should not be called if built without asm") -} diff --git a/galois_test.go b/galois_test.go index 9ea5ff5b..580b216c 100644 --- a/galois_test.go +++ b/galois_test.go @@ -9,6 +9,10 @@ package reedsolomon import ( "bytes" + "crypto/rand" + "encoding/hex" + "fmt" + mathrand "math/rand" "testing" ) @@ -231,6 +235,172 @@ func TestSliceGalAdd(t *testing.T) { } } +func testGenGalois(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int) { + + // reference versions + galMulSliceRef := func(c byte, in, out []byte) { + out = out[:len(in)] + mt := mulTable[c][:256] + for n, input := range in { + out[n] = mt[input] + } + } + galMulSliceXorRef := func(c byte, in, out []byte) { + out = out[:len(in)] + mt := mulTable[c][:256] + for n, input := range in { + out[n] ^= mt[input] + } + } + + outputs := make([][]byte, len(matrixRows)) + for i := range outputs { + outputs[i] = make([]byte, size) + if _, err := rand.Read(outputs[i]); err != nil { + t.Fatalf("error: %v", err) + return + } + } + inputs := make([][]byte, len(matrixRows[0])) + for i := range inputs { + inputs[i] = make([]byte, size) + if _, err := rand.Read(inputs[i]); err != nil { + t.Fatalf("error: %v", err) + return + } + } + + m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), nil) + + end := start + f(m, inputs, outputs, start, stop) + if end != stop { + t.Errorf("got %#v, expected %#v", end, stop) + } + + wanteds := make([][]byte, len(outputs)) + for i := range wanteds { + wanteds[i] = make([]byte, size) + galMulSliceRef(matrixRows[i][0], inputs[0], wanteds[i]) + for j := 1; j < len(matrixRows[i]); j++ { + galMulSliceXorRef(matrixRows[i][j], inputs[j], wanteds[i]) + } + } + + for i := range outputs { + if !bytes.Equal(outputs[i][start:stop], wanteds[i][start:stop]) { + t.Errorf("testGenGalois(%dx%d): got %#v, expected %#v", len(inputs), len(outputs), outputs[i][start:stop], wanteds[i][start:stop]) + fmt.Printf("output[%d]\n", i) + fmt.Print(hex.Dump(outputs[i][start:stop])) + fmt.Printf("wanted[%d]\n", i) + fmt.Print(hex.Dump(wanteds[i][start:stop])) + } + } +} + +func testGenGaloisXor(t *testing.T, matrixRows [][]byte, size, start, stop int, f func(matrix []byte, in, out [][]byte, start, stop int) int) { + + // reference version + galMulSliceXorRef := 
func(c byte, in, out []byte) { + out = out[:len(in)] + mt := mulTable[c][:256] + for n, input := range in { + out[n] ^= mt[input] + } + } + + outputs := make([][]byte, len(matrixRows)) + wanteds := make([][]byte, len(outputs)) + for i := range outputs { + outputs[i] = make([]byte, size) + wanteds[i] = make([]byte, size) + + // For Xor tests, prefill both outputs and wanted with identical values + copy(outputs[i], bytes.Repeat([]byte{byte(i)}, size)) + copy(wanteds[i], outputs[i]) + } + inputs := make([][]byte, len(matrixRows[0])) + for i := range inputs { + inputs[i] = make([]byte, size) + if _, err := rand.Read(inputs[i]); err != nil { + t.Fatalf("error: %v", err) + return + } + } + + m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), nil) + + end := start + f(m, inputs, outputs, start, stop) + if end != stop { + t.Errorf("got %#v, expected %#v", end, stop) + } + + for i := range wanteds { + for j := 0; j < len(matrixRows[i]); j++ { + galMulSliceXorRef(matrixRows[i][j], inputs[j], wanteds[i]) + } + } + + for i := range outputs { + if !bytes.Equal(outputs[i][start:stop], wanteds[i][start:stop]) { + t.Errorf("testGenGaloisXor(%dx%d): got %#v, expected %#v", len(inputs), len(outputs), outputs[i][start:stop], wanteds[i][start:stop]) + fmt.Printf("output[%d]\n", i) + fmt.Print(hex.Dump(outputs[i][start:stop])) + fmt.Printf("wanted[%d]\n", i) + fmt.Print(hex.Dump(wanteds[i][start:stop])) + } + } +} + +// Test early abort for galMulARCH_?x?_* routines +func testGenGaloisEarlyAbort(t *testing.T, matrixRows [][]byte, size int, f func(matrix []byte, in, out [][]byte, start, stop int) int) { + outputs := make([][]byte, len(matrixRows)) + inputs := make([][]byte, len(matrixRows[0])) + + start := 0 + start += f(nil, inputs, outputs, 0, size) + if start != 0 { + t.Errorf("got %#v, expected %#v", start, 0) + } +} + +func testGenGaloisUpto10x10(t *testing.T, f, fXor func(matrix []byte, in, out [][]byte, start, stop int) int) { + + for output := 1; output <= codeGenMaxOutputs; output++ { + for input := 1; input <= codeGenMaxInputs; input++ { + matrixRows := make([][]byte, input) + for i := range matrixRows { + matrixRows[i] = make([]byte, output) + for j := range matrixRows[i] { + matrixRows[i][j] = byte(mathrand.Intn(16)) + } + } + + size, stepsize := 32, 32 + if input <= 3 { + size, stepsize = 64, 64 // 3x? 
are all _64 versions
+			}
+
+			// test early abort
+			testGenGaloisEarlyAbort(t, matrixRows, size-1, f)
+			testGenGaloisEarlyAbort(t, matrixRows, size-1, fXor)
+			const limit = 1024
+			for ; size < limit; size += stepsize {
+				// test full range
+				testGenGalois(t, matrixRows, size, 0, size, f)
+				testGenGaloisXor(t, matrixRows, size, 0, size, fXor)
+
+				if size >= stepsize*2 && size < limit-stepsize*2 {
+					start := stepsize
+					stop := size - start
+					// test partial range
+					testGenGalois(t, matrixRows, size, start, stop, f)
+					testGenGaloisXor(t, matrixRows, size, start, stop, fXor)
+				}
+			}
+		}
+	}
+}
+
 func benchmarkGalois(b *testing.B, size int) {
 	in := make([]byte, size)
 	out := make([]byte, size)
diff --git a/options.go b/options.go
index 73cc7d6d..377137ef 100644
--- a/options.go
+++ b/options.go
@@ -21,7 +21,9 @@ type options struct {
 	useAVX512,
 	useAVX2,
 	useSSSE3,
-	useSSE2 bool
+	useSSE2,
+	useNEON,
+	useSVE bool
 
 	useJerasureMatrix bool
 	usePAR1Matrix bool
@@ -51,6 +53,8 @@ var defaultOptions = options{
 	useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL),
 	useAvx512GFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ),
 	useAvxGNFI: cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI),
+	useNEON: cpuid.CPU.Supports(cpuid.ASIMD),
+	useSVE: cpuid.CPU.Supports(cpuid.SVE),
 }
 
 // leopardMode controls the use of leopard GF in encoding and decoding.
@@ -316,6 +320,11 @@ func (o *options) cpuOptions() string {
 	if o.useAvxGNFI {
 		res = append(res, "AVX+GFNI")
 	}
+	if o.useSVE {
+		res = append(res, "ARM+SVE")
+	} else if o.useNEON {
+		res = append(res, "ARM+NEON")
+	}
 	if len(res) == 0 {
 		return "pure Go"
 	}
diff --git a/reedsolomon.go b/reedsolomon.go
index bebba044..3b6f5b78 100644
--- a/reedsolomon.go
+++ b/reedsolomon.go
@@ -153,9 +153,8 @@ type Extensions interface {
 }
 
 const (
-	avx2CodeGenMinSize = 64
-	avx2CodeGenMinShards = 3
-	avx2CodeGenMaxGoroutines = 8
+	codeGenMinSize = 64
+	codeGenMinShards = 3
 	gfniCodeGenMaxGoroutines = 4
 
 	intSize = 32 << (^uint(0) >> 63) // 32 or 64
@@ -482,21 +481,23 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 		r.o.perRound = 128 << 10
 	}
 
+	_, _, useCodeGen := r.hasCodeGen(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs)
+
 	divide := parityShards + 1
-	if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) {
+	if codeGen && useCodeGen && (dataShards > codeGenMaxInputs || parityShards > codeGenMaxOutputs) {
 		// Base on L1 cache if we have many inputs.
 		r.o.perRound = cpuid.CPU.Cache.L1D
 		if r.o.perRound < 32<<10 {
 			r.o.perRound = 32 << 10
 		}
 		divide = 0
-		if dataShards > maxAvx2Inputs {
-			divide += maxAvx2Inputs
+		if dataShards > codeGenMaxInputs {
+			divide += codeGenMaxInputs
 		} else {
 			divide += dataShards
 		}
-		if parityShards > maxAvx2Inputs {
-			divide += maxAvx2Outputs
+		if parityShards > codeGenMaxInputs {
+			divide += codeGenMaxOutputs
 		} else {
 			divide += parityShards
 		}
@@ -555,11 +556,11 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 
 	// Generated AVX2 does not need data to stay in L1 cache between runs.
 	// We will be purely limited by RAM speed.
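Note (illustrative, not part of the patch): the options.go hunk earlier gates the new ARM paths on two cpuid probes. A minimal standalone sketch using the same github.com/klauspost/cpuid/v2 calls the diff relies on; the program itself is an assumption for demonstration only:

package main

import (
	"fmt"

	"github.com/klauspost/cpuid/v2"
)

func main() {
	// The same probes as the defaultOptions entries added above: on arm64,
	// NEON is reported as ASIMD (Advanced SIMD); SVE is a separate flag.
	fmt.Println("NEON (ASIMD):", cpuid.CPU.Supports(cpuid.ASIMD))
	fmt.Println("SVE:         ", cpuid.CPU.Supports(cpuid.SVE))
}

When both report true, cpuOptions() above lists only "ARM+SVE", matching the SVE-first dispatch in the arm64 switch file earlier in this patch.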
-	if r.canAVX2C(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines {
-		r.o.maxGoroutines = avx2CodeGenMaxGoroutines
+	if useCodeGen && r.o.maxGoroutines > codeGenMaxGoroutines {
+		r.o.maxGoroutines = codeGenMaxGoroutines
 	}
 
-	if r.canGFNI(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > gfniCodeGenMaxGoroutines {
+	if _, _, useGFNI := r.canGFNI(codeGenMinSize, codeGenMaxInputs, codeGenMaxOutputs); useGFNI && r.o.maxGoroutines > gfniCodeGenMaxGoroutines {
 		r.o.maxGoroutines = gfniCodeGenMaxGoroutines
 	}
 
@@ -577,7 +578,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 		r.parity[i] = r.m[dataShards+i]
 	}
 
-	if avx2CodeGen && r.o.useAVX2 {
+	if codeGen /* && r.o.useAVX2 */ {
 		sz := r.dataShards * r.parityShards * 2 * 32
 		r.mPool.New = func() interface{} {
 			return AllocAligned(1, sz)[0]
@@ -653,15 +654,15 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro
 		return ErrShardSize
 	}
 
-	if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useAvx512GFNI || r.o.useAvxGNFI) {
+	if codeGen && len(dataShard) >= r.o.perRound && len(parity) >= codeGenMinShards && (pshufb || r.o.useAvx512GFNI || r.o.useAvxGNFI) {
 		m := make([][]byte, r.parityShards)
 		for iRow := range m {
 			m[iRow] = r.parity[iRow][idx : idx+1]
 		}
 		if r.o.useAvx512GFNI || r.o.useAvxGNFI {
-			r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false)
+			r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil)
 		} else {
-			r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false)
+			r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false, nil, nil)
 		}
 		return nil
 	}
@@ -803,18 +804,6 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) {
 	return r.checkSomeShards(r.parity, shards[:r.dataShards], toCheck[:r.parityShards], len(shards[0])), nil
 }
 
-func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
-	return avx2CodeGen && pshufb && r.o.useAVX2 &&
-		byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards &&
-		inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
-}
-
-func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool {
-	return avx2CodeGen && (r.o.useAvx512GFNI || r.o.useAvxGNFI) &&
-		byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards &&
-		inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs
-}
-
 // Multiplies a subset of rows from a coding matrix by a full set of
 // input totalShards to produce some output totalShards.
 // 'matrixRows' is The rows from the matrix to use.
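Note (illustrative, not part of the patch): the deleted canAVX2C/canGFNI helpers above returned only a bool, leaving each call site to pick the concrete galMulSlices* kernel itself. Their replacements return pointers to package-level function variables together with the eligibility result. A self-contained sketch of that pattern under assumed names; the thresholds mirror codeGenMinSize, codeGenMinShards and codeGenMaxInputs/Outputs from the diff:

package main

import "fmt"

// Two kernel variants sharing the signature used throughout the patch:
// one overwrites the outputs, the other XORs into them.
func kernel(matrix []byte, in, out [][]byte, start, stop int) int    { return stop - start }
func kernelXor(matrix []byte, in, out [][]byte, start, stop int) int { return stop - start }

// Package-level variables, so &fKernel is a stable pointer callers can hold.
var (
	fKernel    = kernel
	fKernelXor = kernelXor
)

// selectKernels mirrors the shape of hasCodeGen/canGFNI: both variants are
// always returned; ok reports whether they may be used for this workload.
func selectKernels(byteCount, inputs, outputs int) (_, _ *func(matrix []byte, in, out [][]byte, start, stop int) int, ok bool) {
	return &fKernel, &fKernelXor,
		byteCount >= 64 && inputs+outputs >= 3 && inputs <= 10 && outputs <= 10
}

func main() {
	if f, fXor, ok := selectKernels(128, 10, 4); ok {
		fmt.Println((*f)(nil, nil, nil, 0, 128))    // overwrite variant
		fmt.Println((*fXor)(nil, nil, nil, 0, 128)) // accumulate variant
	}
}

One selector shape now serves AVX2, GFNI, SVE and NEON alike, which is what lets codeSomeShards below call (*galMulGFNI)(...) without knowing which architecture supplied the kernel.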
@@ -838,22 +827,18 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 	if end > len(inputs[0]) {
 		end = len(inputs[0])
 	}
-	if r.canGFNI(byteCount, len(inputs), len(outputs)) {
-		var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64
+	if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)); useGFNI {
+		var gfni [codeGenMaxInputs * codeGenMaxOutputs]uint64
 		m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:])
-		if r.o.useAvx512GFNI {
-			start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount)
-		} else {
-			start += galMulSlicesAvxGFNI(m, inputs, outputs, 0, byteCount)
-		}
+		start += (*galMulGFNI)(m, inputs, outputs, 0, byteCount)
 		end = len(inputs[0])
-	} else if r.canAVX2C(byteCount, len(inputs), len(outputs)) {
-		m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
-		start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount)
+	} else if galMulGen, _, ok := r.hasCodeGen(byteCount, len(inputs), len(outputs)); ok {
+		m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
+		start += (*galMulGen)(m, inputs, outputs, 0, byteCount)
 		r.putTmpSlice(m)
 		end = len(inputs[0])
-	} else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) {
-		var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64
+	} else if galMulGen, galMulGenXor, ok := r.hasCodeGen(byteCount, codeGenMaxInputs, codeGenMaxOutputs); len(inputs)+len(outputs) > codeGenMinShards && ok {
+		var gfni [codeGenMaxInputs * codeGenMaxOutputs]uint64
 		end = len(inputs[0])
 		inIdx := 0
 		m := r.getTmpSlice()
@@ -861,36 +846,29 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 		ins := inputs
 		for len(ins) > 0 {
 			inPer := ins
-			if len(inPer) > maxAvx2Inputs {
-				inPer = inPer[:maxAvx2Inputs]
+			if len(inPer) > codeGenMaxInputs {
+				inPer = inPer[:codeGenMaxInputs]
 			}
 			outs := outputs
 			outIdx := 0
 			for len(outs) > 0 {
 				outPer := outs
-				if len(outPer) > maxAvx2Outputs {
-					outPer = outPer[:maxAvx2Outputs]
+				if len(outPer) > codeGenMaxOutputs {
+					outPer = outPer[:codeGenMaxOutputs]
 				}
-				if r.o.useAvx512GFNI {
-					m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:])
-					if inIdx == 0 {
-						start = galMulSlicesGFNI(m, inPer, outPer, 0, byteCount)
-					} else {
-						start = galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount)
-					}
-				} else if r.o.useAvxGNFI {
+				if useGFNI {
 					m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:])
 					if inIdx == 0 {
-						start = galMulSlicesAvxGFNI(m, inPer, outPer, 0, byteCount)
+						start = (*galMulGFNI)(m, inPer, outPer, 0, byteCount)
 					} else {
-						start = galMulSlicesAvxGFNIXor(m, inPer, outPer, 0, byteCount)
+						start = (*galMulGFNIXor)(m, inPer, outPer, 0, byteCount)
 					}
 				} else {
-					m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
+					m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
 					if inIdx == 0 {
-						start = galMulSlicesAvx2(m, inPer, outPer, 0, byteCount)
+						start = (*galMulGen)(m, inPer, outPer, 0, byteCount)
 					} else {
-						start = galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount)
+						start = (*galMulGenXor)(m, inPer, outPer, 0, byteCount)
 					}
 				}
 				outIdx += len(outPer)
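The chunked branch above only fires when the shard counts exceed the per-call kernel limits. A minimal usage sketch that exercises it through the public API (the shard counts are illustrative and assume the limit of 10 inputs noted earlier):

    package main

    import (
        "log"

        "github.com/klauspost/reedsolomon"
    )

    func main() {
        // 17 data shards exceed the assumed codeGenMaxInputs of 10, so the
        // encoder walks the inputs in groups: the first group uses the plain
        // kernel, later groups accumulate into parity via the Xor kernel.
        enc, err := reedsolomon.New(17, 3)
        if err != nil {
            log.Fatal(err)
        }
        shards := reedsolomon.AllocAligned(20, 64<<10) // 17 data + 3 parity, 64 KiB each
        if err := enc.Encode(shards); err != nil {
            log.Fatal(err)
        }
        if ok, err := enc.Verify(shards); err != nil || !ok {
            log.Fatalln("verify failed:", ok, err)
        }
    }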
@@ -928,27 +906,27 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 	var wg sync.WaitGroup
 	gor := r.o.maxGoroutines
 
-	var avx2Matrix []byte
+	var genMatrix []byte
 	var gfniMatrix []uint64
-	useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs))
-	useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs))
+	galMulGen, _, useCodeGen := r.hasCodeGen(byteCount, len(inputs), len(outputs))
+	galMulGFNI, _, useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs))
 	if useGFNI {
-		var tmp [maxAvx2Inputs * maxAvx2Outputs]uint64
+		var tmp [codeGenMaxInputs * codeGenMaxOutputs]uint64
 		gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:])
-	} else if useAvx2 {
-		avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
-		defer r.putTmpSlice(avx2Matrix)
-	} else if (r.o.useAvx512GFNI || r.o.useAvxGNFI) && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
-		r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
+	} else if useCodeGen {
+		genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
+		defer r.putTmpSlice(genMatrix)
+	} else if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); useGFNI &&
+		byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards {
 		// It appears there is a switchover point at around 10MB where
 		// Regular processing is faster...
-		r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true)
+		r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true, galMulGFNI, galMulGFNIXor)
 		return
-	} else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
-		r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
+	} else if galMulGen, galMulGenXor, ok := r.hasCodeGen(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); ok &&
+		byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards {
 		// It appears there is a switchover point at around 10MB where
 		// Regular processing is faster...
-		r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true)
+		r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true, galMulGen, galMulGenXor)
 		return
 	}
 
@@ -960,13 +938,9 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 	exec := func(start, stop int) {
 		if stop-start >= 64 {
 			if useGFNI {
-				if r.o.useAvx512GFNI {
-					start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop)
-				} else {
-					start += galMulSlicesAvxGFNI(gfniMatrix, inputs, outputs, start, stop)
-				}
-			} else if useAvx2 {
-				start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
+				start += (*galMulGFNI)(gfniMatrix, inputs, outputs, start, stop)
+			} else if useCodeGen {
+				start += (*galMulGen)(genMatrix, inputs, outputs, start, stop)
 			}
 		}
 
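codeSomeShardsP splits the byte range across r.o.maxGoroutines workers, which is why New clamps that value for the code-gen paths. The caps can also be steered from the outside with the existing public options; a short sketch (values illustrative):

    package main

    import (
        "log"

        "github.com/klauspost/reedsolomon"
    )

    func main() {
        enc, err := reedsolomon.New(10, 4,
            // Hard cap, comparable to the internal gfniCodeGenMaxGoroutines clamp.
            reedsolomon.WithMaxGoroutines(4),
            // Or let the encoder size goroutines for an expected shard size.
            reedsolomon.WithAutoGoroutines(64<<10),
        )
        if err != nil {
            log.Fatal(err)
        }
        _ = enc
    }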
@@ -1017,7 +991,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 // Perform the same as codeSomeShards, but split the workload into
 // several goroutines.
 // If clear is set, the first write will overwrite the output.
-func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) {
+func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool, galMulGen, galMulGenXor *func(matrix []byte, in [][]byte, out [][]byte, start int, stop int) int) {
 	var wg sync.WaitGroup
 	gor := r.o.maxGoroutines
 
@@ -1028,7 +1002,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 		first  bool
 	}
 	// Make a plan...
-	plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs))
+	plan := make([]state, 0, ((len(inputs)+codeGenMaxInputs-1)/codeGenMaxInputs)*((len(outputs)+codeGenMaxOutputs-1)/codeGenMaxOutputs))
 
 	tmp := r.getTmpSlice()
 	defer r.putTmpSlice(tmp)
@@ -1040,18 +1014,18 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 		ins := inputs
 		for len(ins) > 0 {
 			inPer := ins
-			if len(inPer) > maxAvx2Inputs {
-				inPer = inPer[:maxAvx2Inputs]
+			if len(inPer) > codeGenMaxInputs {
+				inPer = inPer[:codeGenMaxInputs]
 			}
 			outs := outputs
 			outIdx := 0
 			for len(outs) > 0 {
 				outPer := outs
-				if len(outPer) > maxAvx2Outputs {
-					outPer = outPer[:maxAvx2Outputs]
+				if len(outPer) > codeGenMaxOutputs {
+					outPer = outPer[:codeGenMaxOutputs]
 				}
 				// Generate local matrix
-				m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
+				m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
 				tmp = tmp[len(m):]
 				plan = append(plan, state{
 					input:  inPer,
@@ -1070,19 +1044,19 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 		outIdx := 0
 		for len(outs) > 0 {
 			outPer := outs
-			if len(outPer) > maxAvx2Outputs {
-				outPer = outPer[:maxAvx2Outputs]
+			if len(outPer) > codeGenMaxOutputs {
+				outPer = outPer[:codeGenMaxOutputs]
 			}
 			inIdx := 0
 			ins := inputs
 			for len(ins) > 0 {
 				inPer := ins
-				if len(inPer) > maxAvx2Inputs {
-					inPer = inPer[:maxAvx2Inputs]
+				if len(inPer) > codeGenMaxInputs {
+					inPer = inPer[:codeGenMaxInputs]
 				}
 				// Generate local matrix
-				m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
+				m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
 				tmp = tmp[len(m):]
 				//fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound)
 				plan = append(plan, state{
@@ -1111,14 +1085,14 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 			lstop = stop
 		}
 		for lstart < stop {
-			if lstop-lstart >= minAvx2Size {
+			if galMulGen != nil && galMulGenXor != nil && lstop-lstart >= minCodeGenSize {
 				// Execute plan...
 				var n int
 				for _, p := range plan {
 					if p.first {
-						n = galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop)
+						n = (*galMulGen)(p.m, p.input, p.output, lstart, lstop)
 					} else {
-						n = galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop)
+						n = (*galMulGenXor)(p.m, p.input, p.output, lstart, lstop)
 					}
 				}
 				lstart += n
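The nil guard added to the plan loop above reappears in codeSomeShardsGFNI below: callers that cannot supply kernels (EncodeIdx passes nil, nil) simply fall through to the generic path. Reduced to its essentials, with hypothetical names:

    package sketch

    // processRange runs the fast kernel when one was supplied and the span is
    // long enough; otherwise it reports no progress so the caller falls back
    // to the generic per-byte path for [lstart, lstop).
    func processRange(kernel *func(m []byte, in, out [][]byte, start, stop int) int,
        m []byte, in, out [][]byte, lstart, lstop, minSize int) int {
        if kernel != nil && lstop-lstart >= minSize {
            return (*kernel)(m, in, out, lstart, lstop) // bytes handled by the kernel
        }
        return 0
    }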
@@ -1172,7 +1146,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 // Perform the same as codeSomeShards, but split the workload into
 // several goroutines.
 // If clear is set, the first write will overwrite the output.
-func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) {
+func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool, galMulGFNI, galMulGFNIXor *func(matrix []uint64, in, out [][]byte, start, stop int) int) {
 	var wg sync.WaitGroup
 	gor := r.o.maxGoroutines
 
@@ -1183,7 +1157,7 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 		first  bool
 	}
 	// Make a plan...
-	plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs))
+	plan := make([]state, 0, ((len(inputs)+codeGenMaxInputs-1)/codeGenMaxInputs)*((len(outputs)+codeGenMaxOutputs-1)/codeGenMaxOutputs))
 
 	// Flips between input first to output first.
 	// We put the smallest data load in the inner loop.
@@ -1192,15 +1166,15 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 		ins := inputs
 		for len(ins) > 0 {
 			inPer := ins
-			if len(inPer) > maxAvx2Inputs {
-				inPer = inPer[:maxAvx2Inputs]
+			if len(inPer) > codeGenMaxInputs {
+				inPer = inPer[:codeGenMaxInputs]
 			}
 			outs := outputs
 			outIdx := 0
 			for len(outs) > 0 {
 				outPer := outs
-				if len(outPer) > maxAvx2Outputs {
-					outPer = outPer[:maxAvx2Outputs]
+				if len(outPer) > codeGenMaxOutputs {
+					outPer = outPer[:codeGenMaxOutputs]
 				}
 				// Generate local matrix
 				m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer)))
@@ -1221,16 +1195,16 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 		outIdx := 0
 		for len(outs) > 0 {
 			outPer := outs
-			if len(outPer) > maxAvx2Outputs {
-				outPer = outPer[:maxAvx2Outputs]
+			if len(outPer) > codeGenMaxOutputs {
+				outPer = outPer[:codeGenMaxOutputs]
 			}
 			inIdx := 0
 			ins := inputs
 			for len(ins) > 0 {
 				inPer := ins
-				if len(inPer) > maxAvx2Inputs {
-					inPer = inPer[:maxAvx2Inputs]
+				if len(inPer) > codeGenMaxInputs {
+					inPer = inPer[:codeGenMaxInputs]
 				}
 				// Generate local matrix
 				m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer)))
@@ -1261,24 +1235,14 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b
 			lstop = stop
 		}
 		for lstart < stop {
-			if lstop-lstart >= minAvx2Size {
+			if galMulGFNI != nil && galMulGFNIXor != nil && lstop-lstart >= minCodeGenSize {
 				// Execute plan...
 				var n int
-				if r.o.useAvx512GFNI {
-					for _, p := range plan {
-						if p.first {
-							n = galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop)
-						} else {
-							n = galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop)
-						}
-					}
-				} else {
-					for _, p := range plan {
-						if p.first {
-							n = galMulSlicesAvxGFNI(p.m, p.input, p.output, lstart, lstop)
-						} else {
-							n = galMulSlicesAvxGFNIXor(p.m, p.input, p.output, lstart, lstop)
-						}
+				for _, p := range plan {
+					if p.first {
+						n = (*galMulGFNI)(p.m, p.input, p.output, lstart, lstop)
+					} else {
+						n = (*galMulGFNIXor)(p.m, p.input, p.output, lstart, lstop)
 					}
 				}
 				lstart += n