Skip to content

Commit 0195659

Browse files
author
Ard Biesheuvel
committed
ARM: crypto: add NEON accelerated XOR implementation
Add a source file xor-neon.c (which is really just the reference C implementation passed through the GCC vectorizer) and hook it up to the XOR framework.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Nicolas Pitre <nico@linaro.org>
1 parent 73c132c commit 0195659

File tree

3 files changed

+121
-0
lines changed

3 files changed

+121
-0
lines changed

arch/arm/include/asm/xor.h

+73
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77
* it under the terms of the GNU General Public License version 2 as
88
* published by the Free Software Foundation.
99
*/
10+
#include <linux/hardirq.h>
1011
#include <asm-generic/xor.h>
12+
#include <asm/hwcap.h>
13+
#include <asm/neon.h>
1114

1215
#define __XOR(a1, a2) a1 ^= a2
1316

@@ -138,4 +141,74 @@ static struct xor_block_template xor_block_arm4regs = {
138141
xor_speed(&xor_block_arm4regs); \
139142
xor_speed(&xor_block_8regs); \
140143
xor_speed(&xor_block_32regs); \
144+
NEON_TEMPLATES; \
141145
} while (0)
146+
147+
#ifdef CONFIG_KERNEL_MODE_NEON
148+
149+
/*
 * Template whose do_2..do_5 members are called by the xor_neon_*()
 * wrappers below; it is only declared here, the definition lives in
 * another translation unit.
 */
extern struct xor_block_template const xor_block_neon_inner;
150+
151+
/*
 * do_2 handler of the "neon" template.
 *
 * When in_interrupt() is true the arguments are handed unchanged to
 * xor_arm4regs_2(); otherwise they are handed to
 * xor_block_neon_inner.do_2() between kernel_neon_begin() and
 * kernel_neon_end().
 *
 * NOTE(review): presumably the fallback exists because kernel-mode NEON
 * must not be used from interrupt context -- confirm against the
 * kernel_neon_begin() documentation.
 */
static void
152+
xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
153+
{
154+
if (in_interrupt()) {
155+
xor_arm4regs_2(bytes, p1, p2);
156+
} else {
157+
kernel_neon_begin();
158+
xor_block_neon_inner.do_2(bytes, p1, p2);
159+
kernel_neon_end();
160+
}
161+
}
162+
163+
/*
 * do_3 handler of the "neon" template.
 *
 * When in_interrupt() is true the arguments are handed unchanged to
 * xor_arm4regs_3(); otherwise they are handed to
 * xor_block_neon_inner.do_3() between kernel_neon_begin() and
 * kernel_neon_end().
 */
static void
164+
xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
165+
unsigned long *p3)
166+
{
167+
if (in_interrupt()) {
168+
xor_arm4regs_3(bytes, p1, p2, p3);
169+
} else {
170+
kernel_neon_begin();
171+
xor_block_neon_inner.do_3(bytes, p1, p2, p3);
172+
kernel_neon_end();
173+
}
174+
}
175+
176+
/*
 * do_4 handler of the "neon" template.
 *
 * When in_interrupt() is true the arguments are handed unchanged to
 * xor_arm4regs_4(); otherwise they are handed to
 * xor_block_neon_inner.do_4() between kernel_neon_begin() and
 * kernel_neon_end().
 */
static void
177+
xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
178+
unsigned long *p3, unsigned long *p4)
179+
{
180+
if (in_interrupt()) {
181+
xor_arm4regs_4(bytes, p1, p2, p3, p4);
182+
} else {
183+
kernel_neon_begin();
184+
xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
185+
kernel_neon_end();
186+
}
187+
}
188+
189+
/*
 * do_5 handler of the "neon" template.
 *
 * When in_interrupt() is true the arguments are handed unchanged to
 * xor_arm4regs_5(); otherwise they are handed to
 * xor_block_neon_inner.do_5() between kernel_neon_begin() and
 * kernel_neon_end().
 */
static void
190+
xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
191+
unsigned long *p3, unsigned long *p4, unsigned long *p5)
192+
{
193+
if (in_interrupt()) {
194+
xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
195+
} else {
196+
kernel_neon_begin();
197+
xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
198+
kernel_neon_end();
199+
}
200+
}
201+
202+
/*
 * The "neon" template: its do_2..do_5 slots are the xor_neon_*()
 * wrappers defined above. This is the object NEON_TEMPLATES passes to
 * xor_speed().
 */
static struct xor_block_template xor_block_neon = {
203+
.name = "neon",
204+
.do_2 = xor_neon_2,
205+
.do_3 = xor_neon_3,
206+
.do_4 = xor_neon_4,
207+
.do_5 = xor_neon_5
208+
};
209+
210+
/*
 * With CONFIG_KERNEL_MODE_NEON: pass the "neon" template to xor_speed(),
 * but only when cpu_has_neon() reports true at run time.
 */
#define NEON_TEMPLATES \
211+
do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0)
212+
#else
213+
/* Without CONFIG_KERNEL_MODE_NEON the macro expands to nothing. */
#define NEON_TEMPLATES
214+
#endif

arch/arm/lib/Makefile

+6
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,9 @@ lib-$(CONFIG_ARCH_SHARK) += io-shark.o
4545

4646
$(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S
4747
$(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S
48+
49+
# Build xor-neon.o only when CONFIG_KERNEL_MODE_NEON=y, and compile that one
# object with the NEON flags that the #error check in xor-neon.c asks for.
# NOTE(review): the object is added via lib-$(CONFIG_XOR_BLOCKS); confirm
# this still links correctly when CONFIG_XOR_BLOCKS=m -- TODO verify.
ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
50+
NEON_FLAGS := -mfloat-abi=softfp -mfpu=neon
51+
CFLAGS_xor-neon.o += $(NEON_FLAGS)
52+
lib-$(CONFIG_XOR_BLOCKS) += xor-neon.o
53+
endif

arch/arm/lib/xor-neon.c

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* linux/arch/arm/lib/xor-neon.c
3+
*
4+
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5+
*
6+
* This program is free software; you can redistribute it and/or modify
7+
* it under the terms of the GNU General Public License version 2 as
8+
* published by the Free Software Foundation.
9+
*/
10+
11+
#include <linux/raid/xor.h>
12+
13+
#ifndef __ARM_NEON__
14+
#error You should compile this file with '-mfloat-abi=softfp -mfpu=neon'
15+
#endif
16+
17+
/*
18+
* Pull in the reference implementations while instructing GCC (through
19+
* -ftree-vectorize) to attempt to exploit implicit parallelism and emit
20+
* NEON instructions.
21+
*/
22+
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
23+
#pragma GCC optimize "tree-vectorize"
24+
#else
25+
/*
26+
* While older versions of GCC do not generate incorrect code, they fail to
27+
* recognize the parallel nature of these functions, and emit plain ARM code,
28+
* which is known to be slower than the optimized ARM code in arch/arm/include/asm/xor.h.
29+
*/
30+
#warning This code requires at least version 4.6 of GCC
31+
#endif
32+
33+
#pragma GCC diagnostic ignored "-Wunused-variable"
34+
#include <asm-generic/xor.h>
35+
36+
/*
 * Template exposing the xor_8regs_*() routines pulled in from
 * <asm-generic/xor.h> above, as compiled in this file. It has external
 * linkage and is const; the "__inner_neon__" name marks it as the inner
 * implementation rather than a template of its own.
 */
struct xor_block_template const xor_block_neon_inner = {
37+
.name = "__inner_neon__",
38+
.do_2 = xor_8regs_2,
39+
.do_3 = xor_8regs_3,
40+
.do_4 = xor_8regs_4,
41+
.do_5 = xor_8regs_5,
42+
};

0 commit comments

Comments
 (0)