@@ -229,6 +229,125 @@ do { \
 	} \
 })
 
+/*
+ * Add return operation
+ */
+#define percpu_add_return_op(var, val) \
+({ \
+    typeof(var) paro_ret__ = val; \
+    switch (sizeof(var)) { \
+    case 1: \
+        asm("xaddb %0, "__percpu_arg(1) \
+            : "+q" (paro_ret__), "+m" (var) \
+            : : "memory"); \
+        break; \
+    case 2: \
+        asm("xaddw %0, "__percpu_arg(1) \
+            : "+r" (paro_ret__), "+m" (var) \
+            : : "memory"); \
+        break; \
+    case 4: \
+        asm("xaddl %0, "__percpu_arg(1) \
+            : "+r" (paro_ret__), "+m" (var) \
+            : : "memory"); \
+        break; \
+    case 8: \
+        asm("xaddq %0, "__percpu_arg(1) \
+            : "+re" (paro_ret__), "+m" (var) \
+            : : "memory"); \
+        break; \
+    default: __bad_percpu_size(); \
+    } \
+    paro_ret__ += val; \
+    paro_ret__; \
+})
+
+/*
+ * xchg is implemented using cmpxchg without a lock prefix. xchg is
+ * expensive due to the implied lock prefix. The processor cannot prefetch
+ * cachelines if xchg is used.
+ */
+#define percpu_xchg_op(var, nval) \
+({ \
+    typeof(var) pxo_ret__; \
+    typeof(var) pxo_new__ = (nval); \
+    switch (sizeof(var)) { \
+    case 1: \
+        asm("\n\tmov "__percpu_arg(1)",%%al" \
+            "\n1:\tcmpxchgb %2, "__percpu_arg(1) \
+            "\n\tjnz 1b" \
+            : "=&a" (pxo_ret__), "+m" (var) \
+            : "q" (pxo_new__) \
+            : "memory"); \
+        break; \
+    case 2: \
+        asm("\n\tmov "__percpu_arg(1)",%%ax" \
+            "\n1:\tcmpxchgw %2, "__percpu_arg(1) \
+            "\n\tjnz 1b" \
+            : "=&a" (pxo_ret__), "+m" (var) \
+            : "r" (pxo_new__) \
+            : "memory"); \
+        break; \
+    case 4: \
+        asm("\n\tmov "__percpu_arg(1)",%%eax" \
+            "\n1:\tcmpxchgl %2, "__percpu_arg(1) \
+            "\n\tjnz 1b" \
+            : "=&a" (pxo_ret__), "+m" (var) \
+            : "r" (pxo_new__) \
+            : "memory"); \
+        break; \
+    case 8: \
+        asm("\n\tmov "__percpu_arg(1)",%%rax" \
+            "\n1:\tcmpxchgq %2, "__percpu_arg(1) \
+            "\n\tjnz 1b" \
+            : "=&a" (pxo_ret__), "+m" (var) \
+            : "r" (pxo_new__) \
+            : "memory"); \
+        break; \
+    default: __bad_percpu_size(); \
+    } \
+    pxo_ret__; \
+})
+
+/*
+ * cmpxchg has no such implied lock semantics; as a result it is much
+ * more efficient for cpu-local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval) \
+({ \
+    typeof(var) pco_ret__; \
+    typeof(var) pco_old__ = (oval); \
+    typeof(var) pco_new__ = (nval); \
+    switch (sizeof(var)) { \
+    case 1: \
+        asm("cmpxchgb %2, "__percpu_arg(1) \
+            : "=a" (pco_ret__), "+m" (var) \
+            : "q" (pco_new__), "0" (pco_old__) \
+            : "memory"); \
+        break; \
+    case 2: \
+        asm("cmpxchgw %2, "__percpu_arg(1) \
+            : "=a" (pco_ret__), "+m" (var) \
+            : "r" (pco_new__), "0" (pco_old__) \
+            : "memory"); \
+        break; \
+    case 4: \
+        asm("cmpxchgl %2, "__percpu_arg(1) \
+            : "=a" (pco_ret__), "+m" (var) \
+            : "r" (pco_new__), "0" (pco_old__) \
+            : "memory"); \
+        break; \
+    case 8: \
+        asm("cmpxchgq %2, "__percpu_arg(1) \
+            : "=a" (pco_ret__), "+m" (var) \
+            : "r" (pco_new__), "0" (pco_old__) \
+            : "memory"); \
+        break; \
+    default: __bad_percpu_size(); \
+    } \
+    pco_ret__; \
+})
+
 /*
  * percpu_read() makes gcc load the percpu variable every time it is
  * accessed while percpu_read_stable() allows the value to be cached.
@@ -267,6 +386,12 @@ do { \
 #define __this_cpu_xor_1(pcp, val)    percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_2(pcp, val)    percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_4(pcp, val)    percpu_to_op("xor", (pcp), val)
+/*
+ * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
+ * faster than an xchg with forced lock semantics.
+ */
+#define __this_cpu_xchg_8(pcp, nval)    percpu_xchg_op(pcp, nval)
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
 
 #define this_cpu_read_1(pcp)    percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_read_2(pcp)    percpu_from_op("mov", (pcp), "m"(pcp))
@@ -286,6 +411,11 @@ do { \
 #define this_cpu_xor_1(pcp, val)    percpu_to_op("xor", (pcp), val)
 #define this_cpu_xor_2(pcp, val)    percpu_to_op("xor", (pcp), val)
 #define this_cpu_xor_4(pcp, val)    percpu_to_op("xor", (pcp), val)
+#define this_cpu_xchg_1(pcp, nval)    percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)    percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)    percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)    percpu_xchg_op(pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
 
 #define irqsafe_cpu_add_1(pcp, val)    percpu_add_op((pcp), val)
 #define irqsafe_cpu_add_2(pcp, val)    percpu_add_op((pcp), val)
@@ -299,6 +429,31 @@ do { \
 #define irqsafe_cpu_xor_1(pcp, val)    percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_2(pcp, val)    percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_4(pcp, val)    percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xchg_1(pcp, nval)    percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_2(pcp, nval)    percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_4(pcp, nval)    percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_8(pcp, nval)    percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
+
+#ifndef CONFIG_M386
+#define __this_cpu_add_return_1(pcp, val)    percpu_add_return_op(pcp, val)
+#define __this_cpu_add_return_2(pcp, val)    percpu_add_return_op(pcp, val)
+#define __this_cpu_add_return_4(pcp, val)    percpu_add_return_op(pcp, val)
+#define __this_cpu_cmpxchg_1(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
+
+#define this_cpu_add_return_1(pcp, val)    percpu_add_return_op(pcp, val)
+#define this_cpu_add_return_2(pcp, val)    percpu_add_return_op(pcp, val)
+#define this_cpu_add_return_4(pcp, val)    percpu_add_return_op(pcp, val)
+#define this_cpu_cmpxchg_1(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
+
+#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)    percpu_cmpxchg_op(pcp, oval, nval)
+#endif /* !CONFIG_M386 */
 
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -311,19 +466,20 @@ do { \
 #define __this_cpu_and_8(pcp, val)    percpu_to_op("and", (pcp), val)
 #define __this_cpu_or_8(pcp, val)    percpu_to_op("or", (pcp), val)
 #define __this_cpu_xor_8(pcp, val)    percpu_to_op("xor", (pcp), val)
+#define __this_cpu_add_return_8(pcp, val)    percpu_add_return_op(pcp, val)
 
 #define this_cpu_read_8(pcp)    percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_write_8(pcp, val)    percpu_to_op("mov", (pcp), val)
 #define this_cpu_add_8(pcp, val)    percpu_add_op((pcp), val)
 #define this_cpu_and_8(pcp, val)    percpu_to_op("and", (pcp), val)
 #define this_cpu_or_8(pcp, val)    percpu_to_op("or", (pcp), val)
 #define this_cpu_xor_8(pcp, val)    percpu_to_op("xor", (pcp), val)
+#define this_cpu_add_return_8(pcp, val)    percpu_add_return_op(pcp, val)
 
 #define irqsafe_cpu_add_8(pcp, val)    percpu_add_op((pcp), val)
 #define irqsafe_cpu_and_8(pcp, val)    percpu_to_op("and", (pcp), val)
 #define irqsafe_cpu_or_8(pcp, val)    percpu_to_op("or", (pcp), val)
 #define irqsafe_cpu_xor_8(pcp, val)    percpu_to_op("xor", (pcp), val)
-
 #endif
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
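
To make the xchg comment in the first hunk concrete, here is a minimal user-space model of the same cmpxchg loop, with the __percpu_arg() segment prefix omitted and 64-bit operands assumed. It is only a sketch: without a lock prefix the sequence is not atomic across CPUs, which is exactly why the kernel restricts it to per-cpu data that only the local CPU touches.

/* Sketch only: models the percpu_xchg_op() loop, minus the %gs/%fs prefix. */
static inline unsigned long xchg_via_cmpxchg(unsigned long *slot,
                                             unsigned long nval)
{
        unsigned long old;

        asm("mov %1, %0\n"              /* old = *slot                     */
            "1:\tcmpxchgq %2, %1\n\t"   /* if (*slot == old) *slot = nval  */
            "jnz 1b"                    /* else old = *slot, retry         */
            : "=&a" (old), "+m" (*slot)
            : "r" (nval)
            : "memory");
        return old;
}

On x86, xchg with a memory operand always behaves as if it carried a lock prefix, so even an "unlocked" xchg pays the full bus-locking cost; the loop above avoids that for data only one CPU ever touches.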
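
And a hypothetical usage sketch, not part of this patch: assuming the generic this_cpu_add(), this_cpu_add_return() and this_cpu_xchg() wrappers introduced elsewhere in this series, per-cpu statistics can be updated and drained without lock-prefixed instructions. All variable and function names below are illustrative only.

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, hypothetical_events);
static DEFINE_PER_CPU(unsigned long, hypothetical_pending);

/* Account an event; returns this CPU's running total (xadd, no lock). */
static unsigned long hypothetical_account(unsigned long amount)
{
        this_cpu_add(hypothetical_pending, amount);
        return this_cpu_add_return(hypothetical_events, amount);
}

/* Fetch and clear the pending count, atomically wrt. this CPU. */
static unsigned long hypothetical_drain_pending(void)
{
        return this_cpu_xchg(hypothetical_pending, 0);
}

Each helper maps onto the macros above: the cmpxchg loop simply retries if an interrupt modifies the variable in between, and the segment prefix binds the access to whichever CPU executes the instruction, so no lock prefix is needed.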