From 0b50a155aace08a159882ae7442c1499bbdc833d Mon Sep 17 00:00:00 2001
From: Robert Hallberg <Robert.Hallberg@noaa.gov>
Date: Fri, 1 Mar 2024 12:45:21 -0500
Subject: [PATCH] (*)Rearrange calc_kappa_shear_vertex for FMAs

  Added mathematically equivalent rearrangements of the code in
calc_kappa_shear_vertex that interpolates velocities, temperatures and
salinities to the vertices to expose the mask variables while ensuring that the
other multiplications occur within parentheses so that they will exhibit
rotational symmetry when fused-multiply-adds are enabled.  FMAs can still occur,
but it will be multiplication by the 0-or-1 masks that are fused with an
addition.  Also added parentheses to 3 expressions calculating the squared shear
in calculate_projected_state for rotational symmetry with FMAs.  All answers are
bitwise identical in cases without FMAs, but answers could change when FMAs are
enabled.
---
 .../vertical/MOM_kappa_shear.F90              | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/parameterizations/vertical/MOM_kappa_shear.F90 b/src/parameterizations/vertical/MOM_kappa_shear.F90
index d44ab79d1b..536d25e595 100644
--- a/src/parameterizations/vertical/MOM_kappa_shear.F90
+++ b/src/parameterizations/vertical/MOM_kappa_shear.F90
@@ -442,26 +442,26 @@ subroutine Calc_kappa_shear_vertex(u_in, v_in, h, T_in, S_in, tv, p_surf, kappa_
 
     ! Interpolate the various quantities to the corners, using masks.
     do k=1,nz ; do I=IsB,IeB
-      u_2d(I,k) = (u_in(I,j,k)   * (G%mask2dCu(I,j)   * (h(i,j,k)   + h(i+1,j,k))) + &
-                   u_in(I,j+1,k) * (G%mask2dCu(I,j+1) * (h(i,j+1,k) + h(i+1,j+1,k))) ) / &
+      u_2d(I,k) = (G%mask2dCu(I,j)   * (u_in(I,j,k)   * (h(i,j,k)   + h(i+1,j,k))) + &
+                   G%mask2dCu(I,j+1) * (u_in(I,j+1,k) * (h(i,j+1,k) + h(i+1,j+1,k))) ) / &
                   ((G%mask2dCu(I,j)   * (h(i,j,k)   + h(i+1,j,k)) + &
                     G%mask2dCu(I,j+1) * (h(i,j+1,k) + h(i+1,j+1,k))) + GV%H_subroundoff)
-      v_2d(I,k) = (v_in(i,J,k)   * (G%mask2dCv(i,J)   * (h(i,j,k)   + h(i,j+1,k))) + &
-                   v_in(i+1,J,k) * (G%mask2dCv(i+1,J) * (h(i+1,j,k) + h(i+1,j+1,k))) ) / &
+      v_2d(I,k) = (G%mask2dCv(i,J)   * (v_in(i,J,k)   * (h(i,j,k)   + h(i,j+1,k))) + &
+                   G%mask2dCv(i+1,J) * (v_in(i+1,J,k) * (h(i+1,j,k) + h(i+1,j+1,k))) ) / &
                   ((G%mask2dCv(i,J)   * (h(i,j,k)   + h(i,j+1,k)) + &
                     G%mask2dCv(i+1,J) * (h(i+1,j,k) + h(i+1,j+1,k))) + GV%H_subroundoff)
       I_hwt = 1.0 / (((G%mask2dT(i,j) * h(i,j,k) + G%mask2dT(i+1,j+1) * h(i+1,j+1,k)) + &
                       (G%mask2dT(i+1,j) * h(i+1,j,k) + G%mask2dT(i,j+1) * h(i,j+1,k))) + &
                      GV%H_subroundoff)
       if (use_temperature) then
-        T_2d(I,k) = ( ((G%mask2dT(i,j) * h(i,j,k)) * T_in(i,j,k) + &
-                       (G%mask2dT(i+1,j+1) * h(i+1,j+1,k)) * T_in(i+1,j+1,k)) + &
-                      ((G%mask2dT(i+1,j) * h(i+1,j,k)) * T_in(i+1,j,k) + &
-                       (G%mask2dT(i,j+1) * h(i,j+1,k)) * T_in(i,j+1,k)) ) * I_hwt
-        S_2d(I,k) = ( ((G%mask2dT(i,j) * h(i,j,k)) * S_in(i,j,k) + &
-                       (G%mask2dT(i+1,j+1) * h(i+1,j+1,k)) * S_in(i+1,j+1,k)) + &
-                      ((G%mask2dT(i+1,j) * h(i+1,j,k)) * S_in(i+1,j,k) + &
-                       (G%mask2dT(i,j+1) * h(i,j+1,k)) * S_in(i,j+1,k)) ) * I_hwt
+        T_2d(I,k) = ( (G%mask2dT(i,j) * (h(i,j,k) * T_in(i,j,k)) + &
+                       G%mask2dT(i+1,j+1) * (h(i+1,j+1,k) * T_in(i+1,j+1,k))) + &
+                      (G%mask2dT(i+1,j) * (h(i+1,j,k) * T_in(i+1,j,k)) + &
+                       G%mask2dT(i,j+1) * (h(i,j+1,k) * T_in(i,j+1,k))) ) * I_hwt
+        S_2d(I,k) = ( (G%mask2dT(i,j) * (h(i,j,k) * S_in(i,j,k)) + &
+                       G%mask2dT(i+1,j+1) * (h(i+1,j+1,k) * S_in(i+1,j+1,k))) + &
+                      (G%mask2dT(i+1,j) * (h(i+1,j,k) * S_in(i+1,j,k)) + &
+                       G%mask2dT(i,j+1) * (h(i,j+1,k) * S_in(i,j+1,k))) ) * I_hwt
       endif
       h_2d(I,k) = ((G%mask2dT(i,j) * h(i,j,k) + G%mask2dT(i+1,j+1) * h(i+1,j+1,k)) + &
                    (G%mask2dT(i+1,j) * h(i+1,j,k) + G%mask2dT(i,j+1) * h(i,j+1,k)) ) / &
@@ -472,8 +472,8 @@ subroutine Calc_kappa_shear_vertex(u_in, v_in, h, T_in, S_in, tv, p_surf, kappa_
                    ((G%mask2dT(i,j) + G%mask2dT(i+1,j+1)) + &
                     (G%mask2dT(i+1,j) + G%mask2dT(i,j+1)) + 1.0e-36 )
 !      h_2d(I,k) = 0.25*((h(i,j,k) + h(i+1,j+1,k)) + (h(i+1,j,k) + h(i,j+1,k)))
-!      h_2d(I,k) = ((h(i,j,k)**2 + h(i+1,j+1,k)**2) + &
-!                   (h(i+1,j,k)**2 + h(i,j+1,k)**2)) * I_hwt
+!      h_2d(I,k) = (((h(i,j,k)**2) + (h(i+1,j+1,k)**2)) + &
+!                   ((h(i+1,j,k)**2) + (h(i,j+1,k)**2))) * I_hwt
     enddo ; enddo
     if (.not.use_temperature) then ; do k=1,nz ; do I=IsB,IeB
       rho_2d(I,k) = GV%Rlay(k)
@@ -1224,12 +1224,12 @@ subroutine calculate_projected_state(kappa, u0, v0, T0, S0, dt, nz, dz, I_dz_int
   ! Store the squared shear at interfaces
   S2(1) = 0.0 ; S2(nz+1) = 0.0
   if (ks > 1) &
-    S2(ks) = ((u(ks)-u0(ks-1))**2 + (v(ks)-v0(ks-1))**2) * (US%L_to_Z*I_dz_int(ks))**2
+    S2(ks) = (((u(ks)-u0(ks-1))**2) + ((v(ks)-v0(ks-1))**2)) * (US%L_to_Z*I_dz_int(ks))**2
   do K=ks+1,ke
-    S2(K) = ((u(k)-u(k-1))**2 + (v(k)-v(k-1))**2) * (US%L_to_Z*I_dz_int(K))**2
+    S2(K) = (((u(k)-u(k-1))**2) + ((v(k)-v(k-1))**2)) * (US%L_to_Z*I_dz_int(K))**2
   enddo
   if (ke<nz) &
-    S2(ke+1) = ((u0(ke+1)-u(ke))**2 + (v0(ke+1)-v(ke))**2) * (US%L_to_Z*I_dz_int(ke+1))**2
+    S2(ke+1) = (((u0(ke+1)-u(ke))**2) + ((v0(ke+1)-v(ke))**2)) * (US%L_to_Z*I_dz_int(ke+1))**2
 
   ! Store the buoyancy frequency at interfaces
   N2(1) = 0.0 ; N2(nz+1) = 0.0