more simd

Refefer · Refefer · commit 213331be514d · 2025-02-14T14:25:02.000-05:00
diff --git a/src/graph.rs b/src/graph.rs
@@ -81,7 +81,6 @@ impl Graph {
             Entry::Vacant(entry) => {
                 let mut v = allocate_vec(grad.len());
                 v[..].clone_from_slice(grad);
-
                 entry.insert(v);
             }
         }
diff --git a/src/lib.rs b/src/lib.rs
@@ -380,3 +380,17 @@ impl MinimumOps for ANode {
 convert_binops!    { impl MinimumOps, minimum for ANode, ANode }
 forward_ref_binop! { impl MinimumOps, minimum for ANode, ANode }
 
+/// Unary operations that consume a value and return a new `ANode`.
+pub trait UnaryOps {
+    /// Returns a node for the square root of `self`
+    /// (the `ANode` impl builds a `SquareRoot` op).
+    fn sqrt(self) -> ANode;
+    /// Returns a node for the sine of `self`
+    /// (the `ANode` impl builds a `Sin` op).
+    fn sin(self) -> ANode;
+}
+
+// Method-style access to the unary ops: each method hands `self` to the
+// matching op constructor.
+impl UnaryOps for ANode {
+    // Wraps this node in a `SquareRoot` op.
+    fn sqrt(self) -> ANode {
+        SquareRoot::new(self)
+    }
+    // Wraps this node in a `Sin` op.
+    fn sin(self) -> ANode {
+        Sin::new(self)
+    }
+
+}
diff --git a/src/ops.rs b/src/ops.rs
@@ -272,9 +272,9 @@ macro_rules! run_unary_op {
         if left_len == out_len {
             $func(ArrayInput($left), ArrayOutput($out));
         } else if left_len == 1 {
-            $func(BroadcastInput($left[0], out_len), ArrayOutput($out));
+            $func(BroadcastInput($left, out_len), ArrayOutput($out));
         } else if out_len == 1 {
-            $func(ArrayInput($left[0], out_len), BroadcastOutput($out:tt, left_len));
+            $func(ArrayInput($left), BroadcastOutput($out, left_len));
         } else {
             panic!("Left length: {}, Output Length: {}", left_len, out_len);
         }
@@ -403,11 +403,11 @@ impl Subtract {
     }
 
     fn compute(left: &ANode, right: &ANode) -> MPVec {
-        let (lv, rv) = Broadcast::from_pair(left.value(), right.value());
-        let mut out = allocate_vec(lv.len);
-        out.iter_mut().zip(lv.zip(rv)).for_each(|(oi, (lvi, rvi))| {
-            *oi = lvi - rvi
-        });
+        let x = left.value();
+        let y = right.value();
+        let mut out = Broadcast::allocate_out(x, y);
+        let o = &mut out;
+        run_binary_op!(x, y, o, simd_sub);
         out
     }
 }
@@ -431,12 +431,12 @@ impl Node for Subtract {
     fn compute_grad(&self, grad: &[DType], child_grads: &mut [&mut [DType]]) {
         // f(x,y) = x - y
         // df(x,y)/dx = 1
-        // df(x,y)/dy = -1
-        let mut out = Updater::new(&mut child_grads[0], grad.len());
-        grad.iter().for_each(|gi| out.add(*gi));
+        let out = &mut child_grads[0];
+        run_unary_op!(grad, out, simd_iadd);
 
-        let mut out = Updater::new(&mut child_grads[1], grad.len());
-        grad.iter().for_each(|gi| out.add(-*gi));
+        // df(x,y)/dy = -1
+        let out = &mut child_grads[1];
+        run_unary_op!(grad, out, grad_sub_y);
     }
 
 }
@@ -539,21 +539,10 @@ impl Node for Divide {
         let out = &mut child_grads[0];
         run_binary_op!(grad, y, out, grad_div_x);
 
-        /*
-        let ly  = Broadcast::sized(y, child_grads[0].len());
-        let mut out = Updater::new(&mut child_grads[0], grad.len());
-        grad.iter().zip(ly).for_each(|(gi, yi)| out.add(*gi / *yi));
-        */
-
         let out = &mut child_grads[1];
+        // df(x,y)/dy = -x / y ^ 2
         run_trinary_op!(grad, x, y, out, grad_div_y);
 
-        // df(x,y)/dy = -x / y ^ 2
-        /*
-        let (lx, ly) = Broadcast::from_pair(x, y);
-        let mut out = Updater::new(&mut child_grads[1], lx.len);
-        grad.iter().zip(lx.zip(ly)).for_each(|(gi, (xi, yi))| out.add(*gi * -*xi / yi.powf(2f32)));
-        */
     }
 
 }
@@ -664,7 +653,7 @@ impl Node for SquareRoot {
     fn requires_grad(&self) -> bool { false }
 
     fn compute_grad(&self, grad: &[DType], child_grads: &mut [&mut [DType]]) {
-        let x = self.1[0].value();
+        let x = self.value();
 
         // df(x)/dx = (1/2) / x ^ 0.5
         child_grads[0].iter_mut().zip(grad.iter().zip(x)).for_each(|(outi, (gi, xi))| {
@@ -905,7 +894,9 @@ impl Exp {
     fn compute(left: &ANode) -> MPVec {
         let lv = left.value();
         let mut out = allocate_vec(lv.len());
-        out.iter_mut().zip(lv.iter()).for_each(|(oi, lvi)| *oi = lvi.exp());
+        let o = &mut out;
+        run_unary_op!(lv, o, simd_exp);
+        //out.iter_mut().zip(lv.iter()).for_each(|(oi, lvi)| *oi = lvi.exp());
         out
     }
 
@@ -1314,6 +1305,21 @@ mod tests {
         assert_eq!(y_grad, &[3.]);
     }
 
+    // Checks the forward value and the input gradient of `SquareRoot`.
+    #[test]
+    fn test_sqrt() {
+        // Forward pass: sqrt([4, 9]) == [2, 3].
+        let x = Variable::new(vec![4., 9.]);
+        let res = SquareRoot::new(x.clone());
+        assert_eq!(res.value(), &[2., 3.]);
+
+        let mut graph = Graph::new();
+        graph.backward(&res);
+
+        // Expected gradients follow d/dx sqrt(x) = 1 / (2 * sqrt(x)),
+        // i.e. 1 / (2 * 2) and 1 / (2 * 3) for the two inputs.
+        let x_1_g = 1f32 / (2f32 * 2f32);
+        let x_2_g = 1f32 / (2f32 * 3f32);
+        let x_grad = graph.get_grad(&x).unwrap();
+        assert_eq!(x_grad, &[x_1_g, x_2_g]);
+    }
+
     #[test]
     fn test_div() {
         let x = Variable::new(vec![0., 1.]);
@@ -1466,7 +1472,7 @@ mod tests {
         let x = Variable::new(vec![1., 2., 3.]);
 
         let x_slice = x.slice(1, 2);
-        let mut out = x_slice * 2.;
+        let out = x_slice * 2.;
 
         let mut graph = Graph::new();
         graph.backward(&out);
diff --git a/src/vecops.rs b/src/vecops.rs
@@ -168,6 +168,55 @@ unsafe fn hsum_avx_ps(v: __m256) -> f32 {
     _mm_cvtss_f32(sum_128)
 }
 
+/// Approximates `exp(x)` independently for each of the eight `f32` lanes of `x`.
+///
+/// Range-reduces `x = n*ln(2) + r` with `n = round(x / ln(2))`, evaluates a
+/// degree-4 Taylor polynomial for `exp(r)`, and builds `2^n` directly in the
+/// IEEE-754 exponent field. The input is clamped first so the biased exponent
+/// `n + 127` always stays within `0..=255`: large inputs saturate to `+inf`,
+/// very negative inputs flush to `0.0`, and NaN lanes stay NaN.
+///
+/// # Safety
+/// The CPU must support AVX and AVX2 (the integer add/shift used to build
+/// `2^n` are AVX2 instructions).
+#[inline(always)]
+pub unsafe fn _mm256_exp_ps(x: __m256) -> __m256 {
+    // Constants
+    let ln2      = _mm256_set1_ps(0.6931471805599453);        // ln(2)
+    let ln2_inv  = _mm256_set1_ps(1.4426950408889634);        // 1/ln(2)
+
+    // Clamp so that n lands in [-127, 128]. Without this, (n + 127) << 23
+    // spills into the sign bit or wraps, e.g. exp(100) came out negative.
+    // min/max return their SECOND operand when either is NaN, so keeping `x`
+    // (then the partial result) second lets NaN lanes propagate.
+    let x_hi     = _mm256_set1_ps(89.0);                      // n = 128 -> +inf
+    let x_lo     = _mm256_set1_ps(-88.0);                     // n = -127 -> 0.0
+    let x        = _mm256_max_ps(x_lo, _mm256_min_ps(x_hi, x));
+
+    // Scale input by 1/ln(2)
+    let scaled   = _mm256_mul_ps(x, ln2_inv);
+    // Round scaled value to nearest integer: n = round(x/ln2)
+    let n        = _mm256_round_ps(scaled, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+    // r = x - n*ln2
+    let r        = _mm256_sub_ps(x, _mm256_mul_ps(n, ln2));
+
+    // Compute a polynomial approximation of exp(r):
+    // exp(r) ~ 1 + r + r²/2 + r³/6 + r⁴/24
+    let c2   = _mm256_set1_ps(1.0);              // constant term
+    let c3   = _mm256_set1_ps(0.5);              // coefficient for r²: 1/2
+    let c4   = _mm256_set1_ps(0.16666667);       // coefficient for r³: 1/6
+    let c5   = _mm256_set1_ps(0.04166667);       // coefficient for r⁴: 1/24
+
+    let r2 = _mm256_mul_ps(r, r);
+    let r3 = _mm256_mul_ps(r2, r);
+    let r4 = _mm256_mul_ps(r3, r);
+
+    let poly = _mm256_add_ps(
+                    _mm256_add_ps(
+                        _mm256_add_ps(
+                            _mm256_add_ps(r, c2),
+                            _mm256_mul_ps(r2, c3)
+                        ),
+                        _mm256_mul_ps(r3, c4)
+                    ),
+                    _mm256_mul_ps(r4, c5)
+                );
+
+    // Compute 2^n using IEEE754 bit-level conversion:
+    // First, convert n (float) to an integer
+    let int_n = _mm256_cvtps_epi32(n);
+    // For a 32-bit float, the exponent field is biased by 127.
+    // So 2^n is represented by (n + 127) << 23.
+    let bias = _mm256_set1_epi32(127);
+    let exp_int = _mm256_add_epi32(int_n, bias);
+    let exp_int = _mm256_slli_epi32(exp_int, 23);
+    let two_n = _mm256_castsi256_ps(exp_int);
+
+    // Reconstruct exp(x) ≈ exp(r) * 2^n
+    _mm256_mul_ps(poly, two_n)
+}
+
 macro_rules! avx_detect {
     ($block:expr) => {
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
@@ -183,7 +232,7 @@ macro_rules! avx_detect {
 
 macro_rules! unary_op {
     ($fname:ident, $sim_op:expr, $fallback_op:expr) => {
-        pub unsafe fn $fname(
+        pub fn $fname(
             a: impl Input,
             mut out: impl Output
         ) {
@@ -194,12 +243,14 @@ macro_rules! unary_op {
             let mut i = 0;
 
             avx_detect! {
-                // Process in chunks of 8 floats
-                while i + 8 <= length {
-                    let va = a.fill_256(i);
-                    let res = $sim_op(va);
-                    out.store(res, i);
-                    i += 8;
+                unsafe {
+                    // Process in chunks of 8 floats
+                    while i + 8 <= length {
+                        let va = a.fill_256(i);
+                        let res = $sim_op(va);
+                        out.store(res, i);
+                        i += 8;
+                    }
                 }
             }
 
@@ -215,7 +266,7 @@ macro_rules! unary_op {
 
 macro_rules! binary_op {
     ($fname:ident, $sim_op:expr, $(&mut)? $fallback_op:expr) => {
-        pub unsafe fn $fname(
+        pub fn $fname(
             a: impl Input,
             b: impl Input,
             mut out: impl Output
@@ -228,13 +279,15 @@ macro_rules! binary_op {
             let mut i = 0;
 
             avx_detect! {
-                // Process in chunks of 8 floats
-                while i + 8 <= length {
-                    let va = a.fill_256(i);
-                    let vb = b.fill_256(i);
-                    let res = $sim_op(va, vb);
-                    out.store(res, i);
-                    i += 8;
+                unsafe {
+                    // Process in chunks of 8 floats
+                    while i + 8 <= length {
+                        let va = a.fill_256(i);
+                        let vb = b.fill_256(i);
+                        let res = $sim_op(va, vb);
+                        out.store(res, i);
+                        i += 8;
+                    }
                 }
             }
 
@@ -250,7 +303,7 @@ macro_rules! binary_op {
 
 macro_rules! trinary_op {
     ($fname:ident, $sim_op:expr, $fallback_op:expr) => {
-        pub unsafe fn $fname(
+        pub fn $fname(
             a: impl Input,
             b: impl Input,
             c: impl Input,
@@ -265,14 +318,16 @@ macro_rules! trinary_op {
             let mut i = 0;
 
             avx_detect! {
-                // Process in chunks of 8 floats
-                while i + 8 <= length {
-                    let va = a.fill_256(i);
-                    let vb = b.fill_256(i);
-                    let vc = c.fill_256(i);
-                    let res = $sim_op(va, vb, vc);
-                    out.store(res, i);
-                    i += 8;
+                unsafe {
+                    // Process in chunks of 8 floats
+                    while i + 8 <= length {
+                        let va = a.fill_256(i);
+                        let vb = b.fill_256(i);
+                        let vc = c.fill_256(i);
+                        let res = $sim_op(va, vb, vc);
+                        out.store(res, i);
+                        i += 8;
+                    }
                 }
             }
 
@@ -317,3 +372,33 @@ binary_op!(
     |xi, yi| { xi * yi }
 );
 
+// Subtraction kernel: `_mm256_sub_ps` as the SIMD op, `xi - yi` as the
+// scalar fallback op.
+binary_op!(
+    simd_sub,
+    _mm256_sub_ps,
+    |xi, yi| { xi - yi }
+);
+
+binary_op!(
+    simd_add,
+    _mm256_add_ps,
+    |xi, yi| { xi + yi } // scalar fallback must add, mirroring `_mm256_add_ps`
+);
+
+// Identity kernel: both the SIMD op and the scalar fallback return their
+// input unchanged.
+// NOTE(review): `Subtract::compute_grad` uses this in place of
+// `Updater::add`, so the accumulation presumably happens inside the
+// `Output` store — confirm.
+unary_op!(
+    simd_iadd,
+    |vo| {vo},
+    |xi| {xi}
+);
+
+// Negation kernel used for the `y` gradient of subtraction (df/dy = -1).
+// The SIMD op XORs each lane with -0.0, whose only set bit is the sign bit,
+// so it flips the sign; the scalar fallback is `-xi`.
+unary_op!(
+    grad_sub_y,
+    |vo| { _mm256_xor_ps(vo, _mm256_set1_ps(-0f32))},
+    |xi: f32| {-xi}
+);
+
+// Exponential kernel. The SIMD op is the polynomial approximation
+// `_mm256_exp_ps`, while the scalar fallback is `f32::exp`, so the two paths
+// can differ slightly in the low bits of the result.
+unary_op!(
+    simd_exp,
+    _mm256_exp_ps,
+    |xi: f32| {xi.exp()}
+);
+

Original file line number	Diff line number	Diff line change
`@@ -81,7 +81,6 @@ impl Graph {`
`81`	`81`	`Entry::Vacant(entry) => {`
`82`	`82`	`let mut v = allocate_vec(grad.len());`
`83`	`83`	`v[..].clone_from_slice(grad);`
`84`		`-`
`85`	`84`	`entry.insert(v);`
`86`	`85`	`}`
`87`	`86`	`}`