SKaiNET-developers
diff --git a/‎skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt‎
Lines changed: 61 additions & 0 deletions b/‎skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsPowTest.kt‎
Lines changed: 89 additions & 0 deletions b/‎skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsPowTest.kt‎
Lines changed: 89 additions & 0 deletions
diff --git a/‎skainet-compile/skainet-compile-core/src/commonMain/kotlin/sk/ainet/tape/RecordingExecution.kt‎
Lines changed: 15 additions & 0 deletions b/‎skainet-compile/skainet-compile-core/src/commonMain/kotlin/sk/ainet/tape/RecordingExecution.kt‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎skainet-compile/skainet-compile-dag/src/commonMain/kotlin/sk/ainet/lang/graph/DefaultExecutionTape.kt‎
Lines changed: 14 additions & 0 deletions b/‎skainet-compile/skainet-compile-dag/src/commonMain/kotlin/sk/ainet/lang/graph/DefaultExecutionTape.kt‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎skainet-compile/skainet-compile-dag/src/commonTest/kotlin/sk/ainet/compile/graph/ComputeGraphExecutorTest.kt‎
Lines changed: 2 additions & 0 deletions b/‎skainet-compile/skainet-compile-dag/src/commonTest/kotlin/sk/ainet/compile/graph/ComputeGraphExecutorTest.kt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎skainet-compile/skainet-compile-hlo/src/commonMain/kotlin/sk/ainet/compile/hlo/converters/BasicMathConverter.kt‎
Lines changed: 3 additions & 1 deletion b/‎skainet-compile/skainet-compile-hlo/src/commonMain/kotlin/sk/ainet/compile/hlo/converters/BasicMathConverter.kt‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎skainet-compile/skainet-compile-opt/src/commonMain/kotlin/sk/ainet/compile/opt/GraphOptimizationPipeline.kt‎
Lines changed: 7 additions & 0 deletions b/‎skainet-compile/skainet-compile-opt/src/commonMain/kotlin/sk/ainet/compile/opt/GraphOptimizationPipeline.kt‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎skainet-compile/skainet-compile-opt/src/commonMain/kotlin/sk/ainet/compile/opt/passes/PowSpecializationPass.kt‎
Lines changed: 120 additions & 0 deletions b/‎skainet-compile/skainet-compile-opt/src/commonMain/kotlin/sk/ainet/compile/opt/passes/PowSpecializationPass.kt‎
Lines changed: 120 additions & 0 deletions
@@ -12,6 +12,7 @@ import sk.ainet.lang.tensor.data.FloatArrayTensorData
 import sk.ainet.lang.tensor.data.TensorDataFactory
 import sk.ainet.lang.tensor.ops.UpsampleMode
 import sk.ainet.lang.types.FP32
+import kotlin.math.pow
 import kotlin.math.sqrt
 
 @Backend(id = "cpu", displayName = "CPU")
@@ -2123,6 +2124,66 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
         return newTensor(outData, tensor.dtype, tensor)
     }
 
+    /**
+     * Element-wise power: `c[i] = a[i] ^ b[i]`.
+     *
+     * Every element is computed by `scalarPow`: both operands are read as
+     * `Float`, widened to `Double`, raised with `kotlin.math.pow`, and
+     * narrowed back to `Float`. There is no repeated-multiply fast path
+     * for integer-valued exponents in this overload.
+     *
+     * Shape contract: shapes must match exactly (no broadcasting yet —
+     * caller's responsibility).
+     *
+     * NOTE(review): only `a.dtype` is validated; elements of both tensors
+     * are cast to `Float`, which presumably also holds for FP16 storage —
+     * confirm.
+     *
+     * @param a base tensor; its dtype must be FP16 or FP32.
+     * @param b exponent tensor; must have exactly the same shape as [a].
+     * @return the tensor built by `newTensor` from the computed data, using [a]'s dtype.
+     * @throws IllegalArgumentException if [a]'s dtype is unsupported or the shapes differ.
+     */
+    override fun <T : DType, V> pow(a: Tensor<T, V>, b: Tensor<T, V>): Tensor<T, V> {
+        require(
+            a.dtype == sk.ainet.lang.types.FP32::class ||
+                a.dtype == sk.ainet.lang.types.FP16::class
+        ) { "pow supports only FP16/FP32, got ${a.dtype}" }
+        require(a.shape == b.shape) { "pow requires matching shapes; got ${a.shape} and ${b.shape}" }
+        val outData = dataFactory.init<T, V>(a.shape, a.dtype) { idx ->
+            val av = a.data.get(*idx) as Float
+            val bv = b.data.get(*idx) as Float
+            @Suppress("UNCHECKED_CAST")
+            scalarPow(av, bv) as V
+        }
+        return newTensor(outData, a.dtype, a)
+    }
+
+    /**
+     * Element-wise scalar power: `c[i] = a[i] ^ n`.
+     *
+     * Integer-valued exponents in `-16..16` use repeated multiply
+     * (`integerPow`) for exactness; all other values route through
+     * `kotlin.math.pow` via `scalarPow`.
+     *
+     * @param a base tensor; its dtype must be FP16 or FP32.
+     * @param n exponent applied to every element; compared after conversion to `Float`.
+     * @return the tensor built by `newTensor` from the computed data, using [a]'s dtype.
+     * @throws IllegalArgumentException if [a]'s dtype is neither FP16 nor FP32.
+     */
+    override fun <T : DType, V> powScalar(a: Tensor<T, V>, n: Number): Tensor<T, V> {
+        require(
+            a.dtype == sk.ainet.lang.types.FP32::class ||
+                a.dtype == sk.ainet.lang.types.FP16::class
+        ) { "powScalar supports only FP16/FP32, got ${a.dtype}" }
+        val nFloat = n.toFloat()
+        val nInt = n.toInt()
+        // Range check rather than abs(nInt) <= 16: abs(Int.MIN_VALUE) overflows
+        // and stays negative, which used to classify an exponent of -2^31 as a
+        // "small" integer and send it down the repeated-multiply path.
+        val isSmallInt = nFloat == nInt.toFloat() && nInt in -16..16
+        val outData = dataFactory.init<T, V>(a.shape, a.dtype) { idx ->
+            val av = a.data.get(*idx) as Float
+            @Suppress("UNCHECKED_CAST")
+            (if (isSmallInt) integerPow(av, nInt) else scalarPow(av, nFloat)) as V
+        }
+        return newTensor(outData, a.dtype, a)
+    }
+
+    /**
+     * Raises [base] to the integer power [n] by square-and-multiply.
+     *
+     * `n == 0` yields `1f`; a negative [n] yields the reciprocal of the
+     * positive power. The exponent magnitude is tracked as a `Long` so
+     * that `Int.MIN_VALUE` (whose negation overflows in `Int`) terminates
+     * instead of recursing forever.
+     */
+    private fun integerPow(base: Float, n: Int): Float {
+        // Widen before negating: -Int.MIN_VALUE is still Int.MIN_VALUE in 32 bits.
+        var remaining = if (n < 0) -(n.toLong()) else n.toLong()
+        var factor = base
+        var result = 1f
+        while (remaining > 0L) {
+            if (remaining and 1L == 1L) result *= factor
+            factor *= factor
+            remaining = remaining ushr 1
+        }
+        return if (n < 0) 1f / result else result
+    }
+
+    /** General power: widens both operands to `Double`, applies `kotlin.math.pow`, narrows to `Float`. */
+    private fun scalarPow(base: Float, exp: Float): Float {
+        val raised: Double = base.toDouble().pow(exp.toDouble())
+        return raised.toFloat()
+    }
+
     // ---- TinyFoA ops: abs, sign, clamp, lt, ge ----
 
     @TensorOp()
 
@@ -0,0 +1,89 @@
+package sk.ainet.exec.tensor.ops
+
+import kotlin.math.abs
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertFailsWith
+import kotlin.test.assertTrue
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.VoidOpsTensor
+import sk.ainet.lang.tensor.data.DenseTensorDataFactory
+import sk.ainet.lang.tensor.data.FloatArrayTensorData
+import sk.ainet.lang.types.FP32
+
+/**
+ * Forward-value checks for the CPU `pow` and `powScalar` ops (Tier A
+ * of #617). Covers the binary form (tensor exponent) and the scalar
+ * form with both integer and real exponents.
+ */
+class DefaultCpuOpsPowTest {
+    private val dataFactory = DenseTensorDataFactory()
+    private val ops = DefaultCpuOps(dataFactory)
+
+    /** Wraps [values] in an FP32 tensor of the given [shape]. */
+    private fun floatTensor(shape: Shape, values: FloatArray) =
+        VoidOpsTensor(dataFactory.fromFloatArray<FP32, Float>(shape, FP32::class, values), FP32::class)
+
+    /** Asserts equal length and a per-element absolute difference of at most [tol]. */
+    private fun assertCloseTo(expected: FloatArray, actual: FloatArray, tol: Float = 1e-4f) {
+        assertEquals(expected.size, actual.size, "length mismatch")
+        expected.forEachIndexed { i, want ->
+            val got = actual[i]
+            val diff = abs(want - got)
+            assertTrue(diff <= tol, "[$i] expected=$want actual=$got diff=$diff tol=$tol")
+        }
+    }
+
+    @Test
+    fun powScalar_integer_2_matches_x_times_x() {
+        val input = floatTensor(Shape(5), floatArrayOf(0.5f, 1f, 2f, 3f, -2f))
+        val result = ops.powScalar(input, 2)
+        assertCloseTo(
+            floatArrayOf(0.25f, 1f, 4f, 9f, 4f),
+            (result.data as FloatArrayTensorData<*>).buffer,
+        )
+    }
+
+    @Test
+    fun powScalar_integer_3_matches_x_cubed() {
+        val input = floatTensor(Shape(4), floatArrayOf(1f, 2f, 3f, -2f))
+        val result = ops.powScalar(input, 3)
+        assertCloseTo(
+            floatArrayOf(1f, 8f, 27f, -8f),
+            (result.data as FloatArrayTensorData<*>).buffer,
+        )
+    }
+
+    @Test
+    fun powScalar_negative_integer_minus_1_is_reciprocal() {
+        val input = floatTensor(Shape(3), floatArrayOf(2f, 4f, 0.5f))
+        val result = ops.powScalar(input, -1)
+        assertCloseTo(
+            floatArrayOf(0.5f, 0.25f, 2f),
+            (result.data as FloatArrayTensorData<*>).buffer,
+        )
+    }
+
+    @Test
+    fun powScalar_real_half_is_sqrt() {
+        val input = floatTensor(Shape(4), floatArrayOf(0f, 1f, 4f, 9f))
+        val result = ops.powScalar(input, 0.5f)
+        assertCloseTo(
+            floatArrayOf(0f, 1f, 2f, 3f),
+            (result.data as FloatArrayTensorData<*>).buffer,
+        )
+    }
+
+    @Test
+    fun powScalar_real_1_5_matches_kotlin_math_pow() {
+        val input = floatTensor(Shape(3), floatArrayOf(1f, 2f, 4f))
+        val result = ops.powScalar(input, 1.5f)
+        assertCloseTo(
+            floatArrayOf(1f, 2.828427f, 8f),
+            (result.data as FloatArrayTensorData<*>).buffer,
+        )
+    }
+
+    @Test
+    fun pow_binary_element_wise() {
+        val bases = floatTensor(Shape(4), floatArrayOf(2f, 3f, 4f, 5f))
+        val exponents = floatTensor(Shape(4), floatArrayOf(2f, 3f, 0.5f, 1f))
+        val result = ops.pow(bases, exponents)
+        assertCloseTo(
+            floatArrayOf(4f, 27f, 2f, 5f),
+            (result.data as FloatArrayTensorData<*>).buffer,
+        )
+    }
+
+    @Test
+    fun pow_binary_rejects_shape_mismatch() {
+        val threeWide = floatTensor(Shape(3), floatArrayOf(1f, 2f, 3f))
+        val fourWide = floatTensor(Shape(4), floatArrayOf(1f, 2f, 3f, 4f))
+        assertFailsWith<IllegalArgumentException> { ops.pow(threeWide, fourWide) }
+    }
+}
@@ -184,6 +184,21 @@ internal class RecordingTensorOpsDecorator(private val base: TensorOps) : Tensor
         return out
     }
 
+    // --- Power ops ---
+    /** Delegates to the wrapped ops, then records a parameterless [PowOperation] with both operands as inputs. */
+    override fun <T : DType, V> pow(a: Tensor<T, V>, b: Tensor<T, V>): Tensor<T, V> =
+        base.pow(a, b).also { result ->
+            record(PowOperation<T, V>(), listOf(a, b), listOf(result))
+        }
+
+    /**
+     * Delegates to the wrapped ops, then records a [PowOperation] whose
+     * only tensor input is [a]; the scalar exponent is carried in the
+     * operation parameters under `"scalar_exponent"`.
+     */
+    override fun <T : DType, V> powScalar(a: Tensor<T, V>, n: Number): Tensor<T, V> {
+        val result = base.powScalar(a, n)
+        // The exponent is stashed in parameters so the backward formula can
+        // recover it (a-partial is n * a^(n-1)).
+        val operation = PowOperation<T, V>(parameters = mapOf("scalar_exponent" to n))
+        record(operation, listOf(a), listOf(result))
+        return result
+    }
+
     // --- Scalar ops ---
     override fun <T : DType, V> addScalar(a: Tensor<T, V>, b: Number): Tensor<T, V> {
         val out = base.addScalar(a, b)
 
@@ -643,6 +643,20 @@ public class DefaultGradientTape(
         return listOf(null, null, null)
     }
 
+    // Backward for pow(a, b): da = b*a^(b-1)*upstream, db = a^b*log(a)*upstream.
+    // The db partial needs a `log` op (Tier B of #617), so this first-cut
+    // Tier A stub yields null for both partials. Real formula lands in Tier C.
+    override fun powBackward(upstream: Tensor<DType, Any>, output: Tensor<DType, Any>, inputs: List<Tensor<DType, Any>>, attributes: Map<String, Any?>): List<Tensor<DType, Any>?> =
+        listOf(null, null)
+
+    // Backward for powScalar(a, n): da = n*a^(n-1)*upstream.
+    // Self-contained (no log needed), but the formula is deferred to Tier C
+    // alongside the rest of the autograd completeness work; until then the
+    // single partial is null.
+    override fun powScalarBackward(upstream: Tensor<DType, Any>, output: Tensor<DType, Any>, inputs: List<Tensor<DType, Any>>, attributes: Map<String, Any?>): List<Tensor<DType, Any>?> =
+        listOf(null)
+
     override fun conv2dBackward(upstream: Tensor<DType, Any>, output: Tensor<DType, Any>, inputs: List<Tensor<DType, Any>>, attributes: Map<String, Any?>): List<Tensor<DType, Any>?> {
         // d(conv2d(x, w, b))/dx, d(conv2d(x, w, b))/dw, d(conv2d(x, w, b))/db
         // This is complex and usually implemented in the backend.
 
@@ -177,6 +177,8 @@ private class TestTensorOps : TensorOps {
     override fun <T : DType, V> mean(tensor: Tensor<T, V>, dim: Int?): Tensor<T, V> = tensor
     override fun <T : DType, V> variance(tensor: Tensor<T, V>, dim: Int?): Tensor<T, V> = tensor
     override fun <T : DType, V> sqrt(tensor: Tensor<T, V>): Tensor<T, V> = tensor
+    override fun <T : DType, V> pow(a: Tensor<T, V>, b: Tensor<T, V>): Tensor<T, V> {
+        // Test stub: ignores the exponent tensor and hands back the base unchanged.
+        return a
+    }
+    override fun <T : DType, V> powScalar(a: Tensor<T, V>, n: Number): Tensor<T, V> {
+        // Test stub: ignores the scalar exponent and hands back the base unchanged.
+        return a
+    }
     override fun <T : DType, V> abs(tensor: Tensor<T, V>): Tensor<T, V> = tensor
     override fun <T : DType, V> sign(tensor: Tensor<T, V>): Tensor<T, V> = tensor
     override fun <T : DType, V> clamp(tensor: Tensor<T, V>, minVal: Float, maxVal: Float): Tensor<T, V> = tensor
 
@@ -22,7 +22,8 @@ public class BasicMathConverter : StableHloOperationConverter {
 
     override val supportedOperations: Set<String> = setOf(
         "add", "subtract", "multiply", "divide",
-        "sub", "mul", "div" // Common aliases
+        "sub", "mul", "div", // Common aliases
+        "pow"
     )
 
     override fun convert(
@@ -101,6 +102,7 @@ public class BasicMathConverter : StableHloOperationConverter {
             "subtract", "sub" -> "stablehlo.subtract"
             "multiply", "mul" -> "stablehlo.multiply"
             "divide", "div" -> "stablehlo.divide"
+            "pow" -> "stablehlo.power"
             else -> null
         }
     }
 
@@ -6,6 +6,7 @@ import sk.ainet.compile.opt.passes.DTypeConstraintResolutionPass
 import sk.ainet.compile.opt.passes.DeadCodeEliminationPass
 import sk.ainet.compile.opt.passes.LLMFusionPass
 import sk.ainet.compile.opt.passes.OperationFusionPass
+import sk.ainet.compile.opt.passes.PowSpecializationPass
 import sk.ainet.compile.opt.passes.SharedWeightDeduplicationPass
 import sk.ainet.compile.opt.passes.TransposeEliminationPass
 
@@ -80,6 +81,11 @@ public class GraphOptimizationPipeline(
                 // is the boundary where dtype problems surface — every
                 // later pass can assume dtype-validity.
                 DTypeConstraintResolutionPass(),
+                // Rewrite pow(x, 2) to multiply(x, x) before fusion so
+                // the downstream passes see the multiply form. Runs after
+                // dtype resolution (still benefits from resolved dtypes)
+                // and before everything else.
+                PowSpecializationPass(),
                 DeadCodeEliminationPass(),
                 ConstantFoldingPass(),
                 OperationFusionPass()
@@ -92,6 +98,7 @@ public class GraphOptimizationPipeline(
         public fun createAggressive(): GraphOptimizationPipeline = GraphOptimizationPipeline(
             passes = listOf(
                 DTypeConstraintResolutionPass(),
+                PowSpecializationPass(),
                 DeadCodeEliminationPass(),
                 ConstantFoldingPass(),
                 OperationFusionPass()
 
@@ -0,0 +1,120 @@
+package sk.ainet.compile.opt.passes
+
+import sk.ainet.compile.opt.GraphOptimizationPass
+import sk.ainet.compile.opt.GraphOptimizationResult
+import sk.ainet.lang.graph.ComputeGraph
+import sk.ainet.lang.graph.GraphEdge
+import sk.ainet.lang.graph.GraphNode
+import sk.ainet.lang.tensor.ops.MultiplyOperation
+import sk.ainet.lang.tensor.ops.PowOperation
+
+/**
+ * Rewrites `powScalar(x, n)` for small integer `n` (currently only
+ * `n == 2`) into the equivalent `multiply(x, x)`. The intent is that
+ * the downstream elementwise multiply dispatch is much cheaper than a
+ * real `pow` per element.
+ *
+ * Pattern detected:
+ * ```
+ *   PowOperation node with parameters["scalar_exponent"] == 2 and one input
+ * ```
+ * Replaced with:
+ * ```
+ *   MultiplyOperation node with both inputs wired to the original input
+ * ```
+ *
+ * The replacement node reuses the pow node's id, outputs and metadata,
+ * so consumers keep resolving it. Pow nodes without an incoming edge
+ * are left untouched.
+ *
+ * Wider integer exponents (n = 3, 4, ...) intentionally not handled in
+ * this first cut — each adds one more layer of multiplies and the
+ * register-pressure / staging trade-off isn't obvious without a
+ * benchmark. Add them when there's a workload that wants them.
+ */
+public class PowSpecializationPass : GraphOptimizationPass {
+
+    override val name: String = "pow-specialization"
+
+    /**
+     * Applies the rewrite to every matching node of [graph] in place.
+     *
+     * @return a result wrapping the same [graph]; `changed` is true when
+     * at least one node was rewritten, and `diagnostics` holds one line
+     * per rewrite.
+     */
+    override fun apply(graph: ComputeGraph): GraphOptimizationResult {
+        val diagnostics = mutableListOf<String>()
+        var changed = false
+
+        // Snapshot nodes — we mutate the graph inside the loop.
+        val candidates = graph.nodes.filter { node ->
+            node.operation is PowOperation<*, *> &&
+                node.inputs.size == 1 &&
+                exponentInt(node) == 2
+        }
+
+        for (powNode in candidates) {
+            // Snapshot edges before mutating.
+            val incomingToPow = graph.edges.filter { it.destination.id == powNode.id }
+            val outgoingFromPow = graph.edges.filter { it.source.id == powNode.id }
+
+            val producer = incomingToPow.firstOrNull() ?: continue
+            val sourceNode = producer.source
+
+            // Build the replacement multiply node — same id so consumer
+            // edges that target powNode.id continue to resolve.
+            val mul = GraphNode(
+                id = powNode.id,
+                operation = MultiplyOperation<sk.ainet.lang.types.DType, Any>(),
+                inputs = listOf(powNode.inputs[0], powNode.inputs[0]),
+                outputs = powNode.outputs,
+                metadata = powNode.metadata,
+            )
+
+            graph.removeNode(powNode)
+
+            // removeNode usually drops the attached edges, but defensively
+            // remove any that survived. This must happen BEFORE the
+            // replacement edges are added: the new edges reuse the old node
+            // id (and, for outgoing edges, the old edge ids), so a late
+            // cleanup could delete a freshly added edge in a graph that
+            // matches edges by id.
+            for (edge in incomingToPow) {
+                graph.removeEdge(edge)
+            }
+            for (edge in outgoingFromPow) {
+                graph.removeEdge(edge)
+            }
+
+            graph.addNode(mul)
+
+            // Wire both multiply inputs to the original x.
+            for (i in 0..1) {
+                graph.addEdge(
+                    GraphEdge(
+                        id = "e_${sourceNode.id}_${producer.sourceOutputIndex}__${mul.id}_$i",
+                        source = sourceNode,
+                        destination = mul,
+                        sourceOutputIndex = producer.sourceOutputIndex,
+                        destinationInputIndex = i,
+                        tensorSpec = producer.tensorSpec,
+                    ),
+                )
+            }
+
+            // Restore the outgoing edges, now sourced from the new node.
+            for (edge in outgoingFromPow) {
+                graph.addEdge(
+                    GraphEdge(
+                        id = edge.id,
+                        source = mul,
+                        destination = edge.destination,
+                        sourceOutputIndex = edge.sourceOutputIndex,
+                        destinationInputIndex = edge.destinationInputIndex,
+                        tensorSpec = edge.tensorSpec,
+                    ),
+                )
+            }
+
+            diagnostics += "Specialized pow(${sourceNode.id}, 2) -> multiply at node ${powNode.id}"
+            changed = true
+        }
+
+        return GraphOptimizationResult(graph, changed = changed, diagnostics = diagnostics)
+    }
+
+    /**
+     * Returns the integer exponent stashed in [PowOperation.parameters]
+     * (under `"scalar_exponent"`), or `null` if absent / non-integer.
+     */
+    private fun exponentInt(node: GraphNode): Int? {
+        val raw = node.operation.parameters["scalar_exponent"] ?: return null
+        val n = (raw as? Number)?.toDouble() ?: return null
+        val asInt = n.toInt()
+        // The round-trip comparison rejects fractional, NaN and out-of-Int-range values.
+        return if (n == asInt.toDouble()) asInt else null
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,8 @@ public class BasicMathConverter : StableHloOperationConverter {`
`22`	`22`
`23`	`23`	`override val supportedOperations: Set<String> = setOf(`
`24`	`24`	`"add", "subtract", "multiply", "divide",`
`25`		`- "sub", "mul", "div" // Common aliases`
	`25`	`+ "sub", "mul", "div", // Common aliases`
	`26`	`+ "pow"`
`26`	`27`	`)`
`27`	`28`
`28`	`29`	`override fun convert(`
`@@ -101,6 +102,7 @@ public class BasicMathConverter : StableHloOperationConverter {`
`101`	`102`	`"subtract", "sub" -> "stablehlo.subtract"`
`102`	`103`	`"multiply", "mul" -> "stablehlo.multiply"`
`103`	`104`	`"divide", "div" -> "stablehlo.divide"`
	`105`	`+ "pow" -> "stablehlo.power"`
`104`	`106`	`else -> null`
`105`	`107`	`}`
`106`	`108`	`}`