This repository was archived by the owner on Mar 2, 2025. It is now read-only.

Making Upsample native ops #118

Draft · wants to merge 16 commits into base: main
1 change: 1 addition & 0 deletions basalt/__init__.mojo
@@ -5,3 +5,4 @@ from basalt.utils.collection import Collection
alias dtype = DType.float32
alias nelts = 2 * simdwidthof[dtype]()
alias seed = 42
alias epsilon = 1e-12
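For context: a module-wide epsilon like this is a numerical guard for backward passes that divide by, take the log of, or raise to a negative power a value that can be exactly zero. A minimal sketch of the pattern (illustration only; the POW kernel in basics.mojo further down defines its own local epsilon with the same value rather than importing this alias):

```mojo
from math import log
from basalt import epsilon  # the alias added above, 1e-12

# Sketch: typical uses of a global guard like this in backward kernels.
fn safe_reciprocal(x: Float64) -> Float64:
    return 1.0 / (x + epsilon)  # avoids a division by exactly 0

fn safe_log(x: Float64) -> Float64:
    return log(x + epsilon)     # avoids log(0) = -inf
```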
25 changes: 8 additions & 17 deletions basalt/autograd/attributes.mojo
@@ -1,4 +1,5 @@
from collections import Optional, OptionalReg
from utils.static_tuple import StaticTuple

from basalt.nn.tensor import Tensor, TensorShape, MAX_RANK
from basalt.utils.bytes import Bytes, scalar_to_bytes, bytes_to_scalar
@@ -45,9 +46,8 @@ struct AttributeVector(Sized, Stringable, CollectionElement):
var attributes: StaticTuple[Attribute, MAX_ATTRS]
var size: Int

@always_inline("nodebug")
fn __init__(inout self, *attributes: Attribute):
self.attributes = StaticTuple[Attribute, MAX_ATTRS]()
self.attributes = StaticTuple[Attribute, MAX_ATTRS](Attribute("", ""))
self.size = len(attributes)
for i in range(self.size):
self.attributes[i] = attributes[i]
@@ -67,7 +67,10 @@ struct AttributeVector(Sized, Stringable, CollectionElement):
return self.attributes[i]
return None

@always_inline("nodebug")
fn append(inout self, attribute: Attribute):
self.attributes[self.size] = attribute
self.size += 1

fn __str__(self) -> String:
var s: String = "["
for i in range(self.size):
@@ -85,15 +88,13 @@ struct Attribute(Stringable, CollectionElement):
var type: AttributeType
var size: Int

@always_inline("nodebug")
fn __init__(inout self, name: String, value: String):
self.data_shape = StaticIntTuple[MAX_RANK]()
self.name = Bytes[MAX_NAME_CHARS](name)
self.data = Bytes[MAX_DATA_BYTES](value)
self.type = AttributeType.STRING
self.size = len(value)

@always_inline("nodebug")
fn __init__(inout self, name: String, value: TensorShape):
self.data_shape = StaticIntTuple[MAX_RANK]()
self.name = Bytes[MAX_NAME_CHARS](name)
@@ -104,7 +105,6 @@
for i in range(self.size):
self.data_shape[i] = value._shape[i]

@always_inline("nodebug")
fn __init__[N: Int](inout self, name: String, value: StaticIntTuple[N]):
constrained[N < MAX_RANK, "Attribute rank must be less than MAX_RANK."]()

@@ -115,9 +115,8 @@ struct Attribute(Stringable, CollectionElement):
self.size = N

for i in range(self.size):
self.data[i] = value[i]
self.data_shape[i] = value[i]

@always_inline("nodebug")
fn __init__[dtype: DType](inout self, name: String, value: Scalar[dtype]):
constrained[dtype.is_numeric(), "Attribute value must be numeric."]()

@@ -127,46 +126,38 @@ struct Attribute(Stringable, CollectionElement):
self.type = AttributeType(dtype)
self.size = 1

@always_inline("nodebug")
fn __init__(inout self, name: String, value: Int):
self.__init__(name, Int64(value))
self.data_shape[0] = 1

@always_inline("nodebug")
fn __init__(inout self, name: String, value: FloatLiteral):
self.__init__(name, Float64(value))
self.data_shape[0] = 1

@always_inline("nodebug")
fn __str__(self) -> String:
return "Attribute(" + str(self.name) + ", " + "..." + ")"

@always_inline("nodebug")
fn to_string(self) -> String:
return str(self.data)

@always_inline("nodebug")
fn to_shape(self) -> TensorShape:
return TensorShape(rank=self.size, shape=self.data_shape)

@always_inline("nodebug")
fn to_static[N: Int](self) -> StaticIntTuple[N]:
constrained[N < MAX_RANK, "Attribute rank must be less than MAX_RANK."]()

var result = StaticIntTuple[N]()

for i in range(N):
result[i] = int(self.data[i])
result[i] = int(self.data_shape[i])

return result

@always_inline("nodebug")
fn to_scalar[dtype: DType](self) -> Scalar[dtype]:
constrained[dtype.is_numeric(), "Attribute value must be numeric."]()

return bytes_to_scalar[dtype](self.data)

@always_inline("nodebug")
fn to_int(self) -> Int:
return int(self.to_scalar[DType.int64]())

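A short usage sketch of the API touched above: the new `append` on `AttributeVector`, and `to_static`, which now reads back from `data_shape` (where the `StaticIntTuple` constructor stores its values) instead of the raw byte buffer. The access pattern mirrors `attributes["dim"].value().to_int()` used in dynamics.mojo further down; the import paths and values are illustrative assumptions:

```mojo
from utils.index import StaticIntTuple  # assumed import path for this Mojo version
from basalt.autograd.attributes import Attribute, AttributeVector

fn example_attributes():
    # Start with one attribute, then grow the vector with the new append().
    var attrs = AttributeVector(Attribute("dim", 0))
    attrs.append(Attribute("scales", StaticIntTuple[2](2, 2)))

    # to_static() now reads data_shape, so the round trip preserves the tuple.
    var scales = attrs["scales"].value().to_static[2]()  # StaticIntTuple[2](2, 2)
    var dim = attrs["dim"].value().to_int()               # 0
```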
116 changes: 58 additions & 58 deletions basalt/autograd/graph.mojo
@@ -11,7 +11,6 @@ from basalt import seed, dtype
from basalt import Tensor, TensorShape


@value
struct Graph:
var inputs: List[Symbol]
var params: ParamDict
@@ -28,41 +27,42 @@ struct Graph:
self.loss_out = None
self.symbol_count = 0

fn input(inout self, shape: TensorShape, trainable: Bool = False) -> Symbol:
var inp = Symbol(self.symbol_count, dtype, shape, trainable)
self.inputs.append(inp)
self.symbol_count += 1
return inp
fn __moveinit__(inout self, owned other: Graph):
self.inputs = other.inputs^
self.params = other.params^
self.nodes = other.nodes^
self.outputs = other.outputs^
self.loss_out = other.loss_out
self.symbol_count = other.symbol_count

fn param(
inout self, shape: TensorShape, init: Param, trainable: Bool = True
) -> Symbol:
var param_id = Symbol(self.symbol_count, dtype, shape, trainable)
self.params.put(param_id, init)
fn create_symbol(inout self, shape: TensorShape, data: Optional[Param] = None, trainable: Bool = False, is_input: Bool = False) -> Symbol:
var symbol = Symbol(self.symbol_count, dtype, shape, trainable)
self.symbol_count += 1
return param_id

if is_input:
self.inputs.append(symbol)
else:
if data is not None:
self.params.put(symbol, data.value()[])
else:
self.params.put(symbol)

return symbol

fn input(inout self, shape: TensorShape, trainable: Bool = False) -> Symbol:
return self.create_symbol(shape, trainable=trainable, is_input=True)

fn param(inout self, shape: TensorShape, init: Param, trainable: Bool = True) -> Symbol:
return self.create_symbol(shape, init, trainable)

fn param(inout self, shape: TensorShape, trainable: Bool = True) -> Symbol:
var param_id = Symbol(self.symbol_count, dtype, shape, trainable)
self.params.put(param_id)
self.symbol_count += 1
return param_id
return self.create_symbol(shape, trainable=trainable)

fn scalar(inout self, value: Scalar[dtype]) -> Symbol:
var scal = Param(value)
var scalar_id = Symbol(
self.symbol_count, dtype, TensorShape(1), trainable=False
)
self.params.put(scalar_id, scal)
self.symbol_count += 1
return scalar_id
return self.create_symbol(TensorShape(1), Param(value), trainable=False)

fn constant(inout self, shape: TensorShape, data: List[Scalar[dtype]]) -> Symbol:
var cst = Param(data)
var constant_id = Symbol(self.symbol_count, dtype, shape, trainable=False)
self.params.put(constant_id, cst)
self.symbol_count += 1
return constant_id
return self.create_symbol(shape, Param(data), trainable=False)

fn out(inout self, symbol: Symbol):
self.outputs.append(symbol)
@@ -77,14 +77,15 @@ struct Graph:
attributes: AttributeVector = AttributeVector(),
) -> Symbol:
var res_shape = static_result_shape(op, operands, attributes)
var res = Symbol(
self.symbol_count, dtype, res_shape, self.result_trainable(operands)
)
var res = Symbol(self.symbol_count, dtype, res_shape, self.result_trainable(operands))
self.symbol_count += 1

var inputs = List[Symbol]()
inputs.reserve(len(operands))

for operand in operands:
inputs.append(operand)

self.nodes.append(Node(op, inputs, List[Symbol](res), attributes))
return res

@@ -95,8 +96,7 @@ struct Graph:
operand_2: Float64,
attributes: AttributeVector = AttributeVector(),
) -> Symbol:
var operand_2_symbol = self.scalar(operand_2)
return self.op(op, operand_1, operand_2_symbol, attributes=attributes)
return self.op(op, operand_1, self.scalar(operand_2), attributes=attributes)

fn op(
inout self,
@@ -105,43 +105,43 @@ struct Graph:
operand_2: Symbol,
attributes: AttributeVector = AttributeVector(),
) -> Symbol:
var operand_1_symbol = self.scalar(operand_1)
return self.op(op, operand_1_symbol, operand_2, attributes=attributes)
return self.op(op, self.scalar(operand_1), operand_2, attributes=attributes)

fn create_symbols(inout self, shapes: List[TensorShape], trainable: Bool = False) -> List[Symbol]:
var symbols = List[Symbol]()
symbols.reserve(len(shapes))

for shape in shapes:
symbols.append(Symbol(self.symbol_count, dtype, shape[], trainable))
self.symbol_count += 1

return symbols

fn add_node(inout self, op: OP, inputs: List[Symbol], outputs: List[Symbol], attributes: AttributeVector):
self.nodes.append(Node(op, inputs, outputs, attributes))

# Dynamic ops
fn concat(inout self, *operands: Symbol, dim: Int = 0) -> Symbol:
# NOTE: Concat could fit into g.op() if a different static_result_shape were used
var attributes = AttributeVector(Attribute("dim", dim))

var res_shape = dynamic_result_shape(OP.CONCAT, operands, attributes)[0]
var res = Symbol(
self.symbol_count, dtype, res_shape, self.result_trainable(operands)
)
self.symbol_count += 1
var res_symbols = self.create_symbols(List[TensorShape](res_shape), self.result_trainable(operands))

var inputs = List[Symbol]()
var operand_list = List[Symbol]()
operand_list.reserve(len(operands))
for operand in operands:
inputs.append(operand)
self.nodes.append(Node(OP.CONCAT, inputs, List[Symbol](res), attributes))
return res
operand_list.append(operand)

self.add_node(OP.CONCAT, operand_list, res_symbols, attributes)
return res_symbols[0]

fn split(
inout self, operand: Symbol, sections: List[Int], dim: Int = 0
) -> List[Symbol]:
var attributes = AttributeVector(
Attribute("sections", TensorShape(sections)), Attribute("dim", dim)
)
var attributes = AttributeVector(Attribute("sections", TensorShape(sections)), Attribute("dim", dim))
var res_shapes = dynamic_result_shape(OP.SPLIT, operand, attributes)
var trainable = self.result_trainable(operand)

var results = List[Symbol]()
for i in range(len(res_shapes)):
var symbol = Symbol(self.symbol_count, dtype, res_shapes[i], trainable)
results.append(symbol)
self.symbol_count += 1

self.nodes.append(Node(OP.SPLIT, List[Symbol](operand), results, attributes))
return results
var result_symbols = self.create_symbols(res_shapes, trainable)
self.add_node(OP.SPLIT, List[Symbol](operand), result_symbols, attributes)
return result_symbols

@staticmethod
fn result_trainable(operands: VariadicList[Symbol]) -> Bool:
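The user-facing `Graph` API is unchanged by this refactor: `input`, `param`, `scalar`, and `constant` now delegate to `create_symbol`, while the dynamic ops (`concat`, `split`) go through `create_symbols` plus `add_node`. A hedged usage sketch; the import paths, shapes, and section sizes are illustrative assumptions:

```mojo
from basalt import TensorShape
from basalt.autograd import Graph

fn build_graph() -> Graph:
    var g = Graph()

    # Routed through create_symbol(..., is_input=True).
    var x = g.input(TensorShape(1, 2, 4, 4))

    # Dynamic ops: result symbols come from create_symbols + add_node.
    var halves = g.split(x, sections=List[Int](2, 2), dim=3)
    var y = g.concat(halves[0], halves[1], dim=3)

    g.out(y)
    return g^
```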
21 changes: 15 additions & 6 deletions basalt/autograd/ops/basics.mojo
@@ -1,11 +1,15 @@
from math import add, sub, mul, div, log, exp
from math import log, exp
from algorithm import vectorize
from memory import memcpy
from utils.numerics import isinf

from basalt import Tensor, TensorShape
from basalt.nn.tensor import MAX_RANK
from basalt.utils.tensorutils import *
from basalt.autograd.attributes import Attribute, AttributeVector
from basalt.autograd.ops.matmul import dot, dot_transpose_t1, dot_transpose_t2
from basalt.utils.math_util import add, sub, mul, div


"""
Implement forward and backward operations for basic tensor manipulations.
@@ -315,28 +319,33 @@ struct POW:
# d(x^y) / dx = y * x^(y-1)
# d(x^y) / dy = sum( x^y * log(x) )
var res_grad: Tensor[dtype]
var a = int(t2[0])
var a = t2[0]

alias epsilon = 1e-12

@parameter
if tensor_id == 0:
res_grad = Tensor[dtype](t1_shape)

@parameter
fn vec_pow_bw_x[nelts: Int](i: Int):
res_grad.store[nelts](
i, a * (t1.load[nelts](i) ** (a - 1)) * ug.load[nelts](i)
)
res_grad.store[nelts](i, a * ((t1.load[nelts](i) + epsilon) ** (a - 1)) * ug.load[nelts](i))

vectorize[vec_pow_bw_x, nelts](t1_shape.num_elements())

else:
# Gradient of the exponent
res_grad = Tensor[dtype](t2_shape) # t2_shape == TensorShape(1)

@parameter
fn vec_pow_bw_y[nelts: Int](i: Int):
# Guard the case where the value passed to log is 0.0 (log(0) = -inf)
var temp_log = log(t1.load[nelts](i))
var temp_log_is_inf = isinf(temp_log)
temp_log = temp_log_is_inf.select(0, temp_log)
res_grad[0] += (
(t1.load[nelts](i) ** a)
* log(t1.load[nelts](i))
* temp_log
* ug.load[nelts](i)
).reduce_add()

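Written out for a single scalar element, the two guards added to POW's backward pass look like the sketch below (illustration only, not the vectorized kernel; `log` and `isinf` come from the same imports the diff adds):

```mojo
from math import log
from utils.numerics import isinf

alias epsilon = 1e-12

# Gradient w.r.t. the base: d(x^a)/dx = a * x^(a-1) * upstream.
# epsilon keeps a base of exactly 0 from producing inf/nan when a < 1.
fn pow_grad_base(x: Float32, a: Float32, ug: Float32) -> Float32:
    return a * ((x + epsilon) ** (a - 1)) * ug

# Gradient w.r.t. the exponent: d(x^a)/da = x^a * log(x) * upstream.
# log(0) = -inf would poison the reduce_add, so it is masked to 0,
# mirroring the isinf/select pair in the vectorized kernel above.
fn pow_grad_exp(x: Float32, a: Float32, ug: Float32) -> Float32:
    var lx = log(x)
    if isinf(lx):
        lx = 0
    return (x ** a) * lx * ug
```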
2 changes: 0 additions & 2 deletions basalt/autograd/ops/conv.mojo
@@ -1,9 +1,7 @@
from basalt import Tensor, TensorShape
from basalt.autograd.attributes import AttributeVector
from basalt.utils.tensorutils import dot, dot_transpose_t1, dot_transpose_t2

from algorithm import parallelize, vectorize, tile
from math import divmod
from utils.loop import unroll


8 changes: 4 additions & 4 deletions basalt/autograd/ops/dynamics.mojo
@@ -33,7 +33,7 @@ struct CONCAT:
fn forward[attributes: AttributeVector](
inputs: List[Symbol],
outputs: List[Symbol],
parameters: Parameters,
inout parameters: Parameters,
):
alias dim = attributes["dim"].value().to_int() if attributes["dim"] else 0
var n_chunks = Self.calc_chunks(inputs[0].shape, dim)
@@ -58,7 +58,7 @@ struct CONCAT:
fn backward[input_id: Int, attributes: AttributeVector](
inputs: List[Symbol],
outputs: List[Symbol],
parameters: Parameters,
inout parameters: Parameters,
) -> Tensor[dtype]:
alias dim = attributes["dim"].value().to_int() if attributes["dim"] else 0
var n_chunks = Self.calc_chunks(inputs[0].shape, dim)
@@ -113,7 +113,7 @@ struct SPLIT:
fn forward[attributes: AttributeVector](
inputs: List[Symbol],
outputs: List[Symbol],
parameters: Parameters,
inout parameters: Parameters,
):
alias dim = attributes["dim"].value().to_int() if attributes["dim"] else 0
alias sections = attributes["sections"].value().to_shape()
@@ -139,7 +139,7 @@ struct SPLIT:
fn backward[input_id: Int, attributes: AttributeVector](
inputs: List[Symbol],
outputs: List[Symbol],
parameters: Parameters,
inout parameters: Parameters,
) -> Tensor[dtype]:
alias dim = attributes["dim"].value().to_int() if attributes["dim"] else 0
alias sections = attributes["sections"].value().to_shape()
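The only change in this file is the argument convention: `parameters` becomes `inout` because CONCAT and SPLIT write their results into tensors held by `Parameters`, and a Mojo `fn` argument is an immutable borrow unless marked `inout`. A minimal standalone illustration of that rule (nothing below is Basalt API):

```mojo
struct Counter:
    var value: Int

    fn __init__(inout self):
        self.value = 0

fn bump(inout c: Counter):
    # Legal only because `c` is declared inout; with the default borrowed
    # convention this mutation would not compile.
    c.value += 1
```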