diff --git a/Cargo.toml b/Cargo.toml index 9c161cfe8..f1bad8cef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,10 +33,11 @@ matrixmultiply = { version = "0.3.2", default-features = false, optional = true zip = { version = "0.6.2", default-features = false, optional = true } cblas-sys = { version = "0.1.4", default-features = false, optional = true } libc = { version = "0.2", default-features = false, optional = true } -cudarc = { version = "0.9.7", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc"] } +cudarc = { git = "https://github.com/coreylowman/cudarc", branch = "dfdx-half", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc", "f16"] } num-traits = { version = "0.2.15", default-features = false } safetensors = { version = "0.3", default-features = false, optional = true } memmap2 = { version = "0.5", default-features = false, optional = true } +half = { git = "https://github.com/starkat99/half-rs.git", branch = "main", optional = true, features = ["num-traits", "rand_distr"] } [dev-dependencies] tempfile = "3.3.0" @@ -48,7 +49,7 @@ glob = { version = "0.3.1", optional = true } [features] default = ["std", "fast-alloc", "cpu-par-matmul"] -nightly = [] +nightly = ["half?/use-intrinsics"] std = ["cudarc?/std", "matrixmultiply?/std", "rand_distr/std_math"] fast-alloc = ["std"] @@ -61,9 +62,12 @@ cpu-mkl-matmul = ["dep:cblas-sys", "dep:libc"] cuda = ["dep:cudarc", "dep:glob"] cudnn = ["cuda", "cudarc?/cudnn"] +f16 = ["dep:half"] + numpy = ["dep:zip", "std"] safetensors = ["dep:safetensors", "std", "dep:memmap2"] +test-f16 = ["f16"] test-f64 = [] test-integrations = [] ci-check = ["cudarc?/ci-check"] diff --git a/build.rs b/build.rs index 3860f5fb0..826c3e9ce 100644 --- a/build.rs +++ b/build.rs @@ -4,6 +4,9 @@ fn main() { // If on nightly, enable "nightly" feature maybe_enable_nightly(); + #[cfg(feature = "cuda")] + cuda::set_include_dir(); + #[cfg(feature = "cuda")] cuda::build_ptx(); @@ -25,6 +28,52 @@ fn maybe_enable_nightly() { #[cfg(feature = "cuda")] mod cuda { + pub fn set_include_dir() { + // NOTE: copied from cudarc build.rs. + // We can't actually set a env!() value from another crate, + // so we have to do that here. 
+
+        use std::path::PathBuf;
+
+        let env_vars = [
+            "CUDA_PATH",
+            "CUDA_ROOT",
+            "CUDA_TOOLKIT_ROOT_DIR",
+            "CUDNN_LIB",
+        ];
+        #[allow(unused)]
+        let env_vars = env_vars
+            .into_iter()
+            .map(std::env::var)
+            .filter_map(Result::ok)
+            .map(Into::<PathBuf>::into);
+
+        let roots = [
+            "/usr",
+            "/usr/local/cuda",
+            "/opt/cuda",
+            "/usr/lib/cuda",
+            "C:/Program Files/NVIDIA GPU Computing Toolkit",
+            "C:/CUDA",
+        ];
+        #[allow(unused)]
+        let roots = roots.into_iter().map(Into::<PathBuf>::into);
+
+        #[cfg(feature = "ci-check")]
+        let root: PathBuf = "ci".into();
+
+        #[cfg(not(feature = "ci-check"))]
+        let root = env_vars
+            .chain(roots)
+            .find(|path| path.join("include").join("cuda.h").is_file())
+            .unwrap();
+
+        println!(
+            "cargo:rustc-env=CUDA_INCLUDE_DIR={}",
+            root.join("include").display()
+        );
+    }
+
     pub fn build_ptx() {
         let out_dir = std::env::var("OUT_DIR").unwrap();
         let kernel_paths: Vec<std::path::PathBuf> = glob::glob("src/**/*.cu")
@@ -38,6 +87,10 @@ mod cuda {
 
         for path in &mut include_directories {
             println!("cargo:rerun-if-changed={}", path.display());
+            let destination =
+                std::format!("{out_dir}/{}", path.file_name().unwrap().to_str().unwrap());
+            println!("cargo:rerun-if-changed={}", destination);
+            std::fs::copy(path.clone(), destination).unwrap();
             // remove the filename from the path so it's just the directory
             path.pop();
         }
@@ -130,6 +183,8 @@ mod cuda {
                 .args(["--output-directory", &out_dir])
                 .args(&include_options)
                 .arg(p)
+                .stdout(std::process::Stdio::piped())
+                .stderr(std::process::Stdio::piped())
                 .spawn()
                 .unwrap()
         })
@@ -139,7 +194,9 @@ mod cuda {
         let output = child.wait_with_output().unwrap();
         assert!(
             output.status.success(),
-            "nvcc error while compiling {kernel_path:?}: {output:?}",
+            "nvcc error while compiling {kernel_path:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
+            String::from_utf8_lossy(&output.stdout),
+            String::from_utf8_lossy(&output.stderr)
         );
     }
diff --git a/src/lib.rs b/src/lib.rs
index 0aef06d65..60915a9d6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -241,6 +241,7 @@ pub fn keep_denormals() {
 
 #[cfg(test)]
 pub(crate) mod tests {
+    pub use num_traits::{Float, FromPrimitive, NumCast, Zero};
 
     #[cfg(not(feature = "cuda"))]
     pub type TestDevice = crate::tensor::Cpu;
@@ -248,9 +249,15 @@ pub(crate) mod tests {
     #[cfg(feature = "cuda")]
     pub type TestDevice = crate::tensor::Cuda;
 
-    #[cfg(not(feature = "test-f64"))]
+    #[cfg(all(feature = "test-f64", feature = "test-f16"))]
+    compile_error!("f64 and f16 cannot be tested at the same time");
+
+    #[cfg(all(not(feature = "test-f16"), not(feature = "test-f64")))]
     pub type TestDtype = f32;
 
+    #[cfg(feature = "test-f16")]
+    pub type TestDtype = half::f16;
+
     #[cfg(feature = "test-f64")]
     pub type TestDtype = f64;
 
@@ -275,6 +282,19 @@ pub(crate) mod tests {
         }
     }
 
+    #[cfg(feature = "f16")]
+    impl AssertClose for half::f16 {
+        type Elem = Self;
+        const DEFAULT_TOLERANCE: Self::Elem = half::f16::from_f32_const(1e-2);
+        fn get_far_pair(&self, rhs: &Self, tolerance: Self) -> Option<(Self, Self)> {
+            if num_traits::Float::abs(self - rhs) > tolerance {
+                Some((*self, *rhs))
+            } else {
+                None
+            }
+        }
+    }
+
     impl AssertClose for f32 {
         type Elem = f32;
         const DEFAULT_TOLERANCE: Self::Elem = 1e-6;
@@ -349,12 +369,9 @@ pub(crate) mod tests {
     macro_rules! assert_close_to_literal {
         ($Lhs:expr, $Rhs:expr) => {{
             let lhs = $Lhs.array();
+            let rhs = $Rhs.ndmap(|x| num_traits::FromPrimitive::from_f64(x).unwrap());
             let tol = AssertClose::get_default_tol(&lhs);
-            let far_pair = AssertClose::get_far_pair(
-                &lhs,
-                &$Rhs.ndmap(|x| num_traits::FromPrimitive::from_f64(x).unwrap()),
-                tol,
-            );
+            let far_pair = AssertClose::get_far_pair(&lhs, &rhs, tol);
             if let Some((l, r)) = far_pair {
                 panic!("lhs != rhs | {l} != {r}");
             }
@@ -411,5 +428,6 @@ pub(crate) mod tests {
             }
         }};
     }
+    pub(crate) use assert_close;
 }
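Note on the test plumbing above: with `test-f16` enabled every test runs at half precision, which is why `AssertClose` for `half::f16` uses the loose `1e-2` default tolerance. A minimal standalone sketch of why that tolerance is needed (assumes only the `half` crate; not part of the PR itself):

```rust
use half::f16;

fn main() {
    // f16 carries ~3 decimal digits: the spacing between representable
    // values near 1.0 is 2^-10 ≈ 9.8e-4, so a 1e-4 perturbation vanishes.
    let x = f16::from_f32(1.0) + f16::from_f32(1e-4);
    assert_eq!(x, f16::from_f32(1.0));
    // Hence comparisons against f32-derived literals need a ~1e-2 tolerance.
}
```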
diff --git a/src/losses.rs b/src/losses.rs
index e09ef67fc..5da12f2a3 100644
--- a/src/losses.rs
+++ b/src/losses.rs
@@ -47,7 +47,7 @@ pub fn mae_loss<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>>(
 pub fn huber_loss<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>>(
     pred: Tensor<S, E, D, T>,
     targ: Tensor<S, E, D>,
-    delta: impl Into<E>,
+    delta: impl Into<f64>,
 ) -> Tensor<Rank0, E, D, T> {
     pred.huber_error(targ, delta).mean()
 }
@@ -62,10 +62,10 @@ pub fn huber_loss<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>>(
 pub fn smooth_l1_loss<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>>(
     pred: Tensor<S, E, D, T>,
     targ: Tensor<S, E, D>,
-    delta: impl Into<E>,
+    delta: impl Into<f64>,
 ) -> Tensor<Rank0, E, D, T> {
-    let delta = delta.into();
-    huber_loss(pred, targ, delta) / delta
+    let delta: f64 = delta.into();
+    huber_loss(pred, targ, delta) / E::from_f64(delta).unwrap()
 }
 
 /// [Cross entropy loss](https://en.wikipedia.org/wiki/Cross_entropy#Cross-entropy_loss_function_and_logistic_regression).
@@ -132,10 +132,12 @@ mod tests {
     #[test]
     fn test_mse() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> =
-            dev.tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048]);
-        let y: Tensor<_, TestDtype, _> =
-            dev.tensor([-0.90954804, -1.0193185, -0.39221755, 2.2524886, 1.3035554]);
+        let x = dev
+            .tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048])
+            .to_dtype::<TestDtype>();
+        let y = dev
+            .tensor([-0.90954804, -1.0193185, -0.39221755, 2.2524886, 1.3035554])
+            .to_dtype::<TestDtype>();
         let loss = mse_loss(x.leaky_trace(), y);
         assert_close_to_literal!(loss, 1.0846305);
         let g = loss.backward();
@@ -148,10 +150,12 @@ mod tests {
     #[test]
     fn test_mae() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> =
-            dev.tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048]);
-        let y: Tensor<_, TestDtype, _> =
-            dev.tensor([-0.90954804, -1.0193186, -0.39221755, 2.2524886, 1.3035554]);
+        let x = dev
+            .tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048])
+            .to_dtype::<TestDtype>();
+        let y = dev
+            .tensor([-0.90954804, -1.0193186, -0.39221755, 2.2524886, 1.3035554])
+            .to_dtype::<TestDtype>();
         let loss = mae_loss(x.leaky_trace(), y);
         assert_close_to_literal!(loss, 0.9042107);
         let g = loss.backward();
@@ -161,14 +165,18 @@ mod tests {
     #[test]
     fn test_soft_cross_entropy() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.01322946, 0.7367754, -0.8874471, 0.6997109, 0.98312855],
-            [-0.19822043, 1.192167, -0.7495395, -1.5733303, -1.4898887],
-        ]);
-        let y: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.3180433, 0.15164024, 0.2352255, 0.08821669, 0.20687431],
-            [0.15627657, 0.29779273, 0.10897867, 0.2879545, 0.14899758],
-        ]);
+        let x = dev
+            .tensor([
+                [0.01322946, 0.7367754, -0.8874471, 0.6997109, 0.98312855],
+                [-0.19822043, 1.192167, -0.7495395, -1.5733303, -1.4898887],
+            ])
+            .to_dtype::<TestDtype>();
+        let y = dev
+            .tensor([
+                [0.3180433, 0.15164024, 0.2352255, 0.08821669, 0.20687431],
+                [0.15627657, 0.29779273, 0.10897867, 0.2879545, 0.14899758],
+            ])
+            .to_dtype::<TestDtype>();
         let loss = cross_entropy_with_logits_loss(x.leaky_trace(), y.clone());
         assert_close_to_literal!(loss, 1.9889611);
         let g = loss.backward();
@@ -191,13 +199,14 @@ mod tests {
     #[test]
     fn test_hard_crossentropy() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> =
-            dev.tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048]);
+        let x = dev
+            .tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048])
+            .to_dtype::<TestDtype>();
         let losses = [1.5655229, 2.680529, 3.444099, 1.2829198, 0.883499];
         for i in 0..5 {
             let mut targ = [0.0; 5];
             targ[i] = 1.0;
-            let y = dev.tensor(targ);
+            let y = dev.tensor(targ).to_dtype::<TestDtype>();
             let loss = cross_entropy_with_logits_loss(x.leaky_trace(), y.clone());
             assert_close_to_literal!(loss, losses[i]);
         }
@@ -206,20 +215,24 @@ mod tests {
     #[test]
     fn test_kl_div() {
         let dev: TestDevice = Default::default();
-        let logits: Tensor<_, TestDtype, _> = dev.tensor([
-            [-0.2354, 0.4408, 0.9688],
-            [-0.2187, -0.3451, -1.5473],
-            [0.7420, 0.7186, 1.0785],
-            [-1.2231, 0.2536, 0.3489],
-            [-0.9163, -0.2289, 0.2576],
-        ]);
-        let targ: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.3178, 0.5344, 0.1479],
-            [0.1915, 0.6178, 0.1907],
-            [0.4834, 0.1789, 0.3377],
-            [0.5809, 0.3623, 0.0568],
-            [0.0166, 0.8512, 0.1322],
-        ]);
+        let logits = dev
+            .tensor([
+                [-0.2354, 0.4408, 0.9688],
+                [-0.2187, -0.3451, -1.5473],
+                [0.7420, 0.7186, 1.0785],
+                [-1.2231, 0.2536, 0.3489],
+                [-0.9163, -0.2289, 0.2576],
+            ])
+            .to_dtype::<TestDtype>();
+        let targ = dev
+            .tensor([
+                [0.3178, 0.5344, 0.1479],
+                [0.1915, 0.6178, 0.1907],
+                [0.4834, 0.1789, 0.3377],
+                [0.5809, 0.3623, 0.0568],
+                [0.0166, 0.8512, 0.1322],
+            ])
+            .to_dtype::<TestDtype>();
         let loss = kl_div_with_logits_loss(logits.leaky_trace(), targ);
         assert_close_to_literal!(loss, 0.40656143);
         let g = loss.backward();
@@ -238,16 +251,20 @@ mod tests {
     #[test]
     fn test_bce() {
         let dev: TestDevice = Default::default();
-        let logit: Tensor<_, TestDtype, _> = dev.tensor([
-            [-0.4092005, -0.6706018, 0.9201696],
-            [-1.6583557, 1.6978683, -1.4827578],
-            [-0.9571696, -1.0971526, 0.8801755],
-        ]);
-        let prob: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.365251, 0.8322099, 0.482717],
-            [0.168392, 0.7987092, 0.1177533],
-            [0.7026833, 0.5563793, 0.6429267],
-        ]);
+        let logit = dev
+            .tensor([
+                [-0.4092005, -0.6706018, 0.9201696],
+                [-1.6583557, 1.6978683, -1.4827578],
+                [-0.9571696, -1.0971526, 0.8801755],
+            ])
+            .to_dtype::<TestDtype>();
+        let prob = dev
+            .tensor([
+                [0.365251, 0.8322099, 0.482717],
+                [0.168392, 0.7987092, 0.1177533],
+                [0.7026833, 0.5563793, 0.6429267],
+            ])
+            .to_dtype::<TestDtype>();
         let loss = binary_cross_entropy_with_logits_loss(logit.leaky_trace(), prob.clone());
         assert_close_to_literal!(loss, 0.7045728);
 
@@ -275,9 +292,10 @@ mod tests {
     #[test]
     fn test_bce_wide_range() {
         let dev: TestDevice = Default::default();
-        let logit: Tensor<_, TestDtype, _> =
-            dev.tensor([[100.0; 3], [-100.0; 3], [-1.0, 0.0, 1.0]]);
-        let targ: Tensor<_, TestDtype, _> = dev.tensor([[0.0, 0.5, 1.0]; 3]);
+        let logit = dev
+            .tensor([[100.0; 3], [-100.0; 3], [-1.0, 0.0, 1.0]])
+            .to_dtype::<TestDtype>();
+        let targ = dev.tensor([[0.0, 0.5, 1.0]; 3]).to_dtype::<TestDtype>();
 
         let loss = binary_cross_entropy_with_logits_loss(logit.leaky_trace(), targ.clone());
         assert_close_to_literal!(loss, 33.479964);
@@ -306,16 +324,20 @@ mod tests {
     #[test]
     fn test_huber_loss() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([
-            [1.0095837, -1.0026205, -0.1126093, -0.1539351, -0.3688708],
-            [2.6373475, 0.6761999, -1.3586733, 0.486154, -0.6206786],
-            [-1.2967702, -0.1273358, 1.3558478, 0.0787393, 1.0921133],
-        ]);
-        let y: Tensor<_, TestDtype, _> = dev.tensor([
-            [1.2569424, -1.2246597, 0.7995769, 0.0339246, -0.3688708],
-            [1.472675, 0.8260061, 0.7839395, -0.0541475, -0.6206786],
-            [-2.0449343, 1.8117315, 1.7505344, -1.2522424, 1.0921133],
-        ]);
+        let x = dev
+            .tensor([
+                [1.0095837, -1.0026205, -0.1126093, -0.1539351, -0.3688708],
+                [2.6373475, 0.6761999, -1.3586733, 0.486154, -0.6206786],
+                [-1.2967702, -0.1273358, 1.3558478, 0.0787393, 1.0921133],
+            ])
+            .to_dtype::<TestDtype>();
+        let y = dev
+            .tensor([
+                [1.2569424, -1.2246597, 0.7995769, 0.0339246, -0.3688708],
+                [1.472675, 0.8260061, 0.7839395, -0.0541475, -0.6206786],
+                [-2.0449343, 1.8117315, 1.7505344, -1.2522424, 1.0921133],
+            ])
+            .to_dtype::<TestDtype>();
 
         let loss = huber_loss(x.leaky_trace(), y.clone(), 0.5);
         assert_close_to_literal!(loss, 0.24506615);
@@ -342,16 +364,20 @@ mod tests {
     #[test]
     fn test_smooth_l1_loss() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([
-            [1.0095837, -1.0026205, -0.1126093, -0.1539351, -0.3688708],
-            [2.6373475, 0.6761999, -1.3586733, 0.486154, -0.6206786],
-            [-1.2967702, -0.1273358, 1.3558478, 0.0787393, 1.0921133],
-        ]);
-        let y: Tensor<_, TestDtype, _> = dev.tensor([
-            [1.2569424, -1.2246597, 0.7995769, 0.0339246, -0.3688708],
-            [1.472675, 0.8260061, 0.7839395, -0.0541475, -0.6206786],
-            [-2.0449343, 1.8117315, 1.7505344, -1.2522424, 1.0921133],
-        ]);
+        let x = dev
+            .tensor([
+                [1.0095837, -1.0026205, -0.1126093, -0.1539351, -0.3688708],
+                [2.6373475, 0.6761999, -1.3586733, 0.486154, -0.6206786],
+                [-1.2967702, -0.1273358, 1.3558478, 0.0787393, 1.0921133],
+            ])
+            .to_dtype::<TestDtype>();
+        let y = dev
+            .tensor([
+                [1.2569424, -1.2246597, 0.7995769, 0.0339246, -0.3688708],
+                [1.472675, 0.8260061, 0.7839395, -0.0541475, -0.6206786],
+                [-2.0449343, 1.8117315, 1.7505344, -1.2522424, 1.0921133],
+            ])
+            .to_dtype::<TestDtype>();
 
         let loss = smooth_l1_loss(x.leaky_trace(), y.clone(), 0.5);
         assert_close_to_literal!(loss, 0.4901323);
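The `delta: impl Into<f64>` change above is the pattern used throughout this PR: hyperparameters are carried as `f64` and narrowed to the tensor dtype `E` only where they enter the math. A self-contained sketch of that conversion (assumes the `half` crate with its `num-traits` feature; `scale_by_delta` is an illustrative helper, not dfdx API):

```rust
use num_traits::FromPrimitive;

// Mirrors `huber_loss(..) / E::from_f64(delta).unwrap()` in smooth_l1_loss.
fn scale_by_delta<E: FromPrimitive + std::ops::Div<Output = E>>(x: E, delta: f64) -> E {
    x / E::from_f64(delta).unwrap()
}

fn main() {
    assert_eq!(scale_by_delta(1.0f32, 0.5), 2.0);
    let h = half::f16::from_f32(1.0);
    assert_eq!(scale_by_delta(h, 0.5), half::f16::from_f32(2.0));
}
```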
diff --git a/src/nn/batchnorm1d.rs b/src/nn/batchnorm1d.rs
index f72c9a591..e1d0674ab 100644
--- a/src/nn/batchnorm1d.rs
+++ b/src/nn/batchnorm1d.rs
@@ -1,5 +1,4 @@
 use crate::{shapes::*, tensor::*, tensor_ops::*};
-use num_traits::FromPrimitive;
 
 use super::{
     batchnorm2d::{infer_fwd, train_fwd},
@@ -66,11 +65,11 @@ pub struct BatchNorm1D<const C: usize, E: Dtype, D: Device<E>> {
     /// Spatial variance that is updated during training. Defaults to 1.0
     pub running_var: Tensor<Rank1<C>, E, D>,
     /// Added to variance before taking sqrt for numerical stability. Defaults to 1e-5
-    pub epsilon: E,
+    pub epsilon: f64,
     /// Controls exponential moving average of running stats.Defaults to 0.1
     ///
     /// `running_stat * (1.0 - momentum) + stat * momentum`.
-    pub momentum: E,
+    pub momentum: f64,
 }
 
 impl<const C: usize, E: Dtype, D: Device<E>> BatchNorm1D<C, E, D> {
@@ -206,8 +205,8 @@ impl<const C: usize, E: Dtype, D: Device<E>> TensorCollection<E, D> for BatchNor
                 bias,
                 running_mean,
                 running_var,
-                epsilon: V::E2::from_f32(1e-5).unwrap(),
-                momentum: V::E2::from_f32(0.1).unwrap(),
+                epsilon: 1e-5,
+                momentum: 0.1,
             },
         )
     }
diff --git a/src/nn/batchnorm2d.rs b/src/nn/batchnorm2d.rs
index 120d5b04e..996ad11a7 100644
--- a/src/nn/batchnorm2d.rs
+++ b/src/nn/batchnorm2d.rs
@@ -25,20 +25,20 @@ pub fn train_fwd<const C: usize, S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D
     mean: &mut Tensor<Rank1<C>, E, D>,
     scale: &Tensor<Rank1<C>, E, D>,
     bias: &Tensor<Rank1<C>, E, D>,
-    epsilon: E,
-    momentum: E,
+    epsilon: f64,
+    momentum: f64,
 ) -> Result<Tensor<S, E, D, T>, D::Err>
 where
     S: HasAxes<Ax> + ReduceShapeTo<Rank1<C>, Ax>,
 {
-    let n = E::from_usize(<S as HasAxes<Ax>>::size(x.shape())).unwrap();
+    let n = f64::from_usize(<S as HasAxes<Ax>>::size(x.shape())).unwrap();
     let shape = *x.shape();
 
     // compute statistics for updating running stats later - on tape
     let mean_chan = x.retaped::<T>().try_mean::<Rank1<C>, _>()?;
 
     // update statistics since we are training - off tape
-    mean.try_axpy(E::ONE - momentum, &mean_chan, momentum)?;
+    mean.try_axpy(1.0 - momentum, &mean_chan, momentum)?;
 
     let centered = x.try_sub(mean_chan.try_broadcast_like(&shape)?)?;
 
@@ -48,10 +48,12 @@ where
         .try_mean::<Rank1<C>, _>()?;
 
     // NOTE: uses unbiased variance in running estimate
-    var.try_axpy(E::ONE - momentum, &var_chan, momentum * n / (n - E::ONE))?;
+    var.try_axpy(1.0 - momentum, &var_chan, momentum * n / (n - 1.0))?;
 
     // statistics for normalizing - on tape
-    let std = var_chan.try_add(epsilon)?.try_sqrt()?;
+    let std = var_chan
+        .try_add(E::from_f64(epsilon).unwrap())?
+        .try_sqrt()?;
 
     // record broadcast of scale & bias - on tape
     let scale = scale
@@ -71,7 +73,7 @@ pub fn infer_fwd<const C: usize, S: Shape, E: Dtype, D: Device<E>, Ax: Axes>(
     mean: &Tensor<Rank1<C>, E, D>,
     scale: &Tensor<Rank1<C>, E, D>,
     bias: &Tensor<Rank1<C>, E, D>,
-    epsilon: E,
+    epsilon: f64,
 ) -> Result<Tensor<S, E, D>, D::Err>
 where
     Rank1<C>: BroadcastShapeTo<S, Ax>,
@@ -79,7 +81,10 @@ where
     let shape = *x.shape();
 
     // statistics for normalizing
-    let std = var.clone().try_add(epsilon)?.try_sqrt()?;
+    let std = var
+        .clone()
+        .try_add(E::from_f64(epsilon).unwrap())?
+        .try_sqrt()?;
 
     let scale = scale.clone().try_div(std)?.try_broadcast_like(&shape)?;
 
@@ -134,11 +139,11 @@ pub struct BatchNorm2D<const C: usize, E: Dtype, D: Device<E>> {
     /// Spatial variance that is updated during training. Defaults to 1.0
     pub running_var: Tensor<Rank1<C>, E, D>,
     /// Added to variance before taking sqrt for numerical stability. Defaults to 1e-5
-    pub epsilon: E,
+    pub epsilon: f64,
     /// Controls exponential moving average of running stats.Defaults to 0.1
     ///
     /// `running_stat * (1.0 - momentum) + stat * momentum`.
-    pub momentum: E,
+    pub momentum: f64,
 }
 
 impl<const C: usize, E: Dtype, D: Device<E>> BatchNorm2D<C, E, D> {
@@ -273,8 +278,8 @@ impl<const C: usize, E: Dtype, D: Device<E>> TensorCollection<E, D> for BatchNor
                 bias,
                 running_mean,
                 running_var,
-                epsilon: V::E2::from_f32(1e-5).unwrap(),
-                momentum: V::E2::from_f32(0.1).unwrap(),
+                epsilon: 1e-5,
+                momentum: 0.1,
             },
         )
     }
diff --git a/src/nn/conv.rs b/src/nn/conv.rs
index 942ad80e9..07214ec30 100644
--- a/src/nn/conv.rs
+++ b/src/nn/conv.rs
@@ -185,7 +185,10 @@ mod tests {
         let out = m.forward(dev.sample_normal::>().leaky_trace());
         let g = out.square().mean().backward();
 
-        assert_ne!(g.get(&m.weight).array(), [[[[0.0; 3]; 3]; 2]; 4]);
+        assert_ne!(
+            g.get(&m.weight).array(),
+            [[[[TestDtype::zero(); 3]; 3]; 2]; 4]
+        );
 
         opt.update(&mut m, &g).expect("unused params");
 
diff --git a/src/nn/convtrans.rs b/src/nn/convtrans.rs
index 12f708e79..f338f2ebb 100644
--- a/src/nn/convtrans.rs
+++ b/src/nn/convtrans.rs
@@ -185,7 +185,10 @@ mod tests {
         let out = m.forward(dev.sample_normal::>().leaky_trace());
         let g = out.square().mean().backward();
 
-        assert_ne!(g.get(&m.weight).array(), [[[[0.0; 3]; 3]; 2]; 4]);
+        assert_ne!(
+            g.get(&m.weight).array(),
+            [[[[TestDtype::zero(); 3]; 3]; 2]; 4]
+        );
 
         opt.update(&mut m, &g).expect("unused params");
 
diff --git a/src/nn/dropout.rs b/src/nn/dropout.rs
index 87aa1a07f..0ed7d5183 100644
--- a/src/nn/dropout.rs
+++ b/src/nn/dropout.rs
@@ -74,7 +74,7 @@ impl<const N: usize, S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> ModuleMut
         input: Tensor<S, E, D, T>,
     ) -> Result<Self::Output, D::Err> {
-        input.try_dropout(E::ONE / E::from_usize(N).unwrap())
+        input.try_dropout(1.0 / N as f64)
     }
 }
 
@@ -150,7 +150,7 @@ impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> ModuleMut
         &mut self,
         input: Tensor<S, E, D, T>,
     ) -> Result<Self::Output, D::Err> {
-        input.try_dropout(E::from_f32(self.p).unwrap())
+        input.try_dropout(self.p)
     }
 }
 
diff --git a/src/nn/ema.rs b/src/nn/ema.rs
index 5b2d15f6d..c5643e6fb 100644
--- a/src/nn/ema.rs
+++ b/src/nn/ema.rs
@@ -2,10 +2,10 @@ use super::tensor_collection::*;
 
 use crate::{shapes::*, tensor::*, tensor_ops::Device};
 
-struct ModelEMAOp<E> {
-    decay: E,
+struct ModelEMAOp {
+    decay: f64,
 }
-impl<E: Dtype, D: Device<E>> TensorVisitor<E, D> for ModelEMAOp<E> {
+impl<E: Dtype, D: Device<E>> TensorVisitor<E, D> for ModelEMAOp {
     type Viewer = (ViewTensorMut, ViewTensorRef);
     type Err = D::Err;
     type E2 = E;
@@ -17,7 +17,7 @@ impl<E: Dtype, D: Device<E>> TensorVisitor<E, D> for ModelEMAOp {
         (dst, src): (&mut Tensor<S, E, D>, &Tensor<S, E, D>),
     ) -> Result<Option<Tensor<S, Self::E2, D>>, Self::Err> {
         if opts.do_gradient_update {
-            dst.try_axpy(self.decay, src, E::ONE - self.decay)?;
+            dst.try_axpy(self.decay, src, 1.0 - self.decay)?;
         }
         Ok(None)
     }
@@ -42,11 +42,12 @@ pub trait ModelEMA<E: Dtype, D: Device<E>>: TensorCollection<E, D> {
     ///
     /// **Only updates trainable parameters**. For example, batch normalization
     /// running parameters are not updated.
-    fn ema(&mut self, other: &Self, decay: E) {
+    fn ema(&mut self, other: &Self, decay: impl Into<f64>) {
         self.try_ema(other, decay).unwrap();
     }
 
-    fn try_ema(&mut self, other: &Self, decay: E) -> Result<(), D::Err> {
+    fn try_ema(&mut self, other: &Self, decay: impl Into<f64>) -> Result<(), D::Err> {
+        let decay = decay.into();
         let mut op = ModelEMAOp { decay };
         Self::iter_tensors(&mut RecursiveWalker {
             m: (self, other),
@@ -75,7 +76,7 @@ mod tests {
         ema1.1 .1.running_var.fill_with_distr(distr);
         let ema0 = ema1.clone();
 
-        let decay: TestDtype = 0.5;
+        let decay = 0.5;
         ema1.ema(&model, decay);
         // check that batchnorm running params aren't updated
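With `decay: impl Into<f64>`, callers pass a plain literal no matter what dtype the model holds. A hedged call-site fragment (model construction elided; `model` and `ema_model` stand for any matching `TensorCollection` pair):

```rust
// `0.999` is accepted as f64 for f16/f32/f64 models alike.
ema_model.ema(&model, 0.999);
// fallible variant:
ema_model.try_ema(&model, 0.999)?;
```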
diff --git a/src/nn/embedding.rs b/src/nn/embedding.rs
index 0a109b95b..ec576dac9 100644
--- a/src/nn/embedding.rs
+++ b/src/nn/embedding.rs
@@ -122,7 +122,7 @@ mod tests {
     use super::*;
     use crate::tests::*;
 
-    const W: [[TestDtype; 5]; 2] = [
+    const W: [[f64; 5]; 2] = [
         [-0.3458893, -0.30371523, -0.3712057, 0.14303583, -0.0268966],
         [0.11733949, 0.14059687, -0.10670426, -0.09373143, 0.18974298],
     ];
@@ -131,9 +131,9 @@ mod tests {
     fn test_embedding_initialize() {
         let dev: TestDevice = Default::default();
         let m = dev.build_module::<Embedding<2000, 1>, TestDtype>();
-        let bound = 1.0 / (2000.0.sqrt());
+        let bound: TestDtype = NumCast::from(1.0 / (2000.0.sqrt())).unwrap();
         for v in m.weight.as_vec() {
-            assert!(-bound <= v && v <= bound && v != 0.0);
+            assert!(-bound <= v && v <= bound && v != TestDtype::zero());
         }
     }
 
@@ -143,7 +143,8 @@ mod tests {
 
         let model = Embedding {
             weight: dev.tensor(W),
-        };
+        }
+        .to_dtype::<TestDtype>();
 
         let x = dev.tensor([0, 0, 1]);
         let y = model.forward(x.leaky_trace());
diff --git a/src/nn/generalized_residual.rs b/src/nn/generalized_residual.rs
index 0acc3789a..9000c5f11 100644
--- a/src/nn/generalized_residual.rs
+++ b/src/nn/generalized_residual.rs
@@ -106,9 +106,10 @@ mod tests {
         let dev: TestDevice = Default::default();
 
         type Model = GeneralizedResidual<Linear<2, 2>, Linear<2, 2>>;
-        let model = dev.build_module::<Model, TestDtype>();
+        let model = dev.build_module::<Model, f32>().to_dtype::<TestDtype>();
 
-        let x = dev.sample_normal::<Rank1<2>>();
+        let x: Tensor<Rank1<2>, f32, _> = dev.sample_normal();
+        let x = x.to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
 
         #[rustfmt::skip]
diff --git a/src/nn/impl_module_for_tuples.rs b/src/nn/impl_module_for_tuples.rs
index c267b7566..1dd2cbdd6 100644
--- a/src/nn/impl_module_for_tuples.rs
+++ b/src/nn/impl_module_for_tuples.rs
@@ -108,7 +108,7 @@ mod tests {
     fn test_2_tuple_update() {
         let dev: TestDevice = Default::default();
         type Model = (Linear<2, 3>, Linear<3, 4>);
-        let mut model = Model::build_on_device(&dev);
+        let mut model = dev.build_module::<Model, TestDtype>();
         assert_ne!(model.0.weight.array(), [[0.0; 2]; 3]);
         assert_ne!(model.0.bias.array(), [0.0; 3]);
         assert_ne!(model.1.weight.array(), [[0.0; 3]; 4]);
diff --git a/src/nn/layer_norm.rs b/src/nn/layer_norm.rs
index f7559a7a3..3c2f25581 100644
--- a/src/nn/layer_norm.rs
+++ b/src/nn/layer_norm.rs
@@ -1,5 +1,4 @@
 use crate::{shapes::*, tensor::*, tensor_ops::*};
-use num_traits::FromPrimitive;
 
 use super::*;
 
@@ -40,7 +39,7 @@ where
 pub struct LayerNorm1D<const M: usize, E: Dtype, D: Device<E>> {
     pub gamma: Tensor<Rank1<M>, E, D>,
     pub beta: Tensor<Rank1<M>, E, D>,
-    pub epsilon: E,
+    pub epsilon: f64,
 }
 
 impl<const M: usize, E: Dtype, D: Device<E>> NonMutableModule for LayerNorm1D<M, E, D> {}
 
@@ -69,7 +68,7 @@ impl<const M: usize, E: Dtype, D: Device<E>> TensorCollection<E, D> for LayerNor
             |(gamma, beta)| LayerNorm1D {
                 gamma,
                 beta,
-                epsilon: V::E2::from_f32(1e-5).unwrap(),
+                epsilon: 1e-5,
             },
         )
     }
@@ -126,19 +125,19 @@ mod tests {
         let dev: TestDevice = Default::default();
         let mut m = dev.build_module::<LayerNorm1D<5>, TestDtype>();
 
-        assert_eq!(m.gamma.array(), [1.0; 5]);
-        assert_eq!(m.beta.array(), [0.0; 5]);
+        assert_close_to_literal!(m.gamma, [1.0; 5]);
+        assert_close_to_literal!(m.beta, [0.0; 5]);
 
         m.gamma = dev.sample_normal();
         m.beta = dev.sample_normal();
 
-        assert_ne!(m.gamma.array(), [1.0; 5]);
-        assert_ne!(m.beta.array(), [0.0; 5]);
+        assert_ne!(m.gamma.array(), [TestDtype::ONE; 5]);
+        assert_ne!(m.beta.array(), [TestDtype::default(); 5]);
 
         m.reset_params();
 
-        assert_eq!(m.gamma.array(), [1.0; 5]);
-        assert_eq!(m.beta.array(), [0.0; 5]);
+        assert_close_to_literal!(m.gamma, [1.0; 5]);
+        assert_close_to_literal!(m.beta, [0.0; 5]);
     }
 
     #[test]
diff --git a/src/nn/linear.rs b/src/nn/linear.rs
index 3e951ccd8..c76a30a29 100644
--- a/src/nn/linear.rs
+++ b/src/nn/linear.rs
@@ -154,11 +154,11 @@ mod tests {
     use super::*;
     use crate::tests::*;
 
-    const W: [[TestDtype; 5]; 2] = [
+    const W: [[f64; 5]; 2] = [
         [-0.3458893, -0.30371523, -0.3712057, 0.14303583, -0.0268966],
         [0.11733949, 0.14059687, -0.10670426, -0.09373143, 0.18974298],
     ];
-    const B: [TestDtype; 2] = [0.3765365, -0.290717];
+    const B: [f64; 2] = [0.3765365, -0.290717];
 
     #[test]
     fn test_linear_ondevice() {
@@ -173,13 +173,12 @@ mod tests {
     fn test_linear_initialize() {
         let dev: TestDevice = Default::default();
         let m = dev.build_module::<Linear<2000, 1>, TestDtype>();
-        let bound: TestDtype = 1.0 / 2000.0;
-        let bound = bound.sqrt();
+        let bound: TestDtype = NumCast::from((1.0 / 2000.0f64).sqrt()).unwrap();
         for v in m.weight.as_vec() {
-            assert!(-bound <= v && v <= bound && v != 0.0);
+            assert!(-bound <= v && v <= bound);
         }
         for v in m.bias.as_vec() {
-            assert!(-bound <= v && v <= bound && v != 0.0);
+            assert!(-bound <= v && v <= bound);
         }
     }
 
@@ -190,9 +189,12 @@ mod tests {
         let model = Linear {
             weight: dev.tensor(W),
             bias: dev.tensor(B),
-        };
+        }
+        .to_dtype::<TestDtype>();
 
-        let x = dev.tensor([-0.8808001, 2.4185333, 2.2478335, 0.0565211, 2.031299]);
+        let x = dev
+            .tensor([-0.8808001, 2.4185333, 2.2478335, 0.0565211, 2.031299])
+            .to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(y, [-0.93430865, 0.08624211]);
 
@@ -214,13 +216,16 @@ mod tests {
         let model = Linear {
             weight: dev.tensor(W),
             bias: dev.tensor(B),
-        };
-
-        let x = dev.tensor([
-            [-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643],
-            [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705],
-            [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455],
-        ]);
+        }
+        .to_dtype::<TestDtype>();
+
+        let x = dev
+            .tensor([
+                [-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643],
+                [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705],
+                [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455],
+            ])
+            .to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(
             y,
@@ -249,13 +254,14 @@ mod tests {
         let model = Linear {
             weight: dev.tensor(W),
             bias: dev.tensor(B),
-        };
+        }
+        .to_dtype::<TestDtype>();
 
         #[rustfmt::skip]
         let x = dev.tensor([
             [[-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643], [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705], [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455]],
             [[1.2879219, 0.70150787, -1.6746868, 1.7261779, -0.94021803], [-2.6883178, 2.9369607, 2.9256766, 0.27559614, -0.17530347], [0.17499207, -0.11440835, 0.16627812, -0.91773695, 1.1128315]],
-        ]);
+        ]).to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(
             y,
diff --git a/src/nn/repeated.rs b/src/nn/repeated.rs
index dc8a0acb5..63cec4529 100644
--- a/src/nn/repeated.rs
+++ b/src/nn/repeated.rs
@@ -97,8 +97,11 @@ mod tests {
         let m = dev.build_module::<Model, TestDtype>();
 
         for i in 0..5 {
-            assert_ne!(m.modules[i].0.weight.array(), [[0.0; 3]; 3]);
-            assert_ne!(m.modules[i].0.bias.array(), [0.0; 3]);
+            assert_ne!(
+                m.modules[i].0.weight.array(),
+                [[TestDtype::default(); 3]; 3]
+            );
+            assert_ne!(m.modules[i].0.bias.array(), [TestDtype::default(); 3]);
         }
     }
 
diff --git a/src/nn/residual.rs b/src/nn/residual.rs
index 40c6f3ac2..fbf7c6c78 100644
--- a/src/nn/residual.rs
+++ b/src/nn/residual.rs
@@ -71,17 +71,20 @@ mod tests {
     fn test_residual_reset() {
         let dev: TestDevice = Default::default();
         let model = dev.build_module::<Residual<Linear<2, 5>>, TestDtype>();
-        assert_ne!(model.0.weight.array(), [[0.0; 2]; 5]);
-        assert_ne!(model.0.bias.array(), [0.0; 5]);
+        assert_ne!(model.0.weight.array(), [[TestDtype::default(); 2]; 5]);
+        assert_ne!(model.0.bias.array(), [TestDtype::default(); 5]);
     }
 
     #[test]
     fn test_residual_gradients() {
         let dev: TestDevice = Default::default();
 
-        let model = <Residual<Linear<2, 5>>>::build_on_device(&dev);
+        let model = dev
+            .build_module::<Residual<Linear<2, 5>>, f32>()
+            .to_dtype::<TestDtype>();
 
-        let x: Tensor<Rank2<4, 2>, f32, TestDevice> = dev.sample_normal();
+        let x: Tensor<Rank2<4, 2>, f32, _> = dev.sample_normal();
+        let x = x.to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
 
         #[rustfmt::skip]
diff --git a/src/nn/split_into.rs b/src/nn/split_into.rs
index fa8343f48..b3d7a2370 100644
--- a/src/nn/split_into.rs
+++ b/src/nn/split_into.rs
@@ -109,8 +109,8 @@ mod tests {
         let gr = right.mean().backward();
         let l = left.retaped::<NoneTape>();
         let gl = left.mean().backward();
-        assert_ne!(gl.get(&l).array(), [0.0; 1]);
-        assert_ne!(gr.get(&r).array(), [0.0; 1]);
+        assert_ne!(gl.get(&l).array(), [TestDtype::zero(); 1]);
+        assert_ne!(gr.get(&r).array(), [TestDtype::zero(); 1]);
     }
 
     #[test]
diff --git a/src/nn/unbiased_linear.rs b/src/nn/unbiased_linear.rs
index 239ea5799..d94acf047 100644
--- a/src/nn/unbiased_linear.rs
+++ b/src/nn/unbiased_linear.rs
@@ -96,7 +96,7 @@ mod tests {
     use super::*;
     use crate::tests::*;
 
-    const W: [[TestDtype; 5]; 2] = [
+    const W: [[f64; 5]; 2] = [
         [-0.3458893, -0.30371523, -0.3712057, 0.14303583, -0.0268966],
         [0.11733949, 0.14059687, -0.10670426, -0.09373143, 0.18974298],
     ];
@@ -116,10 +116,9 @@ mod tests {
     fn test_unbiased_linear_initialize() {
         let dev: TestDevice = Default::default();
         let m = dev.build_module::<UnbiasedLinear<2000, 1>, TestDtype>();
-        let bound: TestDtype = 1.0 / 2000.0;
-        let bound = bound.sqrt();
+        let bound: TestDtype = NumCast::from((1.0 / 2000.0).sqrt()).unwrap();
         for v in m.weight.as_vec() {
-            assert!(-bound <= v && v <= bound && v != 0.0);
+            assert!(-bound <= v && v <= bound && v != TestDtype::zero());
         }
     }
 
@@ -129,9 +128,12 @@ mod tests {
 
         let model = UnbiasedLinear {
             weight: dev.tensor(W),
-        };
+        }
+        .to_dtype::<TestDtype>();
 
-        let x = dev.tensor([-0.8808001, 2.4185333, 2.2478335, 0.0565211, 2.031299]);
+        let x = dev
+            .tensor([-0.8808001, 2.4185333, 2.2478335, 0.0565211, 2.031299])
+            .to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(y, [-1.3108451, 0.37695912]);
 
@@ -151,13 +153,16 @@ mod tests {
 
         let model = UnbiasedLinear {
             weight: dev.tensor(W),
-        };
-
-        let x = dev.tensor([
-            [-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643],
-            [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705],
-            [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455],
-        ]);
+        }
+        .to_dtype::<TestDtype>();
+
+        let x = dev
+            .tensor([
+                [-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643],
+                [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705],
+                [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455],
+            ])
+            .to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(
             y,
@@ -190,13 +195,14 @@ mod tests {
 
         let model = UnbiasedLinear {
             weight: dev.tensor(W),
-        };
+        }
+        .to_dtype::<TestDtype>();
 
         #[rustfmt::skip]
         let x = dev.tensor([
             [[-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643], [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705], [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455]],
             [[1.2879219, 0.70150787, -1.6746868, 1.7261779, -0.94021803], [-2.6883178, 2.9369607, 2.9256766, 0.27559614, -0.17530347], [0.17499207, -0.11440835, 0.16627812, -0.91773695, 1.1128315]],
-        ]);
+        ]).to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(
             y,
diff --git a/src/nn/zero_grads.rs b/src/nn/zero_grads.rs
index f278b5aa1..891361562 100644
--- a/src/nn/zero_grads.rs
+++ b/src/nn/zero_grads.rs
@@ -98,10 +98,13 @@ mod tests {
         grads.get_or_alloc_mut(&tmp2).unwrap();
 
         model.zero_grads(&mut grads);
-        assert_eq!(grads.get(&model.0.weight).array(), [[0.0; 2]; 5]);
-        assert_eq!(grads.get(&model.0.bias).array(), [0.0; 5]);
-        assert_eq!(grads.get(&model.1.scale).array(), [0.0; 3]);
-        assert_eq!(grads.get(&model.1.bias).array(), [0.0; 3]);
+        assert_eq!(
+            grads.get(&model.0.weight).array(),
+            [[TestDtype::zero(); 2]; 5]
+        );
+        assert_eq!(grads.get(&model.0.bias).array(), [TestDtype::zero(); 5]);
+        assert_eq!(grads.get(&model.1.scale).array(), [TestDtype::zero(); 3]);
+        assert_eq!(grads.get(&model.1.bias).array(), [TestDtype::zero(); 3]);
         assert!(grads.get_ref_checked(&model.1.running_mean).is_none());
         assert!(grads.get_ref_checked(&model.1.running_var).is_none());
         assert!(grads.get_ref_checked(&tmp1).is_none());
diff --git a/src/optim/adam/adam.cu b/src/optim/adam/adam.cu
index 1a908d7a5..3b1dcf9e0 100644
--- a/src/optim/adam/adam.cu
+++ b/src/optim/adam/adam.cu
@@ -6,21 +6,20 @@ enum WeightDecayType {
     Decoupled
 };
 
-template<typename T>
 struct AdamConfig {
-    T lr;
-    T beta1;
-    T beta2;
-    T eps;
+    double lr;
+    double beta1;
+    double beta2;
+    double eps;
     WeightDecayType weight_decay_type;
-    T weight_decay;
+    double weight_decay;
 };
 
 template<typename T>
 __device__ void adam_update(
-    const AdamConfig<T> cfg,
+    const AdamConfig cfg,
     const size_t numel,
-    const T t,
+    const int t_int,
     T* param,
     T* moment1,
     T* moment2,
@@ -32,23 +31,31 @@ __device__ void adam_update(
         return;
     }
 
+    T beta1 = cfg.beta1;
+    T beta2 = cfg.beta2;
+    T lr = cfg.lr;
+    T weight_decay = cfg.weight_decay;
+    T eps = cfg.eps;
+
     T p = param[i];
     T g = grad[i];
     T m = moment1[i];
     T v = moment2[i];
+    T one = 1.0;
+    T t = t_int;
 
     if (cfg.weight_decay_type == L2) {
-        g += cfg.weight_decay * p;
+        g += weight_decay * p;
     }
 
-    m = m * cfg.beta1 + g * (1.0 - cfg.beta1);
-    v = v * cfg.beta2 + g * g * (1.0 - cfg.beta2);
-    T m_hat = m * 1.0 / (1.0 - powg(cfg.beta1, t));
-    T v_hat = v * 1.0 / (1.0 - powg(cfg.beta2, t));
-    g = cfg.lr * m_hat / (sqrtg(v_hat) + cfg.eps);
+    m = m * beta1 + g * (one - beta1);
+    v = v * beta2 + g * g * (one - beta2);
+    T m_hat = m * one / (one - powg(beta1, t));
+    T v_hat = v * one / (one - powg(beta2, t));
+    g = lr * m_hat / (sqrtg(v_hat) + eps);
 
     if (cfg.weight_decay_type == Decoupled) {
-        g += cfg.weight_decay * cfg.lr * p;
+        g += (weight_decay * lr) * p;
     }
 
     moment1[i] = m;
@@ -58,9 +65,9 @@ __device__ void adam_update(
 
 #define ADAM(TYPENAME, FN) \
 extern "C" __global__ void FN( \
-    const AdamConfig<TYPENAME> cfg, \
+    const AdamConfig cfg, \
     const size_t numel, \
-    const TYPENAME t, \
+    const int t, \
     TYPENAME* param, \
     TYPENAME* moment1, \
     TYPENAME* moment2, \
@@ -69,5 +76,6 @@ extern "C" __global__ void FN( \
     adam_update(cfg, numel, t, param, moment1, moment2, grad); \
 }
 
+ADAM(__half, adam_update_f16);
 ADAM(float, adam_update_f32);
 ADAM(double, adam_update_f64);
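One subtlety in the kernel above: the `AdamConfig` fields stay `double` on the host side, and each thread narrows them once into `T`-typed locals before the update math. A minimal illustration of that conversion step for `T = __half` (illustrative only, not part of the PR):

```cuda
#include "cuda_fp16.h"

template <typename T>
__device__ void narrow_once(double lr_f64, int t_int) {
    // With T = __half these assignments go through __half's converting
    // constructors from double/int, so the narrowing happens once per
    // thread instead of inside every arithmetic expression.
    T lr = lr_f64;
    T t = t_int; // bias-correction exponent, as in `T t = t_int;` above
    (void)lr;
    (void)t;
}
```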
diff --git a/src/optim/adam/cpu_kernel.rs b/src/optim/adam/cpu_kernel.rs
index 32b59bd98..42632b479 100644
--- a/src/optim/adam/cpu_kernel.rs
+++ b/src/optim/adam/cpu_kernel.rs
@@ -5,29 +5,33 @@ impl<E: num_traits::Float + Unit> AdamKernel<E> for Cpu {
     fn update(
         &self,
         t: i32,
-        cfg: &AdamConfig<E>,
+        cfg: &AdamConfig,
         param: &mut Self::Vec<E>,
         moment1: &mut Self::Vec<E>,
         moment2: &mut Self::Vec<E>,
         grad: &Self::Vec<E>,
     ) -> Result<(), Self::Err> {
+        let betas = cfg.betas.map(E::from_f64).map(Option::unwrap);
+        let eps = E::from_f64(cfg.eps).unwrap();
+        let lr = E::from_f64(cfg.lr).unwrap();
+
         for ((p, mut g), (m, v)) in param
             .iter_mut()
             .zip(grad.iter().cloned())
             .zip(moment1.iter_mut().zip(moment2.iter_mut()))
         {
             if let Some(WeightDecay::L2(wd)) = cfg.weight_decay {
-                g += wd * *p;
+                g += E::from_f64(wd).unwrap() * *p;
             }
 
-            *m = *m * cfg.betas[0] + g * (E::one() - cfg.betas[0]);
-            *v = *v * cfg.betas[1] + g.powi(2) * (E::one() - cfg.betas[1]);
-            let m_hat = *m * (E::one() - cfg.betas[0].powi(t)).recip();
-            let v_hat = *v * (E::one() - cfg.betas[1].powi(t)).recip();
-            g = cfg.lr * m_hat / (v_hat.sqrt() + cfg.eps);
+            *m = *m * betas[0] + g * (E::one() - betas[0]);
+            *v = *v * betas[1] + g.powi(2) * (E::one() - betas[1]);
+            let m_hat = *m * (E::one() - betas[0].powi(t)).recip();
+            let v_hat = *v * (E::one() - betas[1].powi(t)).recip();
+            g = lr * m_hat / (v_hat.sqrt() + eps);
 
             if let Some(WeightDecay::Decoupled(wd)) = cfg.weight_decay {
-                g += wd * cfg.lr * *p;
+                g += E::from_f64(wd * cfg.lr).unwrap() * *p;
             }
 
             *p -= g;
diff --git a/src/optim/adam/cuda_kernel.rs b/src/optim/adam/cuda_kernel.rs
index ef1c9b132..ac06c2e13 100644
--- a/src/optim/adam/cuda_kernel.rs
+++ b/src/optim/adam/cuda_kernel.rs
@@ -7,18 +7,18 @@ use crate::{
 use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync};
 
 #[repr(C)]
-struct CudaAdamConfig<E> {
-    lr: E,
-    beta1: E,
-    beta2: E,
-    eps: E,
+struct CudaAdamConfig {
+    lr: f64,
+    beta1: f64,
+    beta2: f64,
+    eps: f64,
     weight_decay_type: WeightDecayType,
-    weight_decay: E,
+    weight_decay: f64,
 }
 
-unsafe impl<E: DeviceRepr> DeviceRepr for CudaAdamConfig<E> {}
+unsafe impl DeviceRepr for CudaAdamConfig {}
 
-fn adam_config_to_cuda<E: Copy>(config: &super::AdamConfig<E>) -> CudaAdamConfig<E> {
+fn adam_config_to_cuda(config: &super::AdamConfig) -> CudaAdamConfig {
     let (weight_decay_type, weight_decay) = weight_decay_to_cuda(config.weight_decay);
 
     CudaAdamConfig {
@@ -38,6 +38,12 @@ trait HasCudaKernel<E> {
     const FWD: &'static str;
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const MOD: &'static str = "adam_f16";
+    const FWD: &'static str = "adam_update_f16";
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const MOD: &'static str = "adam_f32";
     const FWD: &'static str = "adam_update_f32";
@@ -55,7 +61,7 @@ where
     fn update(
         &self,
         t: i32,
-        cfg: &super::AdamConfig<E>,
+        cfg: &super::AdamConfig,
         param: &mut Self::Vec<E>,
         moment1: &mut Self::Vec<E>,
         moment2: &mut Self::Vec<E>,
@@ -69,7 +75,6 @@ where
         let numel = param.len();
         let func = self.dev.get_func(Self::MOD, Self::FWD).unwrap();
         let cfg = launch_cfg::<128>(numel as u32);
-        let t = <E as num_traits::FromPrimitive>::from_i32(t).unwrap();
         let params = (opt_cfg, numel, t, param, moment1, moment2, grad);
         unsafe { func.launch(cfg, params) }?;
         Ok(())
diff --git a/src/optim/adam/mod.rs b/src/optim/adam/mod.rs
index b8f80f3a7..2f0ed5d32 100644
--- a/src/optim/adam/mod.rs
+++ b/src/optim/adam/mod.rs
@@ -27,26 +27,26 @@ use super::{Optimizer, OptimizerUpdateError, UnusedTensors, WeightDecay};
 /// };
 /// ```
 #[derive(Debug, Clone, Copy)]
-pub struct AdamConfig<E> {
+pub struct AdamConfig {
     /// Learning rate. Defaults to `1e-3`.
-    pub lr: E,
+    pub lr: f64,
 
     /// Betas from Adam paper. Defaults to `[0.9, 0.999]`.
-    pub betas: [E; 2],
+    pub betas: [f64; 2],
 
     /// Epsilon for numerical stability. Defaults to `1e-8`.
-    pub eps: E,
+    pub eps: f64,
 
     /// Optional weight decay. Defaults to `None`.
-    pub weight_decay: Option<WeightDecay<E>>,
+    pub weight_decay: Option<WeightDecay>,
 }
 
-impl<E: Dtype> Default for AdamConfig<E> {
+impl Default for AdamConfig {
     fn default() -> Self {
         Self {
-            lr: E::from_f32(1e-3).unwrap(),
-            betas: [E::from_f32(0.9).unwrap(), E::from_f32(0.999).unwrap()],
-            eps: E::from_f32(1e-8).unwrap(),
+            lr: 1e-3,
+            betas: [0.9, 0.999],
+            eps: 1e-8,
             weight_decay: None,
         }
     }
@@ -73,7 +73,7 @@ impl Default for AdamConfig {
 #[derive(Debug)]
 pub struct Adam<M, E: Dtype, D: DeviceStorage> {
     /// Hyperparameter configuration
-    pub cfg: AdamConfig<E>,
+    pub cfg: AdamConfig,
 
     t: i32,
     moment1: Gradients<E, D>,
@@ -84,7 +84,7 @@ pub struct Adam<M, E: Dtype, D: DeviceStorage> {
 
 impl<M, E: Dtype, D: DeviceStorage> Adam<M, E, D> {
     /// Constructs using hyperparameters from `cfg`.
-    pub fn new(_model: &M, cfg: AdamConfig<E>) -> Self {
+    pub fn new(_model: &M, cfg: AdamConfig) -> Self {
         Self {
             cfg,
             t: 0,
@@ -99,7 +99,7 @@ pub trait AdamKernel<E: Dtype>: DeviceStorage {
     fn update(
         &self,
         t: i32,
-        cfg: &AdamConfig<E>,
+        cfg: &AdamConfig,
         param: &mut Self::Vec<E>,
         moment1: &mut Self::Vec<E>,
         moment2: &mut Self::Vec<E>,
@@ -173,7 +173,9 @@ mod tests {
         let dev: TestDevice = Default::default();
         let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.ones();
         let mut opt = Adam::new(&t, Default::default());
-        let rate = dev.tensor([1e-6, 1e-5, 1e-4, 1e-3, 1e-2]);
+        let rate = dev
+            .tensor([1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.99999994, 0.999996, 0.9997143, 0.9990244, 0.99900025],
             [0.9999999, 0.999992, 0.99942863, 0.99804884, 0.9980005],
@@ -207,7 +209,9 @@ mod tests {
                 weight_decay: None,
             },
         );
-        let rate = dev.tensor([1e-4, 1e-3, 1e-2, 1e-1, 1e-0]);
+        let rate = dev
+            .tensor([1e-4, 1e-3, 1e-2, 1e-1, 1e-0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9997143, 0.9990244, 0.99900025, 0.999, 0.999],
             [0.99942863, 0.99804866, 0.9980004, 0.9979999, 0.9979999],
@@ -231,7 +235,9 @@ mod tests {
     #[test]
     fn test_adam_l2_decay() {
         let dev: TestDevice = Default::default();
-        let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.tensor([-0.5, -0.25, 0.1, 0.6, 1.0]);
+        let mut t = dev
+            .tensor([-0.5, -0.25, 0.1, 0.6, 1.0])
+            .to_dtype::<TestDtype>();
         let mut opt = Adam::new(
             &t,
             AdamConfig {
@@ -264,7 +270,9 @@ mod tests {
     #[test]
     fn test_adam_decoupled_decay() {
         let dev: TestDevice = Default::default();
-        let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.tensor([-0.5, -0.25, 0.1, 0.6, 1.0]);
+        let mut t = dev
+            .tensor([-0.5, -0.25, 0.1, 0.6, 1.0])
+            .to_dtype::<TestDtype>();
         let mut opt = Adam::new(
             &t,
             AdamConfig {
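For reference, the de-generified config is constructed just like the doc example referenced at the top of this file, except the hyperparameters are now plain `f64` regardless of the model dtype. A usage sketch (model construction elided):

```rust
let mut opt = Adam::new(
    &model,
    AdamConfig {
        lr: 1e-2,
        betas: [0.5, 0.25],
        eps: 1e-6,
        weight_decay: Some(WeightDecay::Decoupled(1e-2)),
    },
);
```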
diff --git a/src/optim/optimizer.rs b/src/optim/optimizer.rs
index 5e41a7336..e37b630b1 100644
--- a/src/optim/optimizer.rs
+++ b/src/optim/optimizer.rs
@@ -4,14 +4,14 @@ use crate::{
 };
 
 /// L2 and decoupled regularization methods
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum WeightDecay<E> {
+#[derive(Debug, Clone, Copy)]
+pub enum WeightDecay {
     /// Weight decay applied to the gradients before any momentum updates. Equivalent to L2 regularization.
-    L2(E),
+    L2(f64),
 
     /// Weight decay applied after any momentum updates, without modifying the gradients.
    /// See [Decoupled Weight Decay Regularization](https://arxiv.org/abs/1711.05101)
-    Decoupled(E),
+    Decoupled(f64),
 }
 
 /// Used to communicate the "WeightDecay" enum to cuda kernels
@@ -25,7 +25,7 @@ pub(super) enum WeightDecayType {
 }
 
 #[cfg(feature = "cuda")]
-pub(super) fn weight_decay_to_cuda<E: Default>(wd: Option<WeightDecay<E>>) -> (WeightDecayType, E) {
+pub(super) fn weight_decay_to_cuda(wd: Option<WeightDecay>) -> (WeightDecayType, f64) {
     match wd {
         None => (WeightDecayType::None, Default::default()),
         Some(WeightDecay::L2(x)) => (WeightDecayType::L2, x),
@@ -34,13 +34,13 @@ pub(super) fn weight_decay_to_cuda(wd: Option<WeightDecay>) -> (WeightDecayType
 }
 
 /// Momentum used for [super::Sgd] and others
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum Momentum<E> {
+#[derive(Debug, Clone, Copy)]
+pub enum Momentum {
     /// Momentum that is applied to the velocity of a parameter directly.
-    Classic(E),
+    Classic(f64),
 
     /// Momentum that is applied to both velocity and gradients. See [super::Sgd] nesterov paper for more.
-    Nesterov(E),
+    Nesterov(f64),
 }
 
 /// Used to communicate the "Momentum" enum to cuda kernels
@@ -54,7 +54,7 @@ pub(super) enum MomentumType {
 }
 
 #[cfg(feature = "cuda")]
-pub(super) fn momentum_to_cuda<E: Default>(wd: Option<Momentum<E>>) -> (MomentumType, E) {
+pub(super) fn momentum_to_cuda(wd: Option<Momentum>) -> (MomentumType, f64) {
     match wd {
         None => (MomentumType::None, Default::default()),
         Some(Momentum::Classic(x)) => (MomentumType::Classic, x),
diff --git a/src/optim/rmsprop/cpu_kernel.rs b/src/optim/rmsprop/cpu_kernel.rs
index 82bff2ded..4ebf61c21 100644
--- a/src/optim/rmsprop/cpu_kernel.rs
+++ b/src/optim/rmsprop/cpu_kernel.rs
@@ -5,47 +5,50 @@ use super::{RMSpropConfig, RMSpropKernel};
 impl<E: num_traits::Float + Unit> RMSpropKernel<E> for Cpu {
     fn update(
         &self,
-        cfg: &RMSpropConfig<E>,
+        cfg: &RMSpropConfig,
         param: &mut Self::Vec<E>,
         momentum: &mut Self::Vec<E>,
         square_avg: &mut Self::Vec<E>,
         grad_avg: &mut Self::Vec<E>,
         grad: &Self::Vec<E>,
     ) -> Result<(), Self::Err> {
+        let alpha = E::from_f64(cfg.alpha).unwrap();
+        let eps = E::from_f64(cfg.eps).unwrap();
+        let lr = E::from_f64(cfg.lr).unwrap();
         for ((p, mut g), (s_avg, (g_avg, m))) in param.iter_mut().zip(grad.iter().cloned()).zip(
             square_avg
                 .iter_mut()
                 .zip(grad_avg.iter_mut().zip(momentum.iter_mut())),
         ) {
             if let Some(WeightDecay::L2(wd)) = cfg.weight_decay {
-                g += wd * *p;
+                g += E::from_f64(wd).unwrap() * *p;
             }
 
             // sa = a * sa + (1 - a) * g^2
-            *s_avg += (E::one() - cfg.alpha) * (g * g - *s_avg);
+            *s_avg += (E::one() - alpha) * (g * g - *s_avg);
 
             let avg = if cfg.centered {
                 // ga = a * ga + (1 - a) * g
-                *g_avg += (E::one() - cfg.alpha) * (g - *g_avg);
-                // NOTE: cfg.eps in sqrt
-                (*s_avg - g_avg.powi(2) + cfg.eps).sqrt()
+                *g_avg += (E::one() - alpha) * (g - *g_avg);
+                // NOTE: eps in sqrt
+                (*s_avg - g_avg.powi(2) + eps).sqrt()
             } else {
-                // NOTE: cfg.eps in sqrt
-                (*s_avg + cfg.eps).sqrt()
+                // NOTE: eps in sqrt
+                (*s_avg + eps).sqrt()
             };
 
             g /= avg;
 
             match cfg.momentum {
                 Some(u) => {
-                    *m = *m * u + g;
-                    g = *m * cfg.lr;
+                    *m = *m * E::from_f64(u).unwrap() + g;
+                    g = *m * lr;
                 }
-                None => g *= cfg.lr,
+                None => g *= lr,
             }
 
             if let Some(WeightDecay::Decoupled(wd)) = cfg.weight_decay {
-                g += wd * cfg.lr * *p;
+                g += E::from_f64(wd * cfg.lr).unwrap() * *p;
             }
 
             *p -= g;
diff --git a/src/optim/rmsprop/cuda_kernel.rs b/src/optim/rmsprop/cuda_kernel.rs
index a87430d24..c5a577dae 100644
--- a/src/optim/rmsprop/cuda_kernel.rs
+++ b/src/optim/rmsprop/cuda_kernel.rs
@@ -8,20 +8,20 @@ use crate::{
 use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync};
 
 #[repr(C)]
-struct CudaRMSpropConfig<E> {
-    lr: E,
-    alpha: E,
-    eps: E,
+struct CudaRMSpropConfig {
+    lr: f64,
+    alpha: f64,
+    eps: f64,
     centered: bool,
     has_momentum: bool,
-    momentum: E,
+    momentum: f64,
     weight_decay_type: WeightDecayType,
-    weight_decay: E,
+    weight_decay: f64,
 }
 
-unsafe impl<E: DeviceRepr> DeviceRepr for CudaRMSpropConfig<E> {}
+unsafe impl DeviceRepr for CudaRMSpropConfig {}
 
-fn rmsprop_config_to_cuda<E: Default + Copy>(config: &RMSpropConfig<E>) -> CudaRMSpropConfig<E> {
+fn rmsprop_config_to_cuda(config: &RMSpropConfig) -> CudaRMSpropConfig {
     let (weight_decay_type, weight_decay) = weight_decay_to_cuda(config.weight_decay);
     let (has_momentum, momentum) = if let Some(m) = config.momentum {
         (true, m)
@@ -48,6 +48,12 @@ trait HasCudaKernel<E> {
     const FWD: &'static str;
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const MOD: &'static str = "rmsprop_f16";
+    const FWD: &'static str = "rmsprop_update_f16";
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const MOD: &'static str = "rmsprop_f32";
     const FWD: &'static str = "rmsprop_update_f32";
@@ -64,7 +70,7 @@ where
 {
     fn update(
         &self,
-        cfg: &RMSpropConfig<E>,
+        cfg: &RMSpropConfig,
         param: &mut Self::Vec<E>,
         momentum: &mut Self::Vec<E>,
         square_avg: &mut Self::Vec<E>,
diff --git a/src/optim/rmsprop/mod.rs b/src/optim/rmsprop/mod.rs
index df435c7d9..3c170cd3f 100644
--- a/src/optim/rmsprop/mod.rs
+++ b/src/optim/rmsprop/mod.rs
@@ -16,33 +16,33 @@ use super::{Optimizer, OptimizerUpdateError, UnusedTensors, WeightDecay};
 
 /// Configuration of hyperparameters for [RMSprop].
 #[derive(Debug, Clone, Copy)]
-pub struct RMSpropConfig<E> {
+pub struct RMSpropConfig {
     /// Learning rate. Defaults to `1e-2`.
-    pub lr: E,
+    pub lr: f64,
 
     /// Value for exponential moving average. Defaults to `0.9`.
-    pub alpha: E,
+    pub alpha: f64,
 
     /// Epsilon for stability. Defaults to `1e-8`.
-    pub eps: E,
+    pub eps: f64,
 
     /// Optional momentum. Defaults to `None`.
-    pub momentum: Option<E>,
+    pub momentum: Option<f64>,
 
     /// Whether the avg should be centered by the grad's avg value.
     /// Defaults to `false`.
     pub centered: bool,
 
     /// Optional weight decay. Defaults to `None`.
-    pub weight_decay: Option<WeightDecay<E>>,
+    pub weight_decay: Option<WeightDecay>,
 }
 
-impl<E: Dtype> Default for RMSpropConfig<E> {
+impl Default for RMSpropConfig {
     fn default() -> Self {
         Self {
-            lr: E::from_f32(1e-2).unwrap(),
-            alpha: E::from_f32(0.9).unwrap(),
-            eps: E::from_f32(1e-8).unwrap(),
+            lr: 1e-2,
+            alpha: 0.9,
+            eps: 1e-8,
             momentum: None,
             centered: false,
             weight_decay: None,
@@ -80,7 +80,7 @@ impl Default for RMSpropConfig {
 #[derive(Debug)]
 pub struct RMSprop<M, E: Dtype, D: DeviceStorage> {
     /// Hyperparameter configuration
-    pub cfg: RMSpropConfig<E>,
+    pub cfg: RMSpropConfig,
 
     step: usize,
     momentums: Gradients<E, D>,
@@ -92,7 +92,7 @@ pub struct RMSprop<M, E: Dtype, D: DeviceStorage> {
 
 impl<M, E: Dtype, D: DeviceStorage> RMSprop<M, E, D> {
     /// Constructs using hyperparameters from `cfg`.
-    pub fn new(_model: &M, cfg: RMSpropConfig<E>) -> Self {
+    pub fn new(_model: &M, cfg: RMSpropConfig) -> Self {
         Self {
             cfg,
             step: 0,
@@ -107,7 +107,7 @@ impl<M, E: Dtype, D: DeviceStorage> RMSprop<M, E, D> {
 pub trait RMSpropKernel<E: Dtype>: DeviceStorage {
     fn update(
         &self,
-        cfg: &RMSpropConfig<E>,
+        cfg: &RMSpropConfig,
         param: &mut Self::Vec<E>,
         momentum: &mut Self::Vec<E>,
         square_avg: &mut Self::Vec<E>,
@@ -186,9 +186,11 @@ mod tests {
     use super::*;
     use crate::{shapes::*, tensor_ops::*, tests::*};
 
-    fn test_matches_expected(cfg: RMSpropConfig<TestDtype>, expected: [[f64; 5]; 5]) {
+    fn test_matches_expected(cfg: RMSpropConfig, expected: [[f64; 5]; 5]) {
         let dev: TestDevice = Default::default();
-        let rate: Tensor<_, TestDtype, _> = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.ones();
         let mut opt = RMSprop::new(&t, cfg);
         for e in expected.iter() {
diff --git a/src/optim/rmsprop/rmsprop.cu b/src/optim/rmsprop/rmsprop.cu
index 836121874..0beb5b4bb 100644
--- a/src/optim/rmsprop/rmsprop.cu
+++ b/src/optim/rmsprop/rmsprop.cu
@@ -6,21 +6,20 @@ enum WeightDecayType {
     Decoupled
 };
 
-template<typename T>
 struct RMSpropConfig {
-    T lr;
-    T alpha;
-    T eps;
+    double lr;
+    double alpha;
+    double eps;
     bool centered;
     bool has_momentum;
-    T momentum;
+    double momentum;
     WeightDecayType weight_decay_type;
-    T weight_decay;
+    double weight_decay;
 };
 
 template<typename T>
 __device__ void rmsprop_update(
-    const RMSpropConfig<T> cfg,
+    const RMSpropConfig cfg,
     const size_t numel,
     T* param,
     T* momentum,
@@ -34,39 +33,46 @@ __device__ void rmsprop_update(
         return;
     }
 
+    T lr = cfg.lr;
+    T alpha = cfg.alpha;
+    T eps = cfg.eps;
+    T momentum_ = cfg.momentum;
+    T weight_decay = cfg.weight_decay;
+
     T p = param[i];
     T g = grad[i];
     T s_avg = square_avg[i];
    T g_avg = grad_avg[i];
     T m = momentum[i];
+    T one = 1.0;
 
     if (cfg.weight_decay_type == L2) {
-        g += cfg.weight_decay * p;
+        g += weight_decay * p;
     }
 
-    s_avg += (1.0 - cfg.alpha) * (g * g - s_avg);
+    s_avg += (one - alpha) * (g * g - s_avg);
 
     T avg;
 
     if (cfg.centered) {
         // ga = a * ga + (1 - a) * g
-        g_avg += (1.0 - cfg.alpha) * (g - g_avg);
-        avg = sqrtg(s_avg - g_avg * g_avg + cfg.eps);
+        g_avg += (one - alpha) * (g - g_avg);
+        avg = sqrtg(s_avg - g_avg * g_avg + eps);
     } else {
-        avg = sqrtg(s_avg + cfg.eps);
+        avg = sqrtg(s_avg + eps);
     };
 
     g /= avg;
 
     if (cfg.has_momentum) {
-        m = m * cfg.momentum + g;
-        g = m * cfg.lr;
+        m = m * momentum_ + g;
+        g = m * lr;
     } else {
-        g *= cfg.lr;
+        g *= lr;
     }
 
     if (cfg.weight_decay_type == Decoupled) {
-        g += cfg.weight_decay * cfg.lr * p;
+        g += weight_decay * lr * p;
     }
 
     square_avg[i] = s_avg;
@@ -77,7 +83,7 @@ __device__ void rmsprop_update(
 
 #define RMSPROP(TYPENAME, FN) \
 extern "C" __global__ void FN( \
-    const RMSpropConfig<TYPENAME> cfg, \
+    const RMSpropConfig cfg, \
     const size_t numel, \
     TYPENAME* param, \
     TYPENAME* momentum, \
@@ -88,5 +94,6 @@ extern "C" __global__ void FN( \
     rmsprop_update(cfg, numel, param, momentum, square_avg, grad_avg, grad); \
 }
 
+RMSPROP(__half, rmsprop_update_f16);
 RMSPROP(float, rmsprop_update_f32);
 RMSPROP(double, rmsprop_update_f64);
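Same shape as the Adam change: `RMSpropConfig` is plain `f64` data now. A hedged construction sketch (field values arbitrary, model construction elided):

```rust
let mut opt = RMSprop::new(
    &model,
    RMSpropConfig {
        lr: 1e-2,
        alpha: 0.9,
        eps: 1e-8,
        momentum: Some(0.9),
        centered: false,
        weight_decay: Some(WeightDecay::L2(1e-4)),
    },
);
```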
diff --git a/src/optim/sgd/cpu_kernel.rs b/src/optim/sgd/cpu_kernel.rs
index cb1584f0c..a633211a1 100644
--- a/src/optim/sgd/cpu_kernel.rs
+++ b/src/optim/sgd/cpu_kernel.rs
@@ -9,34 +9,39 @@ use super::{SgdConfig, SgdKernel};
 impl<E: num_traits::Float + Unit> SgdKernel<E> for Cpu {
     fn update(
         &self,
-        cfg: &SgdConfig<E>,
+        cfg: &SgdConfig,
         param: &mut Self::Vec<E>,
         velocity: &mut Self::Vec<E>,
         grad: &Self::Vec<E>,
     ) -> Result<(), Self::Err> {
+        let lr = E::from_f64(cfg.lr).unwrap();
+
         for ((p, mut g), v) in param
             .iter_mut()
             .zip(grad.iter().cloned())
             .zip(velocity.iter_mut())
         {
             if let Some(WeightDecay::L2(wd)) = cfg.weight_decay {
+                let wd = E::from_f64(wd).unwrap();
                 g += wd * *p;
             }
 
             match cfg.momentum {
                 Some(Momentum::Classic(u)) => {
+                    let u = E::from_f64(u).unwrap();
                     *v = g + u * *v;
-                    g = *v * cfg.lr;
+                    g = *v * lr;
                 }
                 Some(Momentum::Nesterov(u)) => {
+                    let u = E::from_f64(u).unwrap();
                     *v = g + u * *v;
-                    g = (g + u * *v) * cfg.lr;
+                    g = (g + u * *v) * lr;
                 }
-                None => g *= cfg.lr,
+                None => g *= lr,
             }
 
             if let Some(WeightDecay::Decoupled(wd)) = cfg.weight_decay {
-                g += wd * cfg.lr * *p;
+                g += E::from_f64(wd * cfg.lr).unwrap() * *p;
             }
 
             *p -= g;
diff --git a/src/optim/sgd/cuda_kernel.rs b/src/optim/sgd/cuda_kernel.rs
index 0e3d37097..0c0b3e12a 100644
--- a/src/optim/sgd/cuda_kernel.rs
+++ b/src/optim/sgd/cuda_kernel.rs
@@ -8,17 +8,17 @@ use crate::{
 use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync};
 
 #[repr(C)]
-struct CudaSgdConfig<E> {
-    lr: E,
+struct CudaSgdConfig {
+    lr: f64,
     momentum_type: MomentumType,
-    momentum: E,
+    momentum: f64,
     weight_decay_type: WeightDecayType,
-    weight_decay: E,
+    weight_decay: f64,
 }
 
-unsafe impl<E: DeviceRepr> DeviceRepr for CudaSgdConfig<E> {}
+unsafe impl DeviceRepr for CudaSgdConfig {}
 
-fn sgd_config_to_cuda<E: Default + Copy>(config: &SgdConfig<E>) -> CudaSgdConfig<E> {
+fn sgd_config_to_cuda(config: &SgdConfig) -> CudaSgdConfig {
     let (momentum_type, momentum) = momentum_to_cuda(config.momentum);
     let (weight_decay_type, weight_decay) = weight_decay_to_cuda(config.weight_decay);
 
@@ -38,6 +38,12 @@ trait HasCudaKernel<E> {
     const FWD: &'static str;
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const MOD: &'static str = "sgd_f16";
+    const FWD: &'static str = "sgd_update_f16";
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const MOD: &'static str = "sgd_f32";
     const FWD: &'static str = "sgd_update_f32";
@@ -54,7 +60,7 @@ where
 {
     fn update(
         &self,
-        cfg: &SgdConfig<E>,
+        cfg: &SgdConfig,
         param: &mut Self::Vec<E>,
         velocity: &mut Self::Vec<E>,
         grad: &Self::Vec<E>,
diff --git a/src/optim/sgd/mod.rs b/src/optim/sgd/mod.rs
index be945d4c5..248548d5a 100644
--- a/src/optim/sgd/mod.rs
+++ b/src/optim/sgd/mod.rs
@@ -66,21 +66,21 @@ use super::optimizer::*;
 /// };
 /// ```
 #[derive(Debug, Clone, Copy)]
-pub struct SgdConfig<E> {
+pub struct SgdConfig {
     /// Learning rate. Defaults to `1e-2`
-    pub lr: E,
+    pub lr: f64,
 
     /// Optional momentum. Defaults to `None`.
-    pub momentum: Option<Momentum<E>>,
+    pub momentum: Option<Momentum>,
 
     /// Optional weight decay. Defaults to `None`.
-    pub weight_decay: Option<WeightDecay<E>>,
+    pub weight_decay: Option<WeightDecay>,
 }
 
-impl<E: Dtype> Default for SgdConfig<E> {
+impl Default for SgdConfig {
     fn default() -> Self {
         Self {
-            lr: E::from_f32(1e-2).unwrap(),
+            lr: 1e-2,
             momentum: None,
             weight_decay: None,
         }
@@ -114,7 +114,7 @@ impl Default for SgdConfig {
 #[derive(Debug)]
 pub struct Sgd<M, E: Dtype, D: DeviceStorage> {
     /// Hyperparameter configuration
-    pub cfg: SgdConfig<E>,
+    pub cfg: SgdConfig,
 
     velocity: Gradients<E, D>,
 
@@ -123,7 +123,7 @@ pub struct Sgd<M, E: Dtype, D: DeviceStorage> {
 
 impl<M, E: Dtype, D: DeviceStorage> Sgd<M, E, D> {
     /// Constructs using hyperparameters from `cfg`
-    pub fn new(_model: &M, cfg: SgdConfig<E>) -> Self {
+    pub fn new(_model: &M, cfg: SgdConfig) -> Self {
         Self {
             cfg,
             velocity: Gradients::leaky(),
@@ -135,7 +135,7 @@ impl<M, E: Dtype, D: DeviceStorage> Sgd<M, E, D> {
 pub trait SgdKernel<E: Dtype>: DeviceStorage {
     fn update(
         &self,
-        cfg: &SgdConfig<E>,
+        cfg: &SgdConfig,
         param: &mut Self::Vec<E>,
         velocity: &mut Self::Vec<E>,
         grad: &Self::Vec<E>,
@@ -228,7 +228,9 @@ mod tests {
         let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.ones();
         let mut sgd = Sgd::new(&t, Default::default());
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9998, 0.998, 0.996, 0.98, 0.8],
             [0.99960005, 0.99600005, 0.992, 0.96000004, 0.6],
@@ -258,7 +260,9 @@ mod tests {
             },
         );
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9998, 0.998, 0.996, 0.98, 0.8],
             [0.99950004, 0.995, 0.99, 0.95000005, 0.5],
@@ -288,7 +292,9 @@ mod tests {
             },
         );
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9997, 0.997, 0.994, 0.97, 0.70000005],
             [0.99935, 0.9935, 0.987, 0.935, 0.35000005],
@@ -327,7 +333,9 @@ mod tests {
             },
         );
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9988, 0.997, 0.995, 0.979, 0.799],
             [0.99760115, 0.994003, 0.990005, 0.958021, 0.59820104],
@@ -362,7 +370,9 @@ mod tests {
             },
         );
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9988, 0.997, 0.995, 0.979, 0.799],
             [0.9975012, 0.993003, 0.988005, 0.948021, 0.498201],
@@ -382,14 +392,14 @@ mod tests {
         let dev: TestDevice = Default::default();
 
         // adding l2_weight_decay should be equivalent to adding an L2 term to the loss
-        let weight_decay = 1e-1;
+
         let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.ones();
         let mut sgd_l2 = Sgd::new(
             &t,
             SgdConfig {
                 lr: 1e-2,
                 momentum: Some(Momentum::Classic(0.5)),
-                weight_decay: Some(WeightDecay::L2(weight_decay)),
+                weight_decay: Some(WeightDecay::L2(1e-1)),
             },
         );
         let mut sgd = Sgd::new(
@@ -401,7 +411,9 @@ mod tests {
             },
         );
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9988, 0.997, 0.995, 0.979, 0.799],
             [0.9970012, 0.992503, 0.987505, 0.947521, 0.49770102],
@@ -419,7 +431,8 @@ mod tests {
         t = dev.ones();
         for e in expected.iter() {
             let normal_loss = (t.leaky_trace() * rate.clone()).mean();
-            let l2_loss = t.leaky_trace().powi(2).sum() * (weight_decay / (2.0));
+            let scale: TestDtype = NumCast::from(1e-1 / 2.0).unwrap();
+            let l2_loss = t.leaky_trace().powi(2).sum() * scale;
             let loss = l2_loss + normal_loss;
 
             let gradients = loss.backward();
diff --git a/src/optim/sgd/sgd.cu b/src/optim/sgd/sgd.cu
index ce33c00b9..226930011 100644
--- a/src/optim/sgd/sgd.cu
+++ b/src/optim/sgd/sgd.cu
@@ -1,3 +1,5 @@
+#include "cuda_fp16.h"
+
 enum MomentumType {
     None,
     Classic,
@@ -10,18 +12,17 @@ enum WeightDecayType {
     Decoupled
 };
 
-template<typename T>
 struct SgdConfig {
-    T lr;
+    double lr;
     MomentumType momentum_type;
-    T momentum;
+    double momentum;
     WeightDecayType weight_decay_type;
-    T weight_decay;
+    double weight_decay;
 };
 
 template<typename T>
 __device__ void sgd_update(
-    const SgdConfig<T> cfg,
+    const SgdConfig cfg,
     const size_t numel,
     T* param,
     T* velocity,
@@ -33,26 +34,30 @@ __device__ void sgd_update(
         return;
     }
 
+    T weight_decay = cfg.weight_decay;
+    T lr = cfg.lr;
+    T momentum = cfg.momentum;
+
     T p = param[i];
     T g = grad[i];
     T v = velocity[i];
 
     if (cfg.weight_decay_type == L2) {
-        g += cfg.weight_decay * p;
+        g += weight_decay * p;
     }
 
     if (cfg.momentum_type == Classic) {
-        v = g + cfg.momentum * v;
-        g = v * cfg.lr;
+        v = g + momentum * v;
+        g = v * lr;
     } else if (cfg.momentum_type == Nesterov) {
-        v = g + cfg.momentum * v;
-        g = (g + cfg.momentum * v) * cfg.lr;
+        v = g + momentum * v;
+        g = (g + momentum * v) * lr;
     } else {
-        g *= cfg.lr;
+        g *= lr;
     }
 
     if (cfg.weight_decay_type == Decoupled) {
-        g += cfg.weight_decay * cfg.lr * p;
+        g += weight_decay * lr * p;
     }
 
     velocity[i] = v;
@@ -61,7 +66,7 @@ __device__ void sgd_update(
 
 #define SGD(TYPENAME, FN) \
 extern "C" __global__ void FN( \
-    const SgdConfig<TYPENAME> cfg, \
+    const SgdConfig cfg, \
     const size_t numel, \
     TYPENAME* param, \
     TYPENAME* velocity, \
@@ -70,5 +75,6 @@ extern "C" __global__ void FN( \
     sgd_update(cfg, numel, param, velocity, grad); \
 }
 
+SGD(__half, sgd_update_f16);
SGD(float, sgd_update_f32);
SGD(double, sgd_update_f64);
diff --git a/src/shapes/shape.rs b/src/shapes/shape.rs
index df77aeeca..2adaf92b2 100644
--- a/src/shapes/shape.rs
+++ b/src/shapes/shape.rs
@@ -47,6 +47,8 @@ unit!(i64, 1);
 unit!(u128, 1);
 unit!(i128, 1);
 unit!(bool, true);
+#[cfg(feature = "f16")]
+unit!(half::f16, half::f16::ONE);
 
 /// Represents something that has a [Unit].
 pub trait HasUnitType {
@@ -85,6 +87,8 @@ impl Dtype for u32 {}
 impl Dtype for u64 {}
 impl Dtype for u128 {}
 impl Dtype for usize {}
+#[cfg(feature = "f16")]
+impl Dtype for half::f16 {}
 
 /// Represents something that has a [Dtype].
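With the `unit!` and `Dtype` impls above, `half::f16` flows through the normal tensor constructors once the `f16` feature is enabled. A small sketch (assumes the dfdx prelude; `dev.tensor` with an `f16` array is the same `TensorFrom` API used for `f32`):

```rust
use half::f16;

let dev: Cpu = Default::default();
// f16 values come from the half crate's consts/conversions.
let t = dev.tensor([f16::ONE, f16::ZERO, f16::from_f32(0.5)]);
assert_eq!(t.array(), [f16::ONE, f16::ZERO, f16::from_f32(0.5)]);
```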
pub trait HasDtype { diff --git a/src/tensor/mod.rs b/src/tensor/mod.rs index e9eb2f9f3..3ff661729 100644 --- a/src/tensor/mod.rs +++ b/src/tensor/mod.rs @@ -184,7 +184,7 @@ pub use gradients::{Gradients, Merge, NoneTape, OwnedTape, Tape}; mod tests { use super::*; use crate::shapes::*; - use crate::tests::{TestDevice, TestDtype}; + use crate::tests::*; use std::collections::HashSet; #[test] @@ -283,80 +283,64 @@ mod tests { #[test] fn test_upper_tri() { let dev: TestDevice = Default::default(); - let vl: TestDtype = 42.0; + let a: TestDtype = NumCast::from(42.0).unwrap(); + let z = TestDtype::zero(); - assert_eq!(dev.upper_tri::(vl, None).array(), vl); - assert_eq!(dev.upper_tri::(vl, 1).array(), 0.); + assert_eq!(dev.upper_tri::(a, None).array(), a); + assert_eq!(dev.upper_tri::(a, 1).array(), z); + assert_eq!(dev.upper_tri::>(a, None).array(), [a, a, a]); + assert_eq!(dev.upper_tri::>(a, 1).array(), [z, a, a]); - assert_eq!(dev.upper_tri::>(vl, None).array(), [vl, vl, vl]); - assert_eq!(dev.upper_tri::>(vl, 1).array(), [0., vl, vl]); - - assert_eq!( - dev.upper_tri::>(vl, None).array(), - [[vl, vl, vl, vl], [0., vl, vl, vl], [0., 0., vl, vl]] - ); - assert_eq!( - dev.upper_tri::>(vl, None).array(), - [[vl], [0.], [0.]] - ); assert_eq!( - dev.upper_tri::>(vl, 1).array(), - [[0.], [0.], [0.]] + dev.upper_tri::>(a, None).array(), + [[a, a, a, a], [z, a, a, a], [z, z, a, a]] ); assert_eq!( - dev.upper_tri::>(vl, -1).array(), - [[vl], [vl], [0.]] + dev.upper_tri::>(a, None).array(), + [[a], [z], [z]] ); + assert_eq!(dev.upper_tri::>(a, 1).array(), [[z], [z], [z]]); + assert_eq!(dev.upper_tri::>(a, -1).array(), [[a], [a], [z]]); assert_eq!( - dev.upper_tri::>(vl, -1).array(), - [ - [vl, vl, vl, vl], - [vl, vl, vl, vl], - [0., vl, vl, vl], - [0., 0., vl, vl] - ] + dev.upper_tri::>(a, -1).array(), + [[a, a, a, a], [a, a, a, a], [z, a, a, a], [z, z, a, a]] ); assert_eq!( - dev.upper_tri::>(vl, -2).array(), - [ - [vl, vl, vl, vl], - [vl, vl, vl, vl], - [vl, vl, vl, vl], - [0., vl, vl, vl] - ] + dev.upper_tri::>(a, -2).array(), + [[a, a, a, a], [a, a, a, a], [a, a, a, a], [z, a, a, a]] ); assert_eq!( - dev.upper_tri::>(vl, 1).array(), - [[0., vl, vl], [0., 0., vl], [0., 0., 0.], [0., 0., 0.]] + dev.upper_tri::>(a, 1).array(), + [[z, a, a], [z, z, a], [z, z, z], [z, z, z]] ); assert_eq!( - dev.upper_tri::>(vl, None).array(), + dev.upper_tri::>(a, None).array(), [[ - [vl, vl, vl, vl, vl], - [0., vl, vl, vl, vl], - [0., 0., vl, vl, vl], - [0., 0., 0., vl, vl], - [0., 0., 0., 0., vl] + [a, a, a, a, a], + [z, a, a, a, a], + [z, z, a, a, a], + [z, z, z, a, a], + [z, z, z, z, a] ]; 2] ); assert_eq!( - dev.upper_tri::>(vl, 2).array(), + dev.upper_tri::>(a, 2).array(), [[ - [0., 0., vl, vl, vl], - [0., 0., 0., vl, vl], - [0., 0., 0., 0., vl], - [0., 0., 0., 0., 0.], - [0., 0., 0., 0., 0.] 
+ [z, z, a, a, a], + [z, z, z, a, a], + [z, z, z, z, a], + [z, z, z, z, z], + [z, z, z, z, z] ]; 4] ); assert_eq!( - dev.upper_tri::>(vl, None).array(), + dev.upper_tri::>(a, None).array(), [[[ - [vl, vl, vl, vl, vl, vl], - [0., vl, vl, vl, vl, vl], - [0., 0., vl, vl, vl, vl], - [0., 0., 0., vl, vl, vl], - [0., 0., 0., 0., vl, vl] + [a, a, a, a, a, a], + [z, a, a, a, a, a], + [z, z, a, a, a, a], + [z, z, z, a, a, a], + [z, z, z, z, a, a] ]; 4]; 3] ); } @@ -364,80 +348,64 @@ mod tests { #[test] fn test_lower_tri() { let dev: TestDevice = Default::default(); - let vl: TestDtype = 42.0; + let a: TestDtype = NumCast::from(42.0).unwrap(); + let z = TestDtype::zero(); - assert_eq!(dev.lower_tri::(vl, None).array(), vl); - assert_eq!(dev.lower_tri::(vl, -1).array(), 0.); + assert_eq!(dev.lower_tri::(a, None).array(), a); + assert_eq!(dev.lower_tri::(a, -1).array(), z); + assert_eq!(dev.lower_tri::>(a, None).array(), [a, z, z]); + assert_eq!(dev.lower_tri::>(a, 1).array(), [a, a, z]); - assert_eq!(dev.lower_tri::>(vl, None).array(), [vl, 0., 0.]); - assert_eq!(dev.lower_tri::>(vl, 1).array(), [vl, vl, 0.]); - - assert_eq!( - dev.lower_tri::>(vl, None).array(), - [[vl, 0., 0., 0.], [vl, vl, 0., 0.], [vl, vl, vl, 0.]] - ); - assert_eq!( - dev.lower_tri::>(vl, None).array(), - [[vl], [vl], [vl]] - ); assert_eq!( - dev.lower_tri::>(vl, 1).array(), - [[vl], [vl], [vl]] + dev.lower_tri::>(a, None).array(), + [[a, z, z, z], [a, a, z, z], [a, a, a, z]] ); assert_eq!( - dev.lower_tri::>(vl, -1).array(), - [[0.], [vl], [vl]] + dev.lower_tri::>(a, None).array(), + [[a], [a], [a]] ); + assert_eq!(dev.lower_tri::>(a, 1).array(), [[a], [a], [a]]); + assert_eq!(dev.lower_tri::>(a, -1).array(), [[z], [a], [a]]); assert_eq!( - dev.lower_tri::>(vl, -1).array(), - [ - [0., 0., 0., 0.], - [vl, 0., 0., 0.], - [vl, vl, 0., 0.], - [vl, vl, vl, 0.] - ] + dev.lower_tri::>(a, -1).array(), + [[z, z, z, z], [a, z, z, z], [a, a, z, z], [a, a, a, z]] ); assert_eq!( - dev.lower_tri::>(vl, -2).array(), - [ - [0., 0., 0., 0.], - [0., 0., 0., 0.], - [vl, 0., 0., 0.], - [vl, vl, 0., 0.] - ] + dev.lower_tri::>(a, -2).array(), + [[z, z, z, z], [z, z, z, z], [a, z, z, z], [a, a, z, z]] ); assert_eq!( - dev.lower_tri::>(vl, 1).array(), - [[vl, vl, 0.], [vl, vl, vl], [vl, vl, vl], [vl, vl, vl]] + dev.lower_tri::>(a, 1).array(), + [[a, a, z], [a, a, a], [a, a, a], [a, a, a]] ); assert_eq!( - dev.lower_tri::>(vl, None).array(), + dev.lower_tri::>(a, None).array(), [[ - [vl, 0., 0., 0., 0.], - [vl, vl, 0., 0., 0.], - [vl, vl, vl, 0., 0.], - [vl, vl, vl, vl, 0.], - [vl, vl, vl, vl, vl] + [a, z, z, z, z], + [a, a, z, z, z], + [a, a, a, z, z], + [a, a, a, a, z], + [a, a, a, a, a] ]; 2] ); assert_eq!( - dev.lower_tri::>(vl, 2).array(), + dev.lower_tri::>(a, 2).array(), [[ - [vl, vl, vl, 0., 0.], - [vl, vl, vl, vl, 0.], - [vl, vl, vl, vl, vl], - [vl, vl, vl, vl, vl], - [vl, vl, vl, vl, vl] + [a, a, a, z, z], + [a, a, a, a, z], + [a, a, a, a, a], + [a, a, a, a, a], + [a, a, a, a, a] ]; 4] ); assert_eq!( - dev.lower_tri::>(vl, None).array(), + dev.lower_tri::>(a, None).array(), [[[ - [vl, 0., 0., 0., 0., 0.], - [vl, vl, 0., 0., 0., 0.], - [vl, vl, vl, 0., 0., 0.], - [vl, vl, vl, vl, 0., 0.], - [vl, vl, vl, vl, vl, 0.] 
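// (The `a`/`z` rewrite running through these triangle tests is what makes
// them dtype-generic: float literals like `42.0` and `0.` do not exist for
// half::f16, so the two values are produced once via num-traits and reused:
//
//     use num_traits::{NumCast, Zero};
//     let a: TestDtype = NumCast::from(42.0).unwrap(); // 42.0 in the test dtype
//     let z = TestDtype::zero();                       // additive identity
// )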
+ [a, z, z, z, z, z], + [a, a, z, z, z, z], + [a, a, a, z, z, z], + [a, a, a, a, z, z], + [a, a, a, a, a, z] ]; 4]; 3] ); } diff --git a/src/tensor_ops/abs/abs.cu b/src/tensor_ops/abs/abs.cu index 045e8fa15..cc6773a6d 100644 --- a/src/tensor_ops/abs/abs.cu +++ b/src/tensor_ops/abs/abs.cu @@ -2,10 +2,14 @@ struct AbsKernelOp {}; +UNARY_OP(__half, abs_fwd_f16, abs_bwd_f16, AbsKernelOp, + absg(x), + x == __float2half(0.0) ? __float2half(0.0) : copysigng(__float2half(1.0), x)); + UNARY_OP(float, abs_fwd_f32, abs_bwd_f32, AbsKernelOp, - fabsf(x), - x == 0.0 ? 0.0 : copysignf(1.0, x)); + absg(x), + x == 0.0 ? 0.0f : copysigng(1.0f, x)); UNARY_OP(double, abs_fwd_f64, abs_bwd_f64, AbsKernelOp, - fabs(x), - x == 0.0 ? 0.0 : copysign(1.0, x)); + absg(x), + x == 0.0 ? 0.0 : copysigng(1.0, x)); diff --git a/src/tensor_ops/abs/cuda_kernel.rs b/src/tensor_ops/abs/cuda_kernel.rs index 275738ced..773da7c54 100644 --- a/src/tensor_ops/abs/cuda_kernel.rs +++ b/src/tensor_ops/abs/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for AbsKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/abs.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(AbsKernelOp, half::f16, PTX, "abs_fwd_f16", "abs_bwd_f16"); cuda_unary!(AbsKernelOp, f32, PTX, "abs_fwd_f32", "abs_bwd_f32"); cuda_unary!(AbsKernelOp, f64, PTX, "abs_fwd_f64", "abs_bwd_f64"); diff --git a/src/tensor_ops/abs/mod.rs b/src/tensor_ops/abs/mod.rs index 61ca7a3a7..6d4d663dd 100644 --- a/src/tensor_ops/abs/mod.rs +++ b/src/tensor_ops/abs/mod.rs @@ -46,7 +46,9 @@ mod tests { #[test] fn test_abs() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = x.leaky_trace().abs(); assert_close_to_literal!(r, [2.0, 1.0, 0.0, 1.0, 2.0]); let g = r.mean().backward(); diff --git a/src/tensor_ops/add/binary_add.cu b/src/tensor_ops/add/binary_add.cu index de6ca68db..d749bc13b 100644 --- a/src/tensor_ops/add/binary_add.cu +++ b/src/tensor_ops/add/binary_add.cu @@ -2,6 +2,11 @@ struct BinaryAddOp {}; +BINARY_OP(__half, badd_fwd_f16, badd_bwd_lhs_f16, badd_bwd_rhs_f16, BinaryAddOp, + x + y, + 1.0, + 1.0) + BINARY_OP(float, badd_fwd_f32, badd_bwd_lhs_f32, badd_bwd_rhs_f32, BinaryAddOp, x + y, 1.0, diff --git a/src/tensor_ops/add/cuda_kernel.rs b/src/tensor_ops/add/cuda_kernel.rs index ce87ac1bd..a2b399643 100644 --- a/src/tensor_ops/add/cuda_kernel.rs +++ b/src/tensor_ops/add/cuda_kernel.rs @@ -1,6 +1,8 @@ use super::{BinaryAddKernelOp as Binary, ScalarAddKernelOp as Scalar}; use crate::tensor_ops::cuda_kernels::{cuda_binary, cuda_unary}; +#[cfg(feature = "f16")] +unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Binary {} @@ -8,8 +10,19 @@ unsafe impl cudarc::driver::DeviceRepr for Binary {} const SCALAR_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/scalar_add.ptx")); const BINARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/binary_add.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(const_df() Scalar, half::f16, SCALAR_PTX, "sadd_fwd_f16", "sadd_bwd_f16"); cuda_unary!(const_df() Scalar, f32, SCALAR_PTX, "sadd_fwd_f32", "sadd_bwd_f32"); cuda_unary!(const_df() Scalar, f64, SCALAR_PTX, "sadd_fwd_f64", "sadd_bwd_f64"); +#[cfg(feature = "f16")] +cuda_binary!( + const_df() Binary, + half::f16, + BINARY_PTX, + "badd_fwd_f16", + "badd_bwd_lhs_f16", + 
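// (This fwd/bwd_lhs/bwd_rhs triple is the registration recipe repeated for
// every binary op in this PR: the .cu file stamps out the __half kernels with
// BINARY_OP(__half, ...), and the Rust side wires them up behind the feature
// gate, plus one DeviceRepr impl so the zero-sized op struct can be passed to
// the kernel. For a hypothetical new op the pattern would be:
//
//     #[cfg(feature = "f16")]
//     cuda_binary!(MyOp, half::f16, PTX,
//         "myop_fwd_f16", "myop_bwd_lhs_f16", "myop_bwd_rhs_f16");
// )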
"badd_bwd_rhs_f16" +); cuda_binary!( const_df() Binary, f32, diff --git a/src/tensor_ops/add/mod.rs b/src/tensor_ops/add/mod.rs index 402cf8952..0ab9dd5e6 100644 --- a/src/tensor_ops/add/mod.rs +++ b/src/tensor_ops/add/mod.rs @@ -74,6 +74,17 @@ impl, E>, T: Tape> } } +#[cfg(feature = "f16")] +impl, half::f16>, T: Tape> + TryAdd for Tensor +{ + /// See [add] + fn try_add(self, rhs: f32) -> Result { + let scalar = half::f16::from_f32(rhs); + try_unary_op(ScalarAddKernelOp { scalar }, self) + } +} + impl, Rhs> std::ops::Add for Tensor where @@ -93,8 +104,8 @@ mod tests { #[test] fn test_add_0d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor(1.0); - let b: Tensor<_, TestDtype, _> = dev.tensor(1.0); + let a = dev.tensor(1.0f64).to_dtype::(); + let b = dev.tensor(1.0f64).to_dtype::(); let r = a.leaky_trace() + b.clone(); assert_close_to_literal!(r, 2.0); @@ -106,8 +117,8 @@ mod tests { #[test] fn test_add_1d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); - let b: Tensor<_, TestDtype, _> = dev.tensor([1.0, -1.0, 0.0]); + let a = dev.tensor([1.0f64, 2.0, 3.0]).to_dtype::(); + let b = dev.tensor([1.0f64, -1.0, 0.0]).to_dtype::(); let r = a.leaky_trace() + b.clone(); assert_close_to_literal!(r, [2.0, 1.0, 3.0]); @@ -119,10 +130,12 @@ mod tests { #[test] fn test_add_2d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = - dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]); - let b: Tensor<_, TestDtype, _> = - dev.tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]); + let a = dev + .tensor([[0.6570f64, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]) + .to_dtype::(); + let b = dev + .tensor([[0.5199f64, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]) + .to_dtype::(); let r = a.leaky_trace() + b.clone(); assert_close_to_literal!(r, [[1.1769, 0.5552, 0.5259], [1.3917, 1.0692, 0.873]]); @@ -134,10 +147,12 @@ mod tests { #[test] fn test_add_broadcast_bottom() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = - dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]); - let b: Tensor<_, TestDtype, _> = - dev.tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]); + let a = dev + .tensor([[0.6570f64, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]) + .to_dtype::(); + let b = dev + .tensor([[0.5199f64, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]) + .to_dtype::(); let a2 = a.broadcast::, _>(); let b2 = b.broadcast::, _>(); @@ -158,10 +173,12 @@ mod tests { #[test] fn test_add_broadcast_top() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = - dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]); - let b: Tensor<_, TestDtype, _> = - dev.tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]); + let a = dev + .tensor([[0.6570f64, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]) + .to_dtype::(); + let b = dev + .tensor([[0.5199f64, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]) + .to_dtype::(); let a2 = a.broadcast::, _>(); let b2 = b.broadcast::, _>(); @@ -176,7 +193,7 @@ mod tests { #[test] fn test_scalar_add_0d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor(0.0); + let x: Tensor<(), TestDtype, _> = dev.zeros(); let r = x.leaky_trace() + 1.0; assert_close_to_literal!(r, 1.0); let g = r.exp().backward(); @@ -186,7 +203,7 @@ mod tests { #[test] fn test_scalar_add_1d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = 
dev.tensor([0.0, 1.0, 2.0]); + let x = dev.tensor([0.0, 1.0, 2.0]).to_dtype::(); let r = x.leaky_trace() + 0.5; assert_close_to_literal!(r, [0.5, 1.5, 2.5]); let g = r.exp().sum().backward(); @@ -196,7 +213,7 @@ mod tests { #[test] fn test_scalar_add_2d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([[0.0; 2]; 3]); + let x = dev.tensor([[0.0; 2]; 3]).to_dtype::(); let r = x.leaky_trace() + 0.5; assert_close_to_literal!(r, [[0.5; 2]; 3]); let g = r.exp().sum().backward(); diff --git a/src/tensor_ops/add/scalar_add.cu b/src/tensor_ops/add/scalar_add.cu index d82a3c7a7..3a3c7d63d 100644 --- a/src/tensor_ops/add/scalar_add.cu +++ b/src/tensor_ops/add/scalar_add.cu @@ -5,6 +5,10 @@ struct ScalarAddKernelOp { F scalar; }; +UNARY_OP(__half, sadd_fwd_f16, sadd_bwd_f16, ScalarAddKernelOp<__half>, + x + op.scalar, + 1.0); + UNARY_OP(float, sadd_fwd_f32, sadd_bwd_f32, ScalarAddKernelOp, x + op.scalar, 1.0); diff --git a/src/tensor_ops/attention_reshape/attention_reshape.cu b/src/tensor_ops/attention_reshape/attention_reshape.cu index 763d3e142..27c51b5cd 100644 --- a/src/tensor_ops/attention_reshape/attention_reshape.cu +++ b/src/tensor_ops/attention_reshape/attention_reshape.cu @@ -64,6 +64,17 @@ __device__ void attention_reshape( } } +extern "C" __global__ void attention_reshape_f16( + const AttentionReshapeOp op, + const __half *qkv, + const __half *past_key, + const __half *past_value, + __half *query, + __half *key, + __half *value +) { + attention_reshape(op, qkv, past_key, past_value, query, key, value); +} extern "C" __global__ void attention_reshape_f32( const AttentionReshapeOp op, diff --git a/src/tensor_ops/attention_reshape/cuda_kernel.rs b/src/tensor_ops/attention_reshape/cuda_kernel.rs index a051cbd95..b4a7b37b1 100644 --- a/src/tensor_ops/attention_reshape/cuda_kernel.rs +++ b/src/tensor_ops/attention_reshape/cuda_kernel.rs @@ -19,6 +19,11 @@ trait HasCudaKernel { const FN: &'static str; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const FN: &'static str = "attention_reshape_f16"; +} + impl HasCudaKernel for Cuda { const FN: &'static str = "attention_reshape_f32"; } diff --git a/src/tensor_ops/attention_reshape/mod.rs b/src/tensor_ops/attention_reshape/mod.rs index ad600a1aa..7e26fcb70 100644 --- a/src/tensor_ops/attention_reshape/mod.rs +++ b/src/tensor_ops/attention_reshape/mod.rs @@ -89,7 +89,7 @@ impl> TryAttentionReshape for D { #[cfg(test)] mod tests { use super::*; - use crate::tests::*; + use crate::{tensor_ops::*, tests::*}; #[test] fn test_attention_reshape() { @@ -100,37 +100,57 @@ mod tests { let sequence_length = 1; let past_length = 3; - { - let qkv: Tensor<(usize, Const<{ NUM_HEADS * HEAD_DIM * 3 }>), TestDtype, _> = - dev.zeros_like(&(sequence_length, Const)) + 1.0; - let past_key: Tensor<(Const, Const, usize), TestDtype, _> = - dev.zeros_like(&(Const, Const, past_length)) + 2.0; - let past_value: Tensor<(Const, usize, Const), TestDtype, _> = - dev.zeros_like(&(Const, past_length, Const)) + 3.0; + let qkv: Tensor<(usize, Const<{ NUM_HEADS * HEAD_DIM * 3 }>), TestDtype, _> = + dev.zeros_like(&(sequence_length, Const)) + 1.0; + let past_key: Tensor<(Const, Const, usize), TestDtype, _> = + dev.zeros_like(&(Const, Const, past_length)) + 2.0; + let past_value: Tensor<(Const, usize, Const), TestDtype, _> = + dev.zeros_like(&(Const, past_length, Const)) + 3.0; - let (q, k, v) = dev.attention_reshape(&qkv, &past_key, &past_value); + let (q, k, v) = dev.attention_reshape(&qkv, &past_key, &past_value); - 
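// (The assertions below are being replaced rather than just re-typed: exact
// `as_vec()` equality cannot survive an f16 round-trip, so the rewrite uses
// `assert_close_to_literal!`, which compares within the dtype's tolerance,
// after `realize`-ing the runtime `usize` dimensions into `Const` shapes so
// the literal arrays type-check. The realize step on its own, with the
// concrete sizes from this test (2 heads, 3-dim heads, 3 past + 1 new):
//
//     let k = k.realize::<(Const<2>, Const<3>, Const<4>)>().unwrap();
//     assert_close_to_literal!(k, [[[2.0, 2.0, 2.0, 1.0]; 3]; 2]);
// )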
assert_eq!(q.as_vec(), std::vec![1.0; 6]); - #[rustfmt::skip] - assert_eq!( - k.as_vec(), - std::vec![ - 2.0, 2.0, 2.0, 1.0, - 2.0, 2.0, 2.0, 1.0, - 2.0, 2.0, 2.0, 1.0, - 2.0, 2.0, 2.0, 1.0, - 2.0, 2.0, 2.0, 1.0, - 2.0, 2.0, 2.0, 1.0 + let q = q + .realize::<(Const, Const<1>, Const)>() + .unwrap(); + let k = k + .realize::<(Const, Const, Const<4>)>() + .unwrap(); + let v = v + .realize::<(Const, Const<4>, Const)>() + .unwrap(); + + assert_close_to_literal!(q, [[[1.0; HEAD_DIM]; 1]; NUM_HEADS]); + assert_close_to_literal!( + k, + [ + [ + [2.0, 2.0, 2.0, 1.0], + [2.0, 2.0, 2.0, 1.0], + [2.0, 2.0, 2.0, 1.0] + ], + [ + [2.0, 2.0, 2.0, 1.0], + [2.0, 2.0, 2.0, 1.0], + [2.0, 2.0, 2.0, 1.0] + ] ] ); - #[rustfmt::skip] - assert_eq!( - v.as_vec(), - std::vec![ - 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 1.0, 1.0, - 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 1.0, 1.0 + assert_close_to_literal!( + v, + [ + [ + [3.0, 3.0, 3.0], + [3.0, 3.0, 3.0], + [3.0, 3.0, 3.0], + [1.0, 1.0, 1.0] + ], + [ + [3.0, 3.0, 3.0], + [3.0, 3.0, 3.0], + [3.0, 3.0, 3.0], + [1.0, 1.0, 1.0] + ] ] ); - } } } diff --git a/src/tensor_ops/axpy/axpy.cu b/src/tensor_ops/axpy/axpy.cu index 487541a97..9e6907757 100644 --- a/src/tensor_ops/axpy/axpy.cu +++ b/src/tensor_ops/axpy/axpy.cu @@ -1,3 +1,5 @@ +#include "cuda_fp16.h" + template __device__ void axpy(const size_t n, T* a, const T alpha, const T* b, const T beta) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -7,6 +9,10 @@ __device__ void axpy(const size_t n, T* a, const T alpha, const T* b, const T be a[i] = a[i] * alpha + b[i] * beta; } +extern "C" __global__ void axpy_f16(const size_t n, __half* a, const __half alpha, const __half* b, const __half beta) { + axpy(n, a, alpha, b, beta); +} + extern "C" __global__ void axpy_f32(const size_t n, float* a, const float alpha, const float* b, const float beta) { axpy(n, a, alpha, b, beta); } diff --git a/src/tensor_ops/axpy/cuda_kernel.rs b/src/tensor_ops/axpy/cuda_kernel.rs index 91b371ca8..57841a32d 100644 --- a/src/tensor_ops/axpy/cuda_kernel.rs +++ b/src/tensor_ops/axpy/cuda_kernel.rs @@ -10,6 +10,10 @@ const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/axpy.ptx")); trait HasCudaKernel { const FN: &'static str; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const FN: &'static str = "axpy_f16"; +} impl HasCudaKernel for Cuda { const FN: &'static str = "axpy_f32"; } diff --git a/src/tensor_ops/axpy/mod.rs b/src/tensor_ops/axpy/mod.rs index 8299eb2ae..b78c3ea45 100644 --- a/src/tensor_ops/axpy/mod.rs +++ b/src/tensor_ops/axpy/mod.rs @@ -1,5 +1,5 @@ use crate::{ - shapes::{Shape, Unit}, + shapes::{Dtype, Shape}, tensor::{DeviceStorage, Tensor}, }; @@ -10,11 +10,11 @@ mod cuda_kernel; /// Elementwise `a * alpha + b * beta`. /// /// See [Tensor::axpy] for in place version. -pub fn axpy( +pub fn axpy( a: &Tensor, - alpha: impl Into, + alpha: impl Into, b: &Tensor, - beta: impl Into, + beta: impl Into, ) -> Tensor where D: AxpyKernel, @@ -24,31 +24,31 @@ where dst } -impl> Tensor { +impl> Tensor { /// Updates self with elementwise function `self = self * alpha + b * beta`. - pub fn axpy(&mut self, alpha: impl Into, b: &Tensor, beta: impl Into) { + pub fn axpy(&mut self, alpha: impl Into, b: &Tensor, beta: impl Into) { self.try_axpy(alpha, b, beta).unwrap() } /// Updates self with elementwise function `self = self * alpha + b * beta`. 
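// (axpy follows the same dtype-erasure rule as the optimizer configs: the
// public signatures take `impl Into<f64>`, and the f64 is narrowed to the
// element type exactly once, inside try_axpy:
//
//     E::from_f64(alpha.into()).unwrap()
//
// so `a.axpy(0.01, &b, 0.99)` keeps working whether the tensor holds f16,
// f32, or f64 elements.)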
pub fn try_axpy( &mut self, - alpha: impl Into, + alpha: impl Into, b: &Tensor, - beta: impl Into, + beta: impl Into, ) -> Result<(), D::Err> { assert_eq!(self.shape, b.shape); assert_eq!(self.strides, b.strides, "Strides must be equal for axpy"); self.device.clone().forward( std::sync::Arc::make_mut(&mut self.data), - alpha.into(), + E::from_f64(alpha.into()).unwrap(), b.data.as_ref(), - beta.into(), + E::from_f64(beta.into()).unwrap(), ) } } -pub trait AxpyKernel: DeviceStorage { +pub trait AxpyKernel: DeviceStorage { fn forward( &self, a: &mut Self::Vec, @@ -85,8 +85,10 @@ mod tests { fn test_axpy() { let dev: TestDevice = Default::default(); - let mut a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, -1.0, 0.0, 1.0, 2.0]; 2]); - let b: Tensor<_, TestDtype, _> = dev.tensor([[-1.5; 5], [1.5; 5]]); + let mut a = dev + .tensor([[-2.0, -1.0, 0.0, 1.0, 2.0]; 2]) + .to_dtype::(); + let b = dev.tensor([[-1.5; 5], [1.5; 5]]).to_dtype::(); a.axpy(0.01, &b, 0.99); diff --git a/src/tensor_ops/bce/bce.cu b/src/tensor_ops/bce/bce.cu index 081f13249..dc1d7de3d 100644 --- a/src/tensor_ops/bce/bce.cu +++ b/src/tensor_ops/bce/bce.cu @@ -5,12 +5,14 @@ struct BCEKernelOp {}; template __device__ T op_f(T logit, T prob) { T zero = 0.0; - return maxg(logit, zero) - logit * prob + logg(1.0 + expg(-absg(logit))); + T one = 1.0; + return maxg(logit, zero) - logit * prob + logg(one + expg(-absg(logit))); } template __device__ T op_dfdx(T logit, T prob) { - return 1.0 - prob - 1 / (1.0 + expg(logit)); + T one = 1.0; + return one - prob - one / (one + expg(logit)); } template @@ -18,6 +20,12 @@ __device__ T op_dfdy(T logit, T prob) { return -logit; } +BINARY_OP(__half, bce_fwd_f16, bce_bwd_lhs_f16, bce_bwd_rhs_f16, BCEKernelOp, + __float2half(op_f(__half2float(x), __half2float(y))), + op_dfdx(x, y), + op_dfdy(x, y) +) + BINARY_OP(float, bce_fwd_f32, bce_bwd_lhs_f32, bce_bwd_rhs_f32, BCEKernelOp, op_f(x, y), op_dfdx(x, y), diff --git a/src/tensor_ops/bce/cuda_kernel.rs b/src/tensor_ops/bce/cuda_kernel.rs index 35c091de9..f55d2ac43 100644 --- a/src/tensor_ops/bce/cuda_kernel.rs +++ b/src/tensor_ops/bce/cuda_kernel.rs @@ -5,6 +5,15 @@ unsafe impl cudarc::driver::DeviceRepr for BCEKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/bce.ptx")); +#[cfg(feature = "f16")] +cuda_binary!( + BCEKernelOp, + half::f16, + PTX, + "bce_fwd_f16", + "bce_bwd_lhs_f16", + "bce_bwd_rhs_f16" +); cuda_binary!( BCEKernelOp, f32, diff --git a/src/tensor_ops/bce/mod.rs b/src/tensor_ops/bce/mod.rs index 545cd2798..3053f045f 100644 --- a/src/tensor_ops/bce/mod.rs +++ b/src/tensor_ops/bce/mod.rs @@ -61,14 +61,18 @@ mod tests { #[test] fn test_bce() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([ - [-0.8424031, 0.6309481, 1.0416432], - [1.325225, 0.5840275, 1.9167633], - ]); - let b: Tensor<_, TestDtype, _> = dev.tensor([ - [0.52022195, 0.578804, 0.17535722], - [0.75429636, 0.66566986, 0.6182751], - ]); + let a = dev + .tensor([ + [-0.8424031, 0.6309481, 1.0416432], + [1.325225, 0.5840275, 1.9167633], + ]) + .to_dtype::(); + let b = dev + .tensor([ + [0.52022195, 0.578804, 0.17535722], + [0.75429636, 0.66566986, 0.6182751], + ]) + .to_dtype::(); let r = a.leaky_trace().bce_with_logits(b); assert_close_to_literal!( r, diff --git a/src/tensor_ops/choose/choose.cu b/src/tensor_ops/choose/choose.cu index d2e5a6187..799af246f 100644 --- a/src/tensor_ops/choose/choose.cu +++ b/src/tensor_ops/choose/choose.cu @@ -83,5 +83,6 @@ extern "C" __global__ void BWD( \ choose_bwd(numel, num_dims, 
dims, cond, cond_strides, grad_lhs, lhs_strides, grad_rhs, rhs_strides, grad_out); \ } +CHOOSE(__half, choose_fwd_f16, choose_bwd_f16); CHOOSE(float, choose_fwd_f32, choose_bwd_f32); CHOOSE(double, choose_fwd_f64, choose_bwd_f64); diff --git a/src/tensor_ops/choose/cuda_kernel.rs b/src/tensor_ops/choose/cuda_kernel.rs index d9f8f8f74..6a3c12794 100644 --- a/src/tensor_ops/choose/cuda_kernel.rs +++ b/src/tensor_ops/choose/cuda_kernel.rs @@ -11,6 +11,12 @@ pub(crate) trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "choose_f16"; + const FNS: &'static [&'static str] = &["choose_fwd_f16", "choose_bwd_f16"]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "choose_f32"; const FNS: &'static [&'static str] = &["choose_fwd_f32", "choose_bwd_f32"]; diff --git a/src/tensor_ops/clamp/clamp.cu b/src/tensor_ops/clamp/clamp.cu index d61e09aa5..f5f90b8e3 100644 --- a/src/tensor_ops/clamp/clamp.cu +++ b/src/tensor_ops/clamp/clamp.cu @@ -6,11 +6,15 @@ struct ClampKernelOp { F max; }; +UNARY_OP(__half, clamp_fwd_f16, clamp_bwd_f16, ClampKernelOp<__half>, + maxg(ming(x, op.max), op.min), + x <= op.max && x >= op.min ? 1.0 : 0.0) + UNARY_OP(float, clamp_fwd_f32, clamp_bwd_f32, ClampKernelOp, - fmaxf(fminf(x, op.max), op.min), + maxg(ming(x, op.max), op.min), x <= op.max && x >= op.min ? 1.0 : 0.0) UNARY_OP(double, clamp_fwd_f64, clamp_bwd_f64, ClampKernelOp, - fmax(fmin(x, op.max), op.min), + maxg(ming(x, op.max), op.min), x <= op.max && x >= op.min ? 1.0 : 0.0) \ No newline at end of file diff --git a/src/tensor_ops/clamp/cuda_kernel.rs b/src/tensor_ops/clamp/cuda_kernel.rs index 890014fef..c8abf5248 100644 --- a/src/tensor_ops/clamp/cuda_kernel.rs +++ b/src/tensor_ops/clamp/cuda_kernel.rs @@ -1,10 +1,20 @@ use super::ClampKernelOp; use crate::tensor_ops::cuda_kernels::cuda_unary; +#[cfg(feature = "f16")] +unsafe impl cudarc::driver::DeviceRepr for ClampKernelOp {} unsafe impl cudarc::driver::DeviceRepr for ClampKernelOp {} unsafe impl cudarc::driver::DeviceRepr for ClampKernelOp {} const P: &str = include_str!(concat!(env!("OUT_DIR"), "/clamp.ptx")); +#[cfg(feature = "f16")] +cuda_unary!( + ClampKernelOp, + half::f16, + P, + "clamp_fwd_f16", + "clamp_bwd_f16" +); cuda_unary!(ClampKernelOp, f32, P, "clamp_fwd_f32", "clamp_bwd_f32"); cuda_unary!(ClampKernelOp, f64, P, "clamp_fwd_f64", "clamp_bwd_f64"); diff --git a/src/tensor_ops/clamp/mod.rs b/src/tensor_ops/clamp/mod.rs index ae7901fc4..b58b624aa 100644 --- a/src/tensor_ops/clamp/mod.rs +++ b/src/tensor_ops/clamp/mod.rs @@ -25,23 +25,23 @@ pub struct ClampKernelOp { /// ``` pub fn clamp, E>, T: Tape>( t: Tensor, - min: impl Into, - max: impl Into, + min: impl Into, + max: impl Into, ) -> Tensor { t.clamp(min, max) } impl, E>, T: Tape> Tensor { /// See [clamp] - pub fn clamp(self, min: impl Into, max: impl Into) -> Self { + pub fn clamp(self, min: impl Into, max: impl Into) -> Self { self.try_clamp(min, max).unwrap() } /// See [clamp] - pub fn try_clamp(self, min: impl Into, max: impl Into) -> Result { + pub fn try_clamp(self, min: impl Into, max: impl Into) -> Result { try_unary_op( ClampKernelOp { - min: min.into(), - max: max.into(), + min: E::from_f64(min.into()).unwrap(), + max: E::from_f64(max.into()).unwrap(), }, self, ) @@ -55,7 +55,9 @@ mod tests { #[test] fn test_clamp() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[-1.0, 0.0, 1.0], [-2.0, 2.0, 1.1]]); + let t = dev + .tensor([[-1.0, 0.0, 1.0], 
[-2.0, 2.0, 1.1]]) + .to_dtype::(); let r = t.leaky_trace().clamp(-1.0, 1.0); assert_close_to_literal!(r, [[-1.0, 0.0, 1.0], [-1.0, 1.0, 1.0]]); let g = r.exp().mean().backward(); diff --git a/src/tensor_ops/cmp/cmp.cu b/src/tensor_ops/cmp/cmp.cu index feff7e18f..bceeff44a 100644 --- a/src/tensor_ops/cmp/cmp.cu +++ b/src/tensor_ops/cmp/cmp.cu @@ -41,12 +41,20 @@ extern "C" __global__ void SCALAR_FWD( \ out[out_i] = lhs[lhs_i] SYMBOL scalar; \ } +CMP_OP(__half, eq_fwd_f16, scalar_eq_fwd_f16, ==) +CMP_OP(__half, ne_fwd_f16, scalar_ne_fwd_f16, !=) +CMP_OP(__half, gt_fwd_f16, scalar_gt_fwd_f16, >) +CMP_OP(__half, ge_fwd_f16, scalar_ge_fwd_f16, >=) +CMP_OP(__half, lt_fwd_f16, scalar_lt_fwd_f16, <) +CMP_OP(__half, le_fwd_f16, scalar_le_fwd_f16, <=) + CMP_OP(float, eq_fwd_f32, scalar_eq_fwd_f32, ==) CMP_OP(float, ne_fwd_f32, scalar_ne_fwd_f32, !=) CMP_OP(float, gt_fwd_f32, scalar_gt_fwd_f32, >) CMP_OP(float, ge_fwd_f32, scalar_ge_fwd_f32, >=) CMP_OP(float, lt_fwd_f32, scalar_lt_fwd_f32, <) CMP_OP(float, le_fwd_f32, scalar_le_fwd_f32, <=) + CMP_OP(double, eq_fwd_f64, scalar_eq_fwd_f64, ==) CMP_OP(double, ne_fwd_f64, scalar_ne_fwd_f64, !=) CMP_OP(double, gt_fwd_f64, scalar_gt_fwd_f64, >) diff --git a/src/tensor_ops/cmp/cuda_kernels.rs b/src/tensor_ops/cmp/cuda_kernels.rs index a9577be81..8b31b21c8 100644 --- a/src/tensor_ops/cmp/cuda_kernels.rs +++ b/src/tensor_ops/cmp/cuda_kernels.rs @@ -128,6 +128,19 @@ macro_rules! cmps { }; } +#[cfg(feature = "f16")] +cmps!(EqKernelOp, half::f16, "eq_fwd_f16", "scalar_eq_fwd_f16"); +#[cfg(feature = "f16")] +cmps!(NeKernelOp, half::f16, "ne_fwd_f16", "scalar_ne_fwd_f16"); +#[cfg(feature = "f16")] +cmps!(GtKernelOp, half::f16, "gt_fwd_f16", "scalar_gt_fwd_f16"); +#[cfg(feature = "f16")] +cmps!(GeKernelOp, half::f16, "ge_fwd_f16", "scalar_ge_fwd_f16"); +#[cfg(feature = "f16")] +cmps!(LtKernelOp, half::f16, "lt_fwd_f16", "scalar_lt_fwd_f16"); +#[cfg(feature = "f16")] +cmps!(LeKernelOp, half::f16, "le_fwd_f16", "scalar_le_fwd_f16"); + cmps!(EqKernelOp, f32, "eq_fwd_f32", "scalar_eq_fwd_f32"); cmps!(NeKernelOp, f32, "ne_fwd_f32", "scalar_ne_fwd_f32"); cmps!(GtKernelOp, f32, "gt_fwd_f32", "scalar_gt_fwd_f32"); diff --git a/src/tensor_ops/cmp/mod.rs b/src/tensor_ops/cmp/mod.rs index 1ebbec88b..e9f1bbc2f 100644 --- a/src/tensor_ops/cmp/mod.rs +++ b/src/tensor_ops/cmp/mod.rs @@ -228,6 +228,17 @@ macro_rules! 
impl_cmp_kernel_op { } } + #[cfg(feature = "f16")] + impl, T: Tape> + $TraitName for Tensor + { + type Output = Tensor; + #[doc = $doc] + fn $TryFnName(&self, other: f32) -> Result { + try_scalar_cmp_op(self, half::f16::from_f32(other)) + } + } + impl, T: Tape> Tensor { @@ -307,212 +318,192 @@ impl_cmp_kernel_op!( #[cfg(test)] mod tests { use super::*; - use crate::{shapes::*, tensor::*, tests::*}; - - type TestTensor = - Tensor<(Const, Const), E, TestDevice>; - - fn test_cmp( - a: [[E; C]; R], - b: [[E; C]; R], - cmp: F, - expected: [[bool; C]; R], - ) where - F: Fn(&TestTensor, &TestTensor) -> [[bool; C]; R], - { - let dev: TestDevice = Default::default(); - let a = dev.tensor(a); - let b = dev.tensor(b); - let r = cmp(&a, &b); - assert_eq!(r, expected); - } - - fn test_scalar_cmp( - a: [[E; C]; R], - cmp: F, - expected: [[bool; C]; R], - ) where - F: Fn(&TestTensor) -> [[bool; C]; R], - { - let dev: TestDevice = Default::default(); - let a = dev.tensor(a); - assert_eq!(cmp(&a), expected); - } + use crate::{tensor::*, tests::*}; #[test] fn test_eq() { let dev: TestDevice = Default::default(); let a = dev - .tensor([[1.0, 2.0, 0.0], [4.0, 5.0, 0.0]]) + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, -3.0], [4.0, 0.5, -0.0]]) .to_dtype::(); + let r = a.eq(&b); + assert_eq!(r.array(), [[false, true, false], [true, false, true]]); + #[cfg(not(feature = "cuda"))] { - let b = dev - .tensor([[0.0, 2.0, -3.0], [4.0, 0.5, -0.0]]) - .to_dtype::(); + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); let r = a.eq(&b); - assert_eq!(r.array(), [[false, true, false], [true, false, true]]); - } - - { - let r = a.eq(0.0); - assert_eq!(r.array(), [[false, false, true], [false, false, true]]); + assert_eq!(r.array(), [[false, true, false], [false, true, false]]); } } #[test] - fn test_ne() { - test_cmp::( - [[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]], - [[0.0, 2.0, -3.0], [4.0, 0.5, -0.0]], - |a, b| a.ne(b).array(), - [[true, false, true], [false, true, false]], - ); + fn test_scalar_eq() { + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.eq(1.2); + assert_eq!(r.array(), [[false, true], [false, false]]); } - // TODO Remove this attribute once Cuda supports integers - #[cfg(not(feature = "cuda"))] #[test] - fn test_ne_not_dtype() { - test_cmp( - [[1, 2, 3], [0, 123, 5]], - [[0, 2, -3], [-4, 123, 6]], - |a, b| a.ne(b).array(), - [[true, false, true], [true, false, true]], - ); + fn test_ne() { + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, -3.0], [4.0, 0.5, -0.0]]) + .to_dtype::(); + let r = a.ne(&b); + assert_eq!(r.array(), [[true, false, true], [false, true, false]]); + + #[cfg(not(feature = "cuda"))] + { + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); + let r = a.ne(&b); + assert_eq!(r.array(), [[true, false, true], [true, false, true]]); + } } #[test] fn test_scalar_ne() { - test_scalar_cmp::( - [[0.0, 1.2], [3.4, -5.6]], - |a| a.ne(1.2).array(), - [[true, false], [true, true]], - ); + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.ne(1.2); + assert_eq!(r.array(), [[true, false], [true, true]]); } #[test] fn test_gt() { - test_cmp::( - [[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]], - [[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]], - 
|a, b| a.gt(b).array(), - [[true, false, false], [true, true, false]], - ); - } + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]]) + .to_dtype::(); + let r = a.gt(&b); + assert_eq!(r.array(), [[true, false, false], [true, true, false]]); - // TODO Remove this attribute once Cuda supports integers - #[cfg(not(feature = "cuda"))] - #[test] - fn test_gt_not_dtype() { - test_cmp( - [[1, 2, 3], [0, 123, 5]], - [[0, 2, -3], [-4, 123, 6]], - |a, b| a.gt(b).array(), - [[true, false, true], [true, false, false]], - ); + #[cfg(not(feature = "cuda"))] + { + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); + let r = a.gt(&b); + assert_eq!(r.array(), [[true, false, true], [true, false, false]]); + } } #[test] fn test_scalar_gt() { - test_scalar_cmp::( - [[0.0, 1.2], [3.4, -5.6]], - |a| a.gt(1.2).array(), - [[false, false], [true, false]], - ); + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.gt(1.2); + assert_eq!(r.array(), [[false, false], [true, false]]); } #[test] fn test_ge() { - test_cmp::( - [[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]], - [[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]], - |a, b| a.ge(b).array(), - [[true, true, false], [true, true, true]], - ); - } + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]]) + .to_dtype::(); + let r = a.ge(&b); + assert_eq!(r.array(), [[true, true, false], [true, true, true]]); - // TODO Remove this attribute once Cuda supports integers - #[cfg(not(feature = "cuda"))] - #[test] - fn test_ge_not_dtype() { - test_cmp( - [[1, 2, 3], [0, 123, 5]], - [[0, 2, -3], [-4, 123, 6]], - |a, b| a.ge(b).array(), - [[true, true, true], [true, true, false]], - ); + #[cfg(not(feature = "cuda"))] + { + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); + let r = a.ge(&b); + assert_eq!(r.array(), [[true, true, true], [true, true, false]]); + } } #[test] fn test_scalar_ge() { - test_scalar_cmp::( - [[0.0, 1.2], [3.4, -5.6]], - |a| a.ge(1.2).array(), - [[false, true], [true, false]], - ); + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.ge(1.2); + assert_eq!(r.array(), [[false, true], [true, false]]); } #[test] fn test_lt() { - test_cmp::( - [[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]], - [[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]], - |a, b| a.lt(b).array(), - [[false, false, true], [false, false, false]], - ); - } + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]]) + .to_dtype::(); + let r = a.lt(&b); + assert_eq!(r.array(), [[false, false, true], [false, false, false]]); - // TODO Remove this attribute once Cuda supports integers - #[cfg(not(feature = "cuda"))] - #[test] - fn test_lt_not_dtype() { - test_cmp( - [[1, 2, 3], [0, 123, 5]], - [[0, 2, -3], [-4, 123, 6]], - |a, b| a.lt(b).array(), - [[false, false, false], [false, false, true]], - ); + #[cfg(not(feature = "cuda"))] + { + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); + let r = a.lt(&b); + assert_eq!(r.array(), [[false, false, false], [false, false, true]]); + } } #[test] fn 
test_scalar_lt() { - test_scalar_cmp::( - [[0.0, 1.2], [3.4, -5.6]], - |a| a.lt(1.2).array(), - [[true, false], [false, true]], - ); + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.lt(1.2); + assert_eq!(r.array(), [[true, false], [false, true]]); } #[test] fn test_le() { - test_cmp::( - [[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]], - [[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]], - |a, b| a.le(b).array(), - [[false, true, true], [false, false, true]], - ); - } + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]]) + .to_dtype::(); + let r = a.le(&b); + assert_eq!(r.array(), [[false, true, true], [false, false, true]]); - // TODO Remove this attribute once Cuda supports integers - #[cfg(not(feature = "cuda"))] - #[test] - fn test_le_not_dtype() { - test_cmp( - [[1, 2, 3], [0, 123, 5]], - [[0, 2, -3], [-4, 123, 6]], - |a, b| a.le(b).array(), - [[false, true, false], [false, true, true]], - ); + #[cfg(not(feature = "cuda"))] + { + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); + let r = a.le(&b); + assert_eq!(r.array(), [[false, true, false], [false, true, true]]); + } } #[test] fn test_scalar_le() { - test_scalar_cmp::( - [[0.0, 1.2], [3.4, -5.6]], - |a| a.le(1.2).array(), - [[true, true], [false, true]], - ); + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.le(1.2); + assert_eq!(r.array(), [[true, true], [false, true]]); } #[test] diff --git a/src/tensor_ops/concat/cuda_kernel.rs b/src/tensor_ops/concat/cuda_kernel.rs index 2dea40dcc..3a278df6c 100644 --- a/src/tensor_ops/concat/cuda_kernel.rs +++ b/src/tensor_ops/concat/cuda_kernel.rs @@ -39,6 +39,7 @@ impl super::ConcatKernel for Cuda { let src = BWD_KERNEL.replace("$Ty", E::NAME); let opts = CompileOptions { arch: Some(env!("CUDA_COMPUTE_CAP")), + include_paths: vec![env!("CUDA_INCLUDE_DIR").to_string()], ..Default::default() }; let ptx = compile_ptx_with_opts(src, opts).unwrap(); @@ -64,6 +65,7 @@ impl super::ConcatKernel for Cuda { } const BWD_KERNEL: &str = " +#include \"cuda_fp16.h\" extern \"C\" __global__ void concat_bwd(const size_t numel, const $Ty *inp, $Ty *out) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < numel) { out[i] += inp[i]; } diff --git a/src/tensor_ops/conv2d/conv2d.cu b/src/tensor_ops/conv2d/conv2d.cu index ed48dbc46..16e2ce9dc 100644 --- a/src/tensor_ops/conv2d/conv2d.cu +++ b/src/tensor_ops/conv2d/conv2d.cu @@ -1,3 +1,5 @@ +#include "cuda_fp16.h" + struct Conv2DOp { size_t stride; size_t padding; @@ -37,11 +39,13 @@ __device__ void unfold_input_into_patches( patches += c * (op.kernel * op.kernel * op.h_out * op.w_out); patches += b * (op.chan_in * op.kernel * op.kernel * op.h_out * op.w_out); + T zero = 0.0; + for (int k1 = 0;k1 < op.kernel;k1++) { const size_t y = oh * op.stride + k1 - op.padding; for (int k2 = 0;k2 < op.kernel;k2++) { const size_t x = ow * op.stride + k2 - op.padding; - *patches = (y >= op.h_in || x >= op.w_in) ? 0.0 : image[y * strides[2] + x * strides[3]]; + *patches = (y >= op.h_in || x >= op.w_in) ? 
zero : image[y * strides[2] + x * strides[3]]; patches += op.h_out * op.w_out; } } @@ -72,6 +76,8 @@ __device__ void unfold_output_into_patches( patches += o * (op.kernel * op.kernel * op.h_in * op.w_in); patches += b * (op.chan_out * op.kernel * op.kernel * op.h_in * op.w_in); + T zero = 0.0; + for (int k1 = 0;k1 < op.kernel;k1++) { const size_t oh_ks = y + op.padding; const size_t oh_s = oh_ks - k1; @@ -83,7 +89,7 @@ __device__ void unfold_output_into_patches( const size_t ow = ow_s / op.stride; const bool invalid = k1_invalid || (ow_ks < k2 || ow_s % op.stride != 0 || ow >= op.w_out); - *patches = invalid ? 0.0 : image_out[oh * op.w_out + ow]; + *patches = invalid ? zero : image_out[oh * op.w_out + ow]; patches += op.h_in * op.w_in; } } @@ -185,6 +191,13 @@ extern "C" __global__ void SUM_TR_FILTERS( \ sum_transposed_filters(op, filters_tr, filters, strides); \ } +CONV_OP( + __half, + unfold_input_into_patches_f16, + unfold_output_into_patches_f16, + transpose_filters_f16, + sum_transposed_filters_f16 +); CONV_OP( float, unfold_input_into_patches_f32, diff --git a/src/tensor_ops/conv2d/cuda_kernel.rs b/src/tensor_ops/conv2d/cuda_kernel.rs index d26ff0fef..d773b3e5e 100644 --- a/src/tensor_ops/conv2d/cuda_kernel.rs +++ b/src/tensor_ops/conv2d/cuda_kernel.rs @@ -17,6 +17,17 @@ trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "conv2d_f16"; + const FNS: &'static [&'static str] = &[ + "unfold_input_into_patches_f16", + "unfold_output_into_patches_f16", + "transpose_filters_f16", + "sum_transposed_filters_f16", + ]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "conv2d_f32"; const FNS: &'static [&'static str] = &[ diff --git a/src/tensor_ops/conv2d/cudnn_kernel.rs b/src/tensor_ops/conv2d/cudnn_kernel.rs index 4af2e8309..1f2792eca 100644 --- a/src/tensor_ops/conv2d/cudnn_kernel.rs +++ b/src/tensor_ops/conv2d/cudnn_kernel.rs @@ -9,6 +9,8 @@ use crate::{ use std::sync::Arc; trait HasCudnnKernel {} +#[cfg(feature = "f16")] +impl HasCudnnKernel for Cuda {} impl HasCudnnKernel for Cuda {} impl HasCudnnKernel for Cuda {} diff --git a/src/tensor_ops/conv2d/mod.rs b/src/tensor_ops/conv2d/mod.rs index dfcb0b434..593eccb52 100644 --- a/src/tensor_ops/conv2d/mod.rs +++ b/src/tensor_ops/conv2d/mod.rs @@ -239,15 +239,21 @@ mod tests { /// ``` fn test_conv2d_default_stride_and_padding() { let dev: TestDevice = Default::default(); - let weight: Tensor<_, TestDtype, _> = dev.tensor([ - [[[-0.04958433, -0.43007267], [0.01935136, 0.09778714]]], - [[[0.44083858, -0.20507240], [-0.30017477, -0.10937047]]], - ]); - let bias: Tensor<_, TestDtype, _> = dev.tensor([0.36406237, -0.30981010]); - let x: Tensor<_, TestDtype, _> = dev.tensor([[ - [-0.86713916, 0.52773184, -0.95238322], - [-0.64531374, 0.77809018, -0.49099201], - ]]); + let weight = dev + .tensor([ + [[[-0.04958433, -0.43007267], [0.01935136, 0.09778714]]], + [[[0.44083858, -0.20507240], [-0.30017477, -0.10937047]]], + ]) + .to_dtype::(); + let bias = dev + .tensor([0.36406237, -0.30981010]) + .to_dtype::(); + let x = dev + .tensor([[ + [-0.86713916, 0.52773184, -0.95238322], + [-0.64531374, 0.77809018, -0.49099201], + ]]) + .to_dtype::(); let result = x.leaky_trace().conv2d::<1, 0>(weight.clone()) + bias.leaky_trace().broadcast::<_, Axes2<1, 2>>(); assert_close_to_literal!( @@ -281,15 +287,21 @@ mod tests { /// ``` fn test_conv2d_stride_2() { let dev: TestDevice = Default::default(); - let weight: Tensor<_, TestDtype, _> = dev.tensor([ - 
[[[0.44704646, -0.29563826], [0.29228759, -0.16575140]]], - [[[-0.30488998, 0.25222939], [0.13279295, 0.38153177]]], - ]); - let bias: Tensor<_, TestDtype, _> = dev.tensor([-0.44699109, 0.38371694]); - let x: Tensor<_, TestDtype, _> = dev.tensor([[ - [0.37100124, -0.59504986, -1.19781005], - [-0.31547278, 0.58071911, 0.86612970], - ]]); + let weight = dev + .tensor([ + [[[0.44704646, -0.29563826], [0.29228759, -0.16575140]]], + [[[-0.30488998, 0.25222939], [0.13279295, 0.38153177]]], + ]) + .to_dtype::(); + let bias = dev + .tensor([-0.44699109, 0.38371694]) + .to_dtype::(); + let x = dev + .tensor([[ + [0.37100124, -0.59504986, -1.19781005], + [-0.31547278, 0.58071911, 0.86612970], + ]]) + .to_dtype::(); let result = x.leaky_trace().conv2d::<2, 0>(weight.clone()) + bias.leaky_trace().broadcast::<_, Axes2<1, 2>>(); @@ -317,14 +329,17 @@ mod tests { fn test_conv2d_padding_1() { let dev: TestDevice = Default::default(); #[rustfmt::skip] - let weight: Tensor<_, TestDtype, _> = dev.tensor([ + let weight = dev.tensor([ [[[0.10215953, 0.06263646], [-0.04124039, -0.09729567]], [[-0.32656857, 0.24254093], [-0.27209827, 0.15361503]]], [[[0.03449896, 0.22931078], [-0.17652659, 0.08222872]],[[-0.06016779, 0.29082409], [-0.19154115, 0.13483226]]], [[[-0.14262493, 0.19654515], [0.15921101, 0.01759464]],[[0.16749159, 0.33096817], [0.28376505, -0.05524009]]], - ]); - let bias: Tensor<_, TestDtype, _> = dev.tensor([-0.22854491, 0.28763595, 0.20709404]); - let x: Tensor<_, TestDtype, _> = - dev.tensor([[[-0.32224107, -0.32800716]], [[-1.13570976, 0.93713200]]]); + ]).to_dtype::(); + let bias = dev + .tensor([-0.22854491, 0.28763595, 0.20709404]) + .to_dtype::(); + let x = dev + .tensor([[[-0.32224107, -0.32800716]], [[-1.13570976, 0.93713200]]]) + .to_dtype::(); let result = x.leaky_trace().conv2d::<1, 1>(weight.clone()) + bias.leaky_trace().broadcast::<_, Axes2<1, 2>>(); @@ -363,13 +378,15 @@ mod tests { fn test_conv2d_stride_3_padding_4() { let dev: TestDevice = Default::default(); #[rustfmt::skip] - let weight: Tensor<_, TestDtype, _> = dev.tensor([ + let weight = dev.tensor([ [[[-0.10252278, -0.14387409, -0.14627469],[0.28396228, -0.14590892, 0.29269591],[0.01090384, 0.14785287, 0.29242596]]], [[[-0.31163597, 0.13224581, -0.20954299],[0.27902845, -0.14735751, 0.14001134],[-0.05224654, 0.16499066, -0.13981307]]], - ]); - let bias: Tensor<_, TestDtype, _> = dev.tensor([-0.07123789, -0.17244765]); + ]).to_dtype::(); + let bias = dev + .tensor([-0.07123789, -0.17244765]) + .to_dtype::(); #[rustfmt::skip] - let x: Tensor<_, TestDtype, _> = dev.tensor([[[0.69103152, 0.25624934],[-0.38448590, 0.03110456],[0.83753252, 0.53786588],[1.15540242, -0.54148245]]]); + let x = dev.tensor([[[0.69103152, 0.25624934],[-0.38448590, 0.03110456],[0.83753252, 0.53786588],[1.15540242, -0.54148245]]]).to_dtype::(); let result = x.leaky_trace().conv2d::<3, 4>(weight.clone()) + bias.leaky_trace().broadcast::<_, Axes2<1, 2>>(); diff --git a/src/tensor_ops/convtrans2d/convtrans2d.cu b/src/tensor_ops/convtrans2d/convtrans2d.cu index 0004a90b8..d6e842704 100644 --- a/src/tensor_ops/convtrans2d/convtrans2d.cu +++ b/src/tensor_ops/convtrans2d/convtrans2d.cu @@ -1,3 +1,5 @@ +#include "cuda_fp16.h" + struct Conv2DOp { size_t stride; size_t padding; @@ -38,6 +40,8 @@ __device__ void unfold_input_into_patches( patches += c * (op.kernel * op.kernel * op.h_out * op.w_out); patches += b * (op.chan_in * op.kernel * op.kernel * op.h_out * op.w_out); + T zero = 0.0; + for (int k1 = 0;k1 < op.kernel;k1++) { const size_t y_ks = oh + op.padding; 
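// (Same fix as in conv2d.cu above: the literal 0.0 is a double, so a ternary
// like `invalid ? 0.0 : image[...]` either fails to find a common type for
// the __half instantiation or silently promotes through double. Hoisting the
// zero into a typed local converts it once per thread and keeps the ternary
// dtype-clean:
//
//     T zero = 0.0;
//     *patches = invalid ? zero : image[y * strides[2] + x * strides[3]];
// )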
const size_t y_s = y_ks - k1; @@ -49,7 +53,7 @@ __device__ void unfold_input_into_patches( const size_t x = x_s / op.stride; const bool invalid = k1_invalid || (x_ks < k2 || x_s % op.stride != 0 || x >= op.w_in); - *patches = invalid ? 0.0 : image[y * strides[2] + x * strides[3]]; + *patches = invalid ? zero : image[y * strides[2] + x * strides[3]]; patches += op.h_out * op.w_out; } } @@ -80,11 +84,13 @@ __device__ void unfold_output_into_patches( patches += o * (op.kernel * op.kernel * op.h_in * op.w_in); patches += b * (op.chan_out * op.kernel * op.kernel * op.h_in * op.w_in); + T zero = 0.0; + for (int k1 = 0;k1 < op.kernel;k1++) { const size_t oh = y * op.stride + k1 - op.padding; for (int k2 = 0;k2 < op.kernel;k2++) { const size_t ow = x * op.stride + k2 - op.padding; - *patches = (oh >= op.h_out || ow >= op.w_out) ? 0.0 : image_out[oh * op.w_out + ow]; + *patches = (oh >= op.h_out || ow >= op.w_out) ? zero : image_out[oh * op.w_out + ow]; patches += op.h_in * op.w_in; } } @@ -186,6 +192,13 @@ extern "C" __global__ void SUM_TR_FILTERS( \ sum_transposed_filters(op, filters_tr, filters, strides); \ } +CONV_OP( + __half, + unfold_input_into_patches_f16, + unfold_output_into_patches_f16, + transpose_filters_f16, + sum_transposed_filters_f16 +); CONV_OP( float, unfold_input_into_patches_f32, diff --git a/src/tensor_ops/convtrans2d/cuda_kernel.rs b/src/tensor_ops/convtrans2d/cuda_kernel.rs index 21d88e9d8..9e30d4afc 100644 --- a/src/tensor_ops/convtrans2d/cuda_kernel.rs +++ b/src/tensor_ops/convtrans2d/cuda_kernel.rs @@ -17,6 +17,17 @@ trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "convtrans2d_f16"; + const FNS: &'static [&'static str] = &[ + "unfold_input_into_patches_f16", + "unfold_output_into_patches_f16", + "transpose_filters_f16", + "sum_transposed_filters_f16", + ]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "convtrans2d_f32"; const FNS: &'static [&'static str] = &[ diff --git a/src/tensor_ops/cos/cos.cu b/src/tensor_ops/cos/cos.cu index 53d891539..25f427391 100644 --- a/src/tensor_ops/cos/cos.cu +++ b/src/tensor_ops/cos/cos.cu @@ -2,11 +2,14 @@ struct CosKernelOp {}; +UNARY_OP(__half, cos_fwd_f16, cos_bwd_f16, CosKernelOp, + cosg(x), + -sing(x)) + UNARY_OP(float, cos_fwd_f32, cos_bwd_f32, CosKernelOp, - cosf(x), - -sinf(x)) + cosg(x), + -sing(x)) UNARY_OP(double, cos_fwd_f64, cos_bwd_f64, CosKernelOp, - cos(x), - -sin(x)) - \ No newline at end of file + cosg(x), + -sing(x)) diff --git a/src/tensor_ops/cos/cuda_kernel.rs b/src/tensor_ops/cos/cuda_kernel.rs index 2c6bd6874..904ba10ef 100644 --- a/src/tensor_ops/cos/cuda_kernel.rs +++ b/src/tensor_ops/cos/cuda_kernel.rs @@ -4,5 +4,13 @@ unsafe impl cudarc::driver::DeviceRepr for super::CosKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/cos.ptx")); +#[cfg(feature = "f16")] +cuda_unary!( + super::CosKernelOp, + half::f16, + PTX, + "cos_fwd_f16", + "cos_bwd_f16" +); cuda_unary!(super::CosKernelOp, f32, PTX, "cos_fwd_f32", "cos_bwd_f32"); cuda_unary!(super::CosKernelOp, f64, PTX, "cos_fwd_f64", "cos_bwd_f64"); diff --git a/src/tensor_ops/cos/mod.rs b/src/tensor_ops/cos/mod.rs index b9d13d90d..1a6ea4a53 100644 --- a/src/tensor_ops/cos/mod.rs +++ b/src/tensor_ops/cos/mod.rs @@ -45,7 +45,9 @@ mod tests { #[test] fn test_cos() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + 
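// (cos.cu above shows the other half of the intrinsic cleanup in this PR:
// the f32/f64 kernels drop `cosf`/`cos` in favor of the generic `cosg`/`sing`
// wrappers shared with the new __half instantiation. Presumably the shared
// CUDA header overloads these per type, along the lines of this sketch
// (names taken from the diff, bodies illustrative):
//
//     __device__ __forceinline__ float  cosg(float x)  { return cosf(x); }
//     __device__ __forceinline__ double cosg(double x) { return cos(x); }
//     __device__ __forceinline__ __half cosg(__half x) { return hcos(x); }
// )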
.to_dtype::(); let r = x.leaky_trace().cos(); assert_close_to_literal!(r, [-0.41614684, 0.5403023, 1.0, 0.5403023, -0.41614684]); let g = r.mean().backward(); diff --git a/src/tensor_ops/div/binary_div.cu b/src/tensor_ops/div/binary_div.cu index c287ddc60..3945dba51 100644 --- a/src/tensor_ops/div/binary_div.cu +++ b/src/tensor_ops/div/binary_div.cu @@ -2,13 +2,18 @@ struct BinaryDivOp {}; +BINARY_OP(__half, bdiv_fwd_f16, bdiv_bwd_lhs_f16, bdiv_bwd_rhs_f16, BinaryDivOp, + x / y, + recipg(y), + -x / (y * y)) + BINARY_OP(float, bdiv_fwd_f32, bdiv_bwd_lhs_f32, bdiv_bwd_rhs_f32, BinaryDivOp, x / y, - 1.0 / y, + recipg(y), -x / (y * y)) BINARY_OP(double, bdiv_fwd_f64, bdiv_bwd_lhs_f64, bdiv_bwd_rhs_f64, BinaryDivOp, x / y, - 1.0 / y, + recipg(y), -x / (y * y)) diff --git a/src/tensor_ops/div/cuda_kernel.rs b/src/tensor_ops/div/cuda_kernel.rs index 5653b6942..48407719c 100644 --- a/src/tensor_ops/div/cuda_kernel.rs +++ b/src/tensor_ops/div/cuda_kernel.rs @@ -1,6 +1,8 @@ use super::{BinaryDivKernelOp as Binary, ScalarDivKernelOp as Scalar}; use crate::tensor_ops::cuda_kernels::{cuda_binary, cuda_unary}; +#[cfg(feature = "f16")] +unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Binary {} @@ -8,8 +10,19 @@ unsafe impl cudarc::driver::DeviceRepr for Binary {} const SCALAR_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/scalar_div.ptx")); const BINARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/binary_div.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(const_df() Scalar, half::f16, SCALAR_PTX, "sdiv_fwd_f16", "sdiv_bwd_f16"); cuda_unary!(const_df() Scalar, f32, SCALAR_PTX, "sdiv_fwd_f32", "sdiv_bwd_f32"); cuda_unary!(const_df() Scalar, f64, SCALAR_PTX, "sdiv_fwd_f64", "sdiv_bwd_f64"); +#[cfg(feature = "f16")] +cuda_binary!( + Binary, + half::f16, + BINARY_PTX, + "bdiv_fwd_f16", + "bdiv_bwd_lhs_f16", + "bdiv_bwd_rhs_f16" +); cuda_binary!( Binary, f32, diff --git a/src/tensor_ops/div/mod.rs b/src/tensor_ops/div/mod.rs index f7abb2ead..53baa75dd 100644 --- a/src/tensor_ops/div/mod.rs +++ b/src/tensor_ops/div/mod.rs @@ -72,6 +72,17 @@ impl, E>, T: Tape> } } +#[cfg(feature = "f16")] +impl, half::f16>, T: Tape> + TryDiv for Tensor +{ + /// See [div] + fn try_div(self, rhs: f32) -> Result { + let scalar = half::f16::from_f32(rhs); + try_unary_op(ScalarDivKernelOp { scalar }, self) + } +} + impl, Rhs> std::ops::Div for Tensor where @@ -94,8 +105,8 @@ mod tests { fn test_div_0d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor(2.0); - let b: Tensor<_, TestDtype, _> = dev.tensor(4.0); + let a = dev.tensor(2.0).to_dtype::(); + let b = dev.tensor(4.0).to_dtype::(); let r = b.leaky_trace() / a.clone(); assert_close_to_literal!(r, 2.0); @@ -107,8 +118,8 @@ mod tests { #[test] fn test_div_1d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); - let b: Tensor<_, TestDtype, _> = dev.tensor([1.0, -1.0, 0.0]); + let a = dev.tensor([1.0, 2.0, 3.0]).to_dtype::(); + let b = dev.tensor([1.0, -1.0, 0.0]).to_dtype::(); let r = b.leaky_trace() / a.clone(); assert_close_to_literal!(r, [1.0, -0.5, 0.0]); @@ -120,10 +131,12 @@ mod tests { #[test] fn test_div_2d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = - dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]); - let b: Tensor<_, TestDtype, _> = - dev.tensor([[0.5199, 
0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]); + let a = dev + .tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]) + .to_dtype::(); + let b = dev + .tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]) + .to_dtype::(); let r = b.leaky_trace() / a.clone(); assert_close_to_literal!( @@ -137,15 +150,15 @@ mod tests { assert_close_to_literal!( g.get(&a), [ - [-0.20074181, -2.1961217, -2.7844446], - [-0.42998204, -0.12488105, -0.009292662], + [-0.20074183, -2.19612169, -2.78444433], + [-0.42998207, -0.12488105, -0.00929266] ] ); assert_close_to_literal!( g.get(&b), &[ - [0.25367835, 0.97580016, 1.1111112], - [0.29456818, 0.2377556, 0.1997922], + [0.25367835, 0.97580016, 1.11111104], + [0.29456815, 0.23775560, 0.19979222] ] ); } @@ -153,7 +166,7 @@ mod tests { #[test] fn test_scalar_div_0d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor(1.0); + let x = dev.tensor(1.0).to_dtype::(); let r = x.leaky_trace() / 2.0; assert_close_to_literal!(r, 0.5); let g = r.exp().backward(); @@ -163,7 +176,7 @@ mod tests { #[test] fn test_scalar_div_1d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([0.0, 1.0, 2.0]); + let x = dev.tensor([0.0, 1.0, 2.0]).to_dtype::(); let r = x.leaky_trace() / 2.0; assert_close_to_literal!(r, [0.0, 0.5, 1.0]); let g = r.exp().sum().backward(); @@ -173,7 +186,7 @@ mod tests { #[test] fn test_scalar_div_2d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([[1.0; 2]; 3]); + let x = dev.tensor([[1.0; 2]; 3]).to_dtype::(); let r = x.leaky_trace() / 2.0; assert_close_to_literal!(r, [[0.5; 2]; 3]); let g = r.exp().sum().backward(); diff --git a/src/tensor_ops/div/scalar_div.cu b/src/tensor_ops/div/scalar_div.cu index 0c4a6dca3..ff8eebfd2 100644 --- a/src/tensor_ops/div/scalar_div.cu +++ b/src/tensor_ops/div/scalar_div.cu @@ -5,10 +5,14 @@ struct ScalarDivKernelOp { T scalar; }; +UNARY_OP(__half, sdiv_fwd_f16, sdiv_bwd_f16, ScalarDivKernelOp<__half>, + x / op.scalar, + recipg(op.scalar)); + UNARY_OP(float, sdiv_fwd_f32, sdiv_bwd_f32, ScalarDivKernelOp, x / op.scalar, - 1.0 / op.scalar); + recipg(op.scalar)); UNARY_OP(double, sdiv_fwd_f64, sdiv_bwd_f64, ScalarDivKernelOp, x / op.scalar, - 1.0 / op.scalar); + recipg(op.scalar)); diff --git a/src/tensor_ops/dropout/cuda_kernel.rs b/src/tensor_ops/dropout/cuda_kernel.rs index 8e0fd0d9b..0b10bf12b 100644 --- a/src/tensor_ops/dropout/cuda_kernel.rs +++ b/src/tensor_ops/dropout/cuda_kernel.rs @@ -17,6 +17,12 @@ trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "dropout_f16"; + const FNS: &'static [&'static str] = &["dropout_fwd_f16", "dropout_bwd_f16"]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "dropout_f32"; const FNS: &'static [&'static str] = &["dropout_fwd_f32", "dropout_bwd_f32"]; diff --git a/src/tensor_ops/dropout/dropout.cu b/src/tensor_ops/dropout/dropout.cu index 638c28b97..a52617f44 100644 --- a/src/tensor_ops/dropout/dropout.cu +++ b/src/tensor_ops/dropout/dropout.cu @@ -1,3 +1,5 @@ +#include "cuda_fp16.h" + #define DROPOUT(TYPENAME, FWD, BWD) \ extern "C" __global__ void FWD( \ const TYPENAME prob, \ @@ -10,7 +12,9 @@ extern "C" __global__ void FWD( \ if (i >= numel) { \ return; \ } \ - auto scalar = (noise[i] < prob) ? 0.0 : (1.0 / (1.0 - prob)); \ + TYPENAME zero = 0.0; \ + TYPENAME one = 1.0; \ + TYPENAME scalar = (noise[i] < prob) ? 
diff --git a/src/tensor_ops/dropout/cuda_kernel.rs b/src/tensor_ops/dropout/cuda_kernel.rs
index 8e0fd0d9b..0b10bf12b 100644
--- a/src/tensor_ops/dropout/cuda_kernel.rs
+++ b/src/tensor_ops/dropout/cuda_kernel.rs
@@ -17,6 +17,12 @@ trait HasCudaKernel<E> {
     const FNS: &'static [&'static str];
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const MOD: &'static str = "dropout_f16";
+    const FNS: &'static [&'static str] = &["dropout_fwd_f16", "dropout_bwd_f16"];
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const MOD: &'static str = "dropout_f32";
     const FNS: &'static [&'static str] = &["dropout_fwd_f32", "dropout_bwd_f32"];
 }
diff --git a/src/tensor_ops/dropout/dropout.cu b/src/tensor_ops/dropout/dropout.cu
index 638c28b97..a52617f44 100644
--- a/src/tensor_ops/dropout/dropout.cu
+++ b/src/tensor_ops/dropout/dropout.cu
@@ -1,3 +1,5 @@
+#include "cuda_fp16.h"
+
 #define DROPOUT(TYPENAME, FWD, BWD) \
 extern "C" __global__ void FWD( \
     const TYPENAME prob, \
@@ -10,7 +12,9 @@ extern "C" __global__ void FWD( \
     if (i >= numel) { \
         return; \
     } \
-    auto scalar = (noise[i] < prob) ? 0.0 : (1.0 / (1.0 - prob)); \
+    TYPENAME zero = 0.0; \
+    TYPENAME one = 1.0; \
+    TYPENAME scalar = (noise[i] < prob) ? zero : (one / (one - prob)); \
     out[i] = inp[i] * scalar; \
 } \
 extern "C" __global__ void BWD( \
@@ -24,8 +28,11 @@ extern "C" __global__ void BWD( \
     if (i >= numel) { \
         return; \
     } \
-    grad_inp[i] += (noise[i] < prob) ? 0.0 : (grad_out[i] / (1.0 - prob)); \
+    TYPENAME zero = 0.0; \
+    TYPENAME one = 1.0; \
+    grad_inp[i] += (noise[i] < prob) ? zero : (grad_out[i] / (one - prob)); \
 }
 
+DROPOUT(__half, dropout_fwd_f16, dropout_bwd_f16);
 DROPOUT(float, dropout_fwd_f32, dropout_bwd_f32);
 DROPOUT(double, dropout_fwd_f64, dropout_bwd_f64);
diff --git a/src/tensor_ops/dropout/mod.rs b/src/tensor_ops/dropout/mod.rs
index 890ac3282..3a00406a2 100644
--- a/src/tensor_ops/dropout/mod.rs
+++ b/src/tensor_ops/dropout/mod.rs
@@ -51,20 +51,20 @@ pub trait DropoutKernel<E: Dtype>: DeviceStorage {
 /// random numbers, so the masking is the same for both.
 pub fn dropout<S: Shape, E: Dtype, D: DropoutKernel<E>, T: Tape<E, D>>(
     t: Tensor<S, E, D, T>,
-    prob: impl Into<E>,
+    prob: impl Into<f64>,
 ) -> Tensor<S, E, D, T> {
     t.dropout(prob)
 }
 
 impl<S: Shape, E: Dtype, D: DropoutKernel<E>, T: Tape<E, D>> Tensor<S, E, D, T> {
     /// See [dropout]
-    pub fn dropout(self, prob: impl Into<E>) -> Self {
+    pub fn dropout(self, prob: impl Into<f64>) -> Self {
         self.try_dropout(prob).unwrap()
     }
     /// See [dropout]
-    pub fn try_dropout(self, prob: impl Into<E>) -> Result<Self, D::Err> {
+    pub fn try_dropout(self, prob: impl Into<f64>) -> Result<Self, D::Err> {
         let seed = self.device.random_u64();
-        let prob = prob.into();
+        let prob = E::from_f64(prob.into()).unwrap();
         let op = DropoutKernelOp { seed, prob };
         let (inp, mut tape) = self.split_tape();
         let out = inp.device.forward(op, &inp)?;
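NOTE: the dropout change above is the recipe this PR applies to scalar hyperparameters generally: the public API takes impl Into<f64>, and the value is converted once into the tensor dtype E via num_traits. A rough sketch of just that conversion (names here are illustrative):

    use num_traits::FromPrimitive;

    /// Narrow an f64 hyperparameter (e.g. a dropout probability) into the
    /// tensor dtype E; the unwrap matches the PR's expectation that every
    /// supported dtype can represent ordinary probabilities.
    fn prob_to_dtype<E: FromPrimitive>(prob: impl Into<f64>) -> E {
        E::from_f64(prob.into()).unwrap()
    }

    fn main() {
        let p: half::f16 = prob_to_dtype(0.1);
        assert!((p.to_f64() - 0.1).abs() < 1e-3); // f16 keeps ~3 significant digits
    }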
"gelu_fwd_f16", "gelu_bwd_f16"); cuda_unary!(GeLUKernelOp, f32, PTX, "gelu_fwd_f32", "gelu_bwd_f32"); cuda_unary!(GeLUKernelOp, f64, PTX, "gelu_fwd_f64", "gelu_bwd_f64"); diff --git a/src/tensor_ops/gelu/gelu.cu b/src/tensor_ops/gelu/gelu.cu index 03843238b..bc3e220e9 100644 --- a/src/tensor_ops/gelu/gelu.cu +++ b/src/tensor_ops/gelu/gelu.cu @@ -1,5 +1,4 @@ #include "unary_op_macros.cuh" -#include "cuda_utils.cuh" #define _USE_MATH_DEFINES #include @@ -7,33 +6,44 @@ struct GeLUKernelOp {}; template __device__ T gelu_fwd(T x) { - constexpr T fastCoeff = 0.044715; + T fastCoeff = 0.044715; + T one = 1.0; + T half = 0.5; + T beta = M_2_SQRTPI * M_SQRT1_2; T x_sq = x * x; T x_cube = x_sq * x; T alpha = x + fastCoeff * x_cube; - return 0.5 * x * (1.0 + tanhg(M_2_SQRTPI * M_SQRT1_2 * alpha)); + return half * x * (one + tanhg(beta * alpha)); } template __device__ T gelu_bwd(T x) { - constexpr T kBeta = M_2_SQRTPI * M_SQRT2 * 0.5; - constexpr T fastCoeff = 0.044715; + T one = 1.0; + T three = 3.0; + T half = 0.5; + T fastCoeff = 0.044715; + T kBeta = M_2_SQRTPI * M_SQRT2 * 0.5; T x_sq = x * x; T x_cube = x_sq * x; T inner = kBeta * (x + fastCoeff * x_cube); T tanh_inner = tanhg(inner); - T left = 0.5 * x; - T right = 1.0 + tanh_inner; + T left = half * x; + T right = one + tanh_inner; - T left_derivative = 0.5 * right; + T left_derivative = half * right; - T tanh_derivative = 1.0 - tanh_inner * tanh_inner; - T inner_derivative = kBeta * (1.0 + 3.0 * fastCoeff * x_sq); + T tanh_derivative = one - tanh_inner * tanh_inner; + T inner_derivative = kBeta * (one + three * fastCoeff * x_sq); T right_derivative = left * tanh_derivative * inner_derivative; return left_derivative + right_derivative; } +UNARY_OP(__half, gelu_fwd_f16, gelu_bwd_f16, GeLUKernelOp, + gelu_fwd(x), + gelu_bwd(x) +) + UNARY_OP(float, gelu_fwd_f32, gelu_bwd_f32, GeLUKernelOp, gelu_fwd(x), gelu_bwd(x) diff --git a/src/tensor_ops/gelu/mod.rs b/src/tensor_ops/gelu/mod.rs index 98e208a44..0abd95e8b 100644 --- a/src/tensor_ops/gelu/mod.rs +++ b/src/tensor_ops/gelu/mod.rs @@ -43,7 +43,9 @@ mod tests { #[test] fn test_gelu() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = x.leaky_trace().gelu(); assert_close_to_literal!(r, [-0.04540229, -0.158808, 0.0, 0.841192, 1.9545977]); // NOTE: call .exp() to make sure we cover cases where .gelu() uses the result's gradient diff --git a/src/tensor_ops/huber_error/cuda_kernel.rs b/src/tensor_ops/huber_error/cuda_kernel.rs index 1b9dce952..6c936c2be 100644 --- a/src/tensor_ops/huber_error/cuda_kernel.rs +++ b/src/tensor_ops/huber_error/cuda_kernel.rs @@ -1,11 +1,22 @@ use super::HuberErrorKernelOp as HuberError; use crate::tensor_ops::cuda_kernels::cuda_binary; +#[cfg(feature = "f16")] +unsafe impl cudarc::driver::DeviceRepr for HuberError {} unsafe impl cudarc::driver::DeviceRepr for HuberError {} unsafe impl cudarc::driver::DeviceRepr for HuberError {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/huber_error.ptx")); +#[cfg(feature = "f16")] +cuda_binary!( + HuberError, + half::f16, + PTX, + "huber_fwd_f16", + "huber_bwd_lhs_f16", + "huber_bwd_rhs_f16" +); cuda_binary!( HuberError, f32, diff --git a/src/tensor_ops/huber_error/huber_error.cu b/src/tensor_ops/huber_error/huber_error.cu index 39c90b55a..124b56ae0 100644 --- a/src/tensor_ops/huber_error/huber_error.cu +++ b/src/tensor_ops/huber_error/huber_error.cu @@ -7,19 +7,21 @@ struct 
diff --git a/src/tensor_ops/gelu/mod.rs b/src/tensor_ops/gelu/mod.rs
index 98e208a44..0abd95e8b 100644
--- a/src/tensor_ops/gelu/mod.rs
+++ b/src/tensor_ops/gelu/mod.rs
@@ -43,7 +43,9 @@ mod tests {
     #[test]
     fn test_gelu() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let x = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = x.leaky_trace().gelu();
         assert_close_to_literal!(r, [-0.04540229, -0.158808, 0.0, 0.841192, 1.9545977]);
         // NOTE: call .exp() to make sure we cover cases where .gelu() uses the result's gradient
diff --git a/src/tensor_ops/huber_error/cuda_kernel.rs b/src/tensor_ops/huber_error/cuda_kernel.rs
index 1b9dce952..6c936c2be 100644
--- a/src/tensor_ops/huber_error/cuda_kernel.rs
+++ b/src/tensor_ops/huber_error/cuda_kernel.rs
@@ -1,11 +1,22 @@
 use super::HuberErrorKernelOp as HuberError;
 use crate::tensor_ops::cuda_kernels::cuda_binary;
 
+#[cfg(feature = "f16")]
+unsafe impl cudarc::driver::DeviceRepr for HuberError<half::f16> {}
 unsafe impl cudarc::driver::DeviceRepr for HuberError<f32> {}
 unsafe impl cudarc::driver::DeviceRepr for HuberError<f64> {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/huber_error.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_binary!(
+    HuberError<half::f16>,
+    half::f16,
+    PTX,
+    "huber_fwd_f16",
+    "huber_bwd_lhs_f16",
+    "huber_bwd_rhs_f16"
+);
 cuda_binary!(
     HuberError<f32>,
     f32,
diff --git a/src/tensor_ops/huber_error/huber_error.cu b/src/tensor_ops/huber_error/huber_error.cu
index 39c90b55a..124b56ae0 100644
--- a/src/tensor_ops/huber_error/huber_error.cu
+++ b/src/tensor_ops/huber_error/huber_error.cu
@@ -7,19 +7,21 @@ struct HuberErrorOp {
 
 template<typename T>
 __device__ T op_f(HuberErrorOp<T> op, T x, T y) {
-    auto a = x - y;
+    T a = x - y;
+    T half = 0.5;
     if (absg(a) < op.delta) {
-        return a * a * 0.5;
+        return a * a * half;
     } else {
-        return op.delta * (absg(a) - 0.5 * op.delta);
+        return op.delta * (absg(a) - half * op.delta);
     }
 }
 
 template<typename T>
 __device__ T op_dfdx(HuberErrorOp<T> op, T x, T y) {
-    auto a = x - y;
-    if (a == 0.0) {
-        return 0.0;
+    T a = x - y;
+    T zero = 0.0;
+    if (a == zero) {
+        return zero;
     } else if (absg(a) < op.delta) {
         return a;
     } else {
@@ -32,6 +34,12 @@ __device__ T op_dfdy(HuberErrorOp<T> op, T x, T y) {
     return -op_dfdx(op, x, y);
 }
 
+BINARY_OP(__half, huber_fwd_f16, huber_bwd_lhs_f16, huber_bwd_rhs_f16, HuberErrorOp<__half>,
+        op_f(op, x, y),
+        op_dfdx(op, x, y),
+        op_dfdy(op, x, y)
+)
+
 BINARY_OP(float, huber_fwd_f32, huber_bwd_lhs_f32, huber_bwd_rhs_f32, HuberErrorOp<float>,
         op_f(op, x, y),
         op_dfdx(op, x, y),
diff --git a/src/tensor_ops/huber_error/mod.rs b/src/tensor_ops/huber_error/mod.rs
index c45a60f69..335c28e29 100644
--- a/src/tensor_ops/huber_error/mod.rs
+++ b/src/tensor_ops/huber_error/mod.rs
@@ -31,14 +31,14 @@ pub struct HuberErrorKernelOp<E> {
 pub fn huber_error<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D> + Merge<R>, R: Tape<E, D>>(
     lhs: Tensor<S, E, D, T>,
     rhs: Tensor<S, E, D, R>,
-    delta: impl Into<E>,
+    delta: impl Into<f64>,
 ) -> Tensor<S, E, D, T> {
     lhs.huber_error(rhs, delta)
 }
 
 impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Tensor<S, E, D, T> {
     /// See [huber_error]
-    pub fn huber_error<R: Tape<E, D>>(self, rhs: Tensor<S, E, D, R>, delta: impl Into<E>) -> Self
+    pub fn huber_error<R: Tape<E, D>>(self, rhs: Tensor<S, E, D, R>, delta: impl Into<f64>) -> Self
     where
         T: Merge<R>,
     {
@@ -49,12 +49,12 @@
     pub fn try_huber_error<R: Tape<E, D>>(
         self,
         rhs: Tensor<S, E, D, R>,
-        delta: impl Into<E>,
+        delta: impl Into<f64>,
     ) -> Result<Self, D::Err>
     where
         T: Merge<R>,
     {
-        let delta = delta.into();
+        let delta = E::from_f64(delta.into()).unwrap();
         try_binary_op(HuberErrorKernelOp { delta }, self, rhs)
     }
 }
@@ -66,14 +66,18 @@ mod tests {
     #[test]
     fn test_huber_error() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([
-            [-0.8424031, 0.6309481, 1.0416432],
-            [1.325225, 0.5840275, 1.9167633],
-        ]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.52022195, 0.578804, 0.17535722],
-            [0.75429636, 0.66566986, 0.6182751],
-        ]);
+        let a = dev
+            .tensor([
+                [-0.8424031, 0.6309481, 1.0416432],
+                [1.325225, 0.5840275, 1.9167633],
+            ])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([
+                [0.52022195, 0.578804, 0.17535722],
+                [0.75429636, 0.66566986, 0.6182751],
+            ])
+            .to_dtype::<TestDtype>();
         let r1 = a.leaky_trace().huber_error(b.leaky_trace(), 1.0);
         let r2 = a.leaky_trace().huber_error(b.leaky_trace(), 100.0);
         assert_close_to_literal!(
diff --git a/src/tensor_ops/ln/cuda_kernel.rs b/src/tensor_ops/ln/cuda_kernel.rs
index 535bbef3f..33a15186b 100644
--- a/src/tensor_ops/ln/cuda_kernel.rs
+++ b/src/tensor_ops/ln/cuda_kernel.rs
@@ -4,5 +4,13 @@ unsafe impl cudarc::driver::DeviceRepr for super::LnKernelOp {}
 
 const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/ln.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(
+    super::LnKernelOp,
+    half::f16,
+    PTX_SRC,
+    "ln_fwd_f16",
+    "ln_bwd_f16"
+);
 cuda_unary!(super::LnKernelOp, f32, PTX_SRC, "ln_fwd_f32", "ln_bwd_f32");
 cuda_unary!(super::LnKernelOp, f64, PTX_SRC, "ln_fwd_f64", "ln_bwd_f64");
diff --git a/src/tensor_ops/ln/ln.cu b/src/tensor_ops/ln/ln.cu
index 47e1ae910..8f6dcf278 100644
--- a/src/tensor_ops/ln/ln.cu
+++ b/src/tensor_ops/ln/ln.cu
@@ -2,11 +2,15 @@
 struct LnKernelOp {};
 
+UNARY_OP(__half, ln_fwd_f16, ln_bwd_f16, LnKernelOp,
+        logg(x),
+        recipg(x))
+
 UNARY_OP(float, ln_fwd_f32, ln_bwd_f32, LnKernelOp,
-        logf(x),
-        1.0 / x)
+        logg(x),
+        recipg(x))
 
 UNARY_OP(double, ln_fwd_f64, ln_bwd_f64, LnKernelOp,
-        log(x),
-        1.0 / x)
+        logg(x),
+        recipg(x))
\ No newline at end of file
diff --git a/src/tensor_ops/ln/mod.rs b/src/tensor_ops/ln/mod.rs
index 6dcaedfe6..93f8d9bd4 100644
--- a/src/tensor_ops/ln/mod.rs
+++ b/src/tensor_ops/ln/mod.rs
@@ -45,12 +45,17 @@ mod tests {
     #[test]
     fn test_ln() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let x = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = x.leaky_trace().ln();
         let r_array = r.array();
         assert!(r_array[0].is_nan());
         assert!(r_array[1].is_nan());
-        assert!(r_array[2..] == [TestDtype::NEG_INFINITY, 0.0, TestDtype::ln(2.0)]);
+        assert!(r_array[2].is_infinite() && r_array[2].is_sign_negative());
+        assert_eq!(r_array[3], TestDtype::default());
+        let t: TestDtype = NumCast::from(2.0f64.ln()).unwrap();
+        assert_eq!(r_array[4], t);
         let g = r.mean().backward();
         assert_close_to_literal!(g.get(&x), [-0.1, -0.2, f64::INFINITY, 0.2, 0.1]);
     }
diff --git a/src/tensor_ops/log_softmax.rs b/src/tensor_ops/log_softmax.rs
index 59557c654..fda0c9d73 100644
--- a/src/tensor_ops/log_softmax.rs
+++ b/src/tensor_ops/log_softmax.rs
@@ -103,7 +103,9 @@ mod tests {
     #[test]
     fn test_log_softmax_1d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let a = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().log_softmax();
         assert_close_to_literal!(
             r,
@@ -125,7 +127,9 @@
     #[test]
     fn test_log_softmax_2d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]);
+        let a = dev
+            .tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().log_softmax::<Axis<1>>();
         assert_close_to_literal!(
             r,
diff --git a/src/tensor_ops/logsumexp_to.rs b/src/tensor_ops/logsumexp_to.rs
index a5e54ab3c..a188070c8 100644
--- a/src/tensor_ops/logsumexp_to.rs
+++ b/src/tensor_ops/logsumexp_to.rs
@@ -73,7 +73,9 @@ mod tests {
     #[test]
     fn test_logsumexp_1d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let a = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().logsumexp();
         assert_close_to_literal!(r, 2.4519143);
         let g = r.backward();
@@ -86,7 +88,9 @@
     #[test]
     fn test_logsumexp_2d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]);
+        let a = dev
+            .tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().logsumexp::<Rank1<2>, _>();
         assert_close_to_literal!(r, [0.40760595, 7.0509458]);
         let g = r.mean().backward();
diff --git a/src/tensor_ops/matmul/cpu_kernel.rs b/src/tensor_ops/matmul/cpu_kernel.rs
index 4a08face8..d2faa19de 100644
--- a/src/tensor_ops/matmul/cpu_kernel.rs
+++ b/src/tensor_ops/matmul/cpu_kernel.rs
@@ -17,12 +17,8 @@ use cblas_sys::{
 ))]
 use matrixmultiply::{dgemm, sgemm};
 
-#[cfg(not(any(
-    feature = "cpu-seq-matmul",
-    feature = "cpu-par-matmul",
-    feature = "cpu-mkl-matmul"
-)))]
-fn gemm<F: num_traits::Float, M: Dim, K: Dim, N: Dim>(
+#[allow(unused)]
+fn naive_gemm<F: num_traits::Float, M: Dim, K: Dim, N: Dim>(
     (m, k, n): (M, K, N),
     ap: *const F,
     a_strides: [usize; 2],
@@ -57,6 +53,22 @@ pub(crate) trait MatMulImpl<E> {
     );
 }
 
+#[cfg(feature = "f16")]
+impl MatMulImpl<half::f16> for Cpu {
+    #[inline]
+    fn matmul<M: Dim, K: Dim, N: Dim>(
+        (m, k, n): (M, K, N),
+        ap: *const half::f16,
+        a_strides: [usize; 2],
+        bp: *const half::f16,
+        b_strides: [usize; 2],
+        cp: *mut half::f16,
+        c_strides: [usize; 2],
+    ) {
+        naive_gemm((m, k, n), ap, a_strides, bp, b_strides, cp, c_strides);
+    }
+}
+
 impl MatMulImpl<f32> for Cpu {
     #[inline]
     fn matmul<M: Dim, K: Dim, N: Dim>(
@@ -103,7 +115,7 @@
             feature = "cpu-par-matmul",
             feature = "cpu-mkl-matmul"
         )))]
-        gemm((m, k, n), ap, a_strides, bp, b_strides, cp, c_strides);
+        naive_gemm((m, k, n), ap, a_strides, bp, b_strides, cp, c_strides);
     }
 }
 
@@ -156,7 +168,7 @@ impl MatMulImpl<f64> for Cpu {
             feature = "cpu-par-matmul",
             feature = "cpu-mkl-matmul"
         )))]
-        gemm((m, k, n), ap, a_strides, bp, b_strides, cp, c_strides);
+        naive_gemm((m, k, n), ap, a_strides, bp, b_strides, cp, c_strides);
     }
 }
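NOTE: there is no matrixmultiply/cblas backend for f16, so the f16 MatMulImpl above always routes through naive_gemm. Its body is outside this hunk; presumably it is a strided triple loop along these lines (a safe-Rust sketch over slices rather than dfdx's raw pointers and Dim types):

    /// c[m][n] += sum over k of a[m][k] * b[k][n], with explicit strides.
    fn naive_gemm_sketch(
        (m, k, n): (usize, usize, usize),
        a: &[f32],
        a_strides: [usize; 2],
        b: &[f32],
        b_strides: [usize; 2],
        c: &mut [f32],
        c_strides: [usize; 2],
    ) {
        for i_m in 0..m {
            for i_k in 0..k {
                let a_v = a[i_m * a_strides[0] + i_k * a_strides[1]];
                for i_n in 0..n {
                    let b_v = b[i_k * b_strides[0] + i_n * b_strides[1]];
                    c[i_m * c_strides[0] + i_n * c_strides[1]] += a_v * b_v;
                }
            }
        }
    }

    fn main() {
        // (1x2) * (2x1), row-major strides.
        let (a, b, mut c) = ([1.0, 2.0], [3.0, 4.0], [0.0]);
        naive_gemm_sketch((1, 2, 1), &a, [2, 1], &b, [1, 1], &mut c, [1, 1]);
        assert_eq!(c, [11.0]);
    }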
diff --git a/src/tensor_ops/matmul/mod.rs b/src/tensor_ops/matmul/mod.rs
index a59f3a0d3..954857b4f 100644
--- a/src/tensor_ops/matmul/mod.rs
+++ b/src/tensor_ops/matmul/mod.rs
@@ -365,14 +365,17 @@
     fn test_matmul_normal() {
         let dev: TestDevice = Default::default();
 
-        let a: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.5086, 0.5234, 0.2684],
-            [0.8075, 0.8437, 0.9951],
-            [0.0774, 0.7539, 0.8894],
-            [0.8119, 0.2693, 0.7249],
-        ]);
-        let b: Tensor<_, TestDtype, _> =
-            dev.tensor([[0.4651, 0.9106], [0.3360, 0.5534], [0.8092, 0.3827]]);
+        let a = dev
+            .tensor([
+                [0.5086, 0.5234, 0.2684],
+                [0.8075, 0.8437, 0.9951],
+                [0.0774, 0.7539, 0.8894],
+                [0.8119, 0.2693, 0.7249],
+            ])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.4651, 0.9106], [0.3360, 0.5534], [0.8092, 0.3827]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().matmul(b.clone());
         assert_close_to_literal!(
             r,
@@ -435,7 +438,7 @@
         }
         let gs = r.sum().backward();
         let a_grad = gs.get(&a).array();
-        let mut sub_bs_summed = [[0.0; 2]; 3];
+        let mut sub_bs_summed = [[Default::default(); 2]; 3];
         for i in 0..N {
             let sub_a = dev.tensor(a_array[i]);
             let sub_gs = sub_a.leaky_trace().matmul(b.clone()).sum().backward();
@@ -524,9 +527,10 @@
     fn test_matmul_vec_normal() {
         let dev: TestDevice = Default::default();
 
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.7296, 0.3974, 0.9487]);
-        let b: Tensor<_, TestDtype, _> =
-            dev.tensor([[0.7804, 0.5540], [0.5378, 0.8401], [0.5042, 0.8604]]);
+        let a = dev.tensor([0.7296, 0.3974, 0.9487]).to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.7804, 0.5540], [0.5378, 0.8401], [0.5042, 0.8604]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().matmul(b.clone());
         assert_close_to_literal!(r, [1.261436, 1.5543157]);
         let g = r.exp().mean().backward();
@@ -544,9 +548,10 @@
     #[test]
     fn test_matmul_vec_transpose() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.7296, 0.3974, 0.9487]);
-        let b: Tensor<_, TestDtype, _> =
-            dev.tensor([[0.7804, 0.5378, 0.5042], [0.5540, 0.8401, 0.8604]]);
+        let a = dev.tensor([0.7296, 0.3974, 0.9487]).to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.7804, 0.5378, 0.5042], [0.5540, 0.8401, 0.8604]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().matmul(b.leaky_trace().permute());
         assert_close_to_literal!(r, [1.261436, 1.5543157]);
         let g = r.exp().mean().backward();
@@ -563,9 +568,12 @@
     #[test]
     fn test_vecvec() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> =
-            dev.tensor([-1.5333828, 0.6136148, -0.77502704, -1.0014728, -2.0131118]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([0.43068963, -0.9757187, -0.50650096]);
+        let a = dev
+            .tensor([-1.5333828, 0.6136148, -0.77502704, -1.0014728, -2.0131118])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([0.43068963, -0.9757187, -0.50650096])
+            .to_dtype::<TestDtype>();
         let c = a.leaky_trace().matmul(b.clone());
         let c_t = b.leaky_trace().matmul(a.clone()).permute();
         assert_eq!(c.array(), c_t.array());
@@ -592,8 +600,8 @@
     #[test]
     fn test_small_matmul_vv() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.5]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([2.0]);
+        let a = dev.tensor([0.5]).to_dtype::<TestDtype>();
+        let b = dev.tensor([2.0]).to_dtype::<TestDtype>();
         let c = a.leaky_trace().matmul(b.clone());
         assert_close_to_literal!(c, [[1.0]]);
         let g = c.exp().sum().backward();
@@ -606,8 +614,8 @@
         let dev: TestDevice = Default::default();
 
         // 1 * 1x1
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.5]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0]]);
+        let a = dev.tensor([0.5]).to_dtype::<TestDtype>();
+        let b = dev.tensor([[2.0]]).to_dtype::<TestDtype>();
         let c = a.leaky_trace().matmul(b.clone());
         assert_close_to_literal!(c, [1.0]);
         let g = c.exp().sum().backward();
@@ -622,8 +630,8 @@
         assert_close_to_literal!(g.get(&b), [[1.3591409]]);
 
         // 1 * 1x2
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.5]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0, 4.0]]);
+        let a = dev.tensor([0.5]).to_dtype::<TestDtype>();
+        let b = dev.tensor([[2.0, 4.0]]).to_dtype::<TestDtype>();
         let c = a.leaky_trace().matmul(b.clone());
         let e: [f64; 2] = [1.0, 2.0];
         assert_close_to_literal!(c, e);
@@ -632,8 +640,8 @@
         assert_close_to_literal!(g.get(&b), [[1.3591409, 3.694528]]);
 
         // 1 * 1x2 (permuted)
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.5]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0], [4.0]]);
+        let a = dev.tensor([0.5]).to_dtype::<TestDtype>();
+        let b = dev.tensor([[2.0], [4.0]]).to_dtype::<TestDtype>();
         let c = a.leaky_trace().matmul(b.leaky_trace().permute());
         assert_close_to_literal!(c, e);
         let g = c.exp().sum().backward();
@@ -647,8 +655,8 @@
 
         {
             // 1x1 * 1x1
-            let a: Tensor<_, TestDtype, _> = dev.tensor([[0.5]]);
-            let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0]]);
+            let a = dev.tensor([[0.5]]).to_dtype::<TestDtype>();
+            let b = dev.tensor([[2.0]]).to_dtype::<TestDtype>();
             let c = a.leaky_trace().matmul(b.clone());
             assert_close_to_literal!(c, [[1.0]]);
             let g = c.exp().sum().backward();
@@ -658,8 +666,8 @@
 
         {
             // 1x2 * 2x1
-            let a: Tensor<_, TestDtype, _> = dev.tensor([[0.5, 0.1]]);
-            let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0], [4.0]]);
+            let a = dev.tensor([[0.5, 0.1]]).to_dtype::<TestDtype>();
+            let b = dev.tensor([[2.0], [4.0]]).to_dtype::<TestDtype>();
             let c = a.leaky_trace().matmul(b.clone());
             assert_close_to_literal!(c, [[1.4]]);
             let g = c.exp().sum().backward();
@@ -669,8 +677,8 @@
 
         {
             // 1x2 (permuted) * 2x1
-            let a: Tensor<_, TestDtype, _> = dev.tensor([[0.5], [0.1]]);
-            let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0], [4.0]]);
+            let a = dev.tensor([[0.5], [0.1]]).to_dtype::<TestDtype>();
+            let b = dev.tensor([[2.0], [4.0]]).to_dtype::<TestDtype>();
             let c = a.leaky_trace().permute().matmul(b.clone());
             assert_close_to_literal!(c, [[1.4]]);
             let g = c.exp().sum().backward();
@@ -680,8 +688,8 @@
 
         {
             // 1x2 * 2x1 (permuted)
-            let a: Tensor<_, TestDtype, _> = dev.tensor([[0.5, 0.1]]);
-            let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0, 4.0]]);
+            let a = dev.tensor([[0.5, 0.1]]).to_dtype::<TestDtype>();
+            let b = dev.tensor([[2.0, 4.0]]).to_dtype::<TestDtype>();
             let c = a.leaky_trace().matmul(b.leaky_trace().permute());
             assert_close_to_literal!(c, [[1.4]]);
             let g = c.exp().sum().backward();
diff --git a/src/tensor_ops/max_to/cuda_kernel.rs b/src/tensor_ops/max_to/cuda_kernel.rs
index 266f62a44..f58aa432b 100644
--- a/src/tensor_ops/max_to/cuda_kernel.rs
+++ b/src/tensor_ops/max_to/cuda_kernel.rs
@@ -16,6 +16,13 @@ trait HasCudaKernel<E> {
     const FNS: &'static [&'static str];
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const INIT: half::f16 = half::f16::NEG_INFINITY;
+    const MOD: &'static str = "max_f16";
+    const FNS: &'static [&'static str] = &["max_to_fwd_f16", "max_to_bwd_f16", "fill_with_f16"];
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const INIT: f32 = f32::NEG_INFINITY;
     const MOD: &'static str = "max_f32";
diff --git a/src/tensor_ops/max_to/max_to.cu b/src/tensor_ops/max_to/max_to.cu
index 846245422..99c173356 100644
--- a/src/tensor_ops/max_to/max_to.cu
+++ b/src/tensor_ops/max_to/max_to.cu
@@ -1,23 +1,5 @@
 #include "cuda_utils.cuh"
 
-// atomicMax is not implemented for floats,
-// solution copied https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
-__device__ __forceinline__ float atomicMaxf(float * addr, float value) {
-    if (signbit(value)) {
-        return __uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value)));
-    } else {
-        return __int_as_float(atomicMax((int *)addr, __float_as_int(value)));
-    }
-}
-
-__device__ __forceinline__ double atomicMaxf(double * addr, double value) {
-    if (signbit(value)) {
-        return __longlong_as_double(atomicMin((unsigned long long int *)addr, __double_as_longlong(value)));
-    } else {
-        return __longlong_as_double(atomicMax((long long int *)addr, __double_as_longlong(value)));
-    }
-}
-
 // Efficiently computes the max of each chunk in "data" of size chunk_len, and
 // stores the maximums in out[i / chunk_len]
 template<typename T>
@@ -140,5 +122,6 @@ extern "C" __global__ void BWD( \
     max_to_bwd(numel, num_dims, elems_per_thread, info, inp, grad_inp, out, grad_out); \
 }
 
+MAX(__half, max_to_fwd_f16, max_to_bwd_f16);
 MAX(float, max_to_fwd_f32, max_to_bwd_f32);
 MAX(double, max_to_fwd_f64, max_to_bwd_f64);
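NOTE: max_to (and min_to below) select their kernels through a per-dtype HasCudaKernel impl whose INIT constant is the reduction identity: negative infinity for max, positive infinity for min, so every real value "wins" against the initial fill. A stripped-down sketch of that trait shape (illustrative, not dfdx's exact definition):

    use half::f16;

    /// Per-dtype metadata for a GPU reduction, sketched.
    trait HasCudaKernel<E> {
        /// Identity element the output is filled with before reducing.
        const INIT: E;
        /// Names of the compiled kernel functions to load.
        const FNS: &'static [&'static str];
    }

    struct Cuda;

    impl HasCudaKernel<f16> for Cuda {
        const INIT: f16 = f16::NEG_INFINITY;
        const FNS: &'static [&'static str] = &["max_to_fwd_f16", "max_to_bwd_f16"];
    }

    fn main() {
        // -inf loses to even the smallest representable f16.
        assert!(<Cuda as HasCudaKernel<f16>>::INIT < f16::MIN);
    }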
diff --git a/src/tensor_ops/max_to/mod.rs b/src/tensor_ops/max_to/mod.rs
index f9537384e..934575812 100644
--- a/src/tensor_ops/max_to/mod.rs
+++ b/src/tensor_ops/max_to/mod.rs
@@ -90,7 +90,9 @@ mod tests {
     #[test]
     fn test_max_axis_0_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 2.0], [3.0, -2.0, 2.0]]);
+        let t = dev
+            .tensor([[1.0, 2.0, 2.0], [3.0, -2.0, 2.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().max::<_, Axis<0>>();
         assert_close_to_literal!(r, [3.0, 2.0, 2.0]);
         let g = r.exp().mean().backward();
@@ -103,7 +105,9 @@
     #[test]
     fn test_max_axis_1_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 2.0], [3.0, -2.0, 2.0]]);
+        let t = dev
+            .tensor([[1.0, 2.0, 2.0], [3.0, -2.0, 2.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().max::<_, Axis<1>>();
         assert_close_to_literal!(r, [2.0, 3.0]);
         let g = r.sum().backward();
@@ -113,7 +117,7 @@
     #[test]
     fn test_max_axes_3d_to_1d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.sample_normal::<Rank3<2, 3, 4>>();
+        let t: Tensor<Rank3<2, 3, 4>, TestDtype, _> = dev.sample_normal();
         let r = t.leaky_trace().max::<Rank1<4>, _>();
         let r2 = t.leaky_trace().max::<_, Axis<0>>().max::<_, Axis<0>>();
         assert_close_to_tensor!(r, r2);
@@ -125,8 +129,9 @@
     #[test]
     fn test_max_negative_zero() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> =
-            dev.tensor([[-0.0, 0.0], [0.0, -0.0], [-1.0, -0.0], [-1.0, 0.0]]);
+        let t = dev
+            .tensor([[-0.0, 0.0], [0.0, -0.0], [-1.0, -0.0], [-1.0, 0.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().max::<_, Axis<1>>();
         assert_close_to_literal!(r, [0.0, 0.0, -0.0, 0.0]);
         let g = r.sum().backward();
diff --git a/src/tensor_ops/maximum/cuda_kernel.rs b/src/tensor_ops/maximum/cuda_kernel.rs
index 8afafe759..7e71e6b62 100644
--- a/src/tensor_ops/maximum/cuda_kernel.rs
+++ b/src/tensor_ops/maximum/cuda_kernel.rs
@@ -5,6 +5,15 @@ unsafe impl cudarc::driver::DeviceRepr for Max {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/maximum.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_binary!(
+    Max,
+    half::f16,
+    PTX,
+    "maximum_fwd_f16",
+    "maximum_bwd_lhs_f16",
+    "maximum_bwd_rhs_f16"
+);
 cuda_binary!(
     Max,
     f32,
diff --git a/src/tensor_ops/maximum/maximum.cu b/src/tensor_ops/maximum/maximum.cu
index 81735c71a..8066e78bf 100644
--- a/src/tensor_ops/maximum/maximum.cu
+++ b/src/tensor_ops/maximum/maximum.cu
@@ -17,6 +17,12 @@ __device__ T op_dfdy(T x, T y) {
     return (x > y) ? 0.0 : ((x < y) ? 1.0 : 0.5);
 }
 
+BINARY_OP(__half, maximum_fwd_f16, maximum_bwd_lhs_f16, maximum_bwd_rhs_f16, MaximumKernalOp,
+        op_f(x, y),
+        op_dfdx(x, y),
+        op_dfdy(x, y)
+)
+
 BINARY_OP(float, maximum_fwd_f32, maximum_bwd_lhs_f32, maximum_bwd_rhs_f32, MaximumKernalOp,
         op_f(x, y),
         op_dfdx(x, y),
diff --git a/src/tensor_ops/maximum/mod.rs b/src/tensor_ops/maximum/mod.rs
index fbe3bbf96..5b514ddc8 100644
--- a/src/tensor_ops/maximum/mod.rs
+++ b/src/tensor_ops/maximum/mod.rs
@@ -54,8 +54,12 @@ mod tests {
     #[test]
     fn test_maximum() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-1.0, 0.0, 1.0], [3.0, 4.0, -5.0]]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([[0.0, 0.0, -1.0], [3.0, -4.0, 5.0]]);
+        let a = dev
+            .tensor([[-1.0, 0.0, 1.0], [3.0, 4.0, -5.0]])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.0, 0.0, -1.0], [3.0, -4.0, 5.0]])
+            .to_dtype::<TestDtype>();
 
         let result = a.leaky_trace().maximum(b.clone());
         assert_close_to_literal!(result, [[0.0, 0.0, 1.0], [3.0, 4.0, 5.0]]);
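NOTE: in maximum.cu above (and minimum.cu below), op_dfdx/op_dfdy handle the x == y tie by splitting the subgradient evenly: each input gets 0.5 instead of the full 1.0. The same rule in plain Rust, for reference:

    /// d(max(x, y))/dx, with ties splitting the gradient evenly.
    fn dmax_dx(x: f32, y: f32) -> f32 {
        if x > y {
            1.0
        } else if x < y {
            0.0
        } else {
            0.5
        }
    }

    fn main() {
        assert_eq!(dmax_dx(2.0, 1.0), 1.0);
        assert_eq!(dmax_dx(1.0, 2.0), 0.0);
        assert_eq!(dmax_dx(1.0, 1.0), 0.5); // tie: each input gets half
    }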
diff --git a/src/tensor_ops/mean_to.rs b/src/tensor_ops/mean_to.rs
index 911ae4544..0012dd856 100644
--- a/src/tensor_ops/mean_to.rs
+++ b/src/tensor_ops/mean_to.rs
@@ -71,7 +71,7 @@ mod tests {
     #[test]
     fn test_mean_1d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]);
+        let t = dev.tensor([1.0, 2.0, 3.0]).to_dtype::<TestDtype>();
         let r = t.leaky_trace().mean();
         assert_close_to_literal!(r, 2.0);
         // NOTE: .exp() so we cover the case where .mean() has to use result grad.
@@ -82,7 +82,9 @@
     #[test]
     fn test_mean_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]);
+        let t = dev
+            .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().mean();
         assert_close_to_literal!(r, 3.5);
         let g = r.backward();
@@ -92,7 +94,7 @@
     #[test]
     fn test_mean_3d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.ones::<Rank3<2, 3, 4>>();
+        let t: Tensor<Rank3<2, 3, 4>, TestDtype, _> = dev.ones();
         let r = t.leaky_trace().mean();
         assert_close_to_literal!(r, 1.0);
         let g = r.backward();
@@ -102,7 +104,9 @@
     #[test]
     fn test_mean_axis_0_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]);
+        let t = dev
+            .tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().mean::<Rank1<3>, _>();
         assert_close_to_literal!(r, [-0.5, 3.0, -1.5]);
         let g = r.exp().mean().backward();
@@ -112,7 +116,9 @@
     #[test]
     fn test_mean_axis_1_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]);
+        let t = dev
+            .tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().mean::<Rank1<2>, _>();
         assert_close_to_literal!(r, [2.0, -4.0 / 3.0]);
         let g = r.exp().mean().backward();
diff --git a/src/tensor_ops/min_to/cuda_kernel.rs b/src/tensor_ops/min_to/cuda_kernel.rs
index 6efcb825e..fb7d1f07b 100644
--- a/src/tensor_ops/min_to/cuda_kernel.rs
+++ b/src/tensor_ops/min_to/cuda_kernel.rs
@@ -16,6 +16,13 @@ trait HasCudaKernel<E> {
     const FNS: &'static [&'static str];
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const INIT: half::f16 = half::f16::INFINITY;
+    const MOD: &'static str = "min_f16";
+    const FNS: &'static [&'static str] = &["min_to_fwd_f16", "min_to_bwd_f16", "fill_with_f16"];
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const INIT: f32 = f32::INFINITY;
     const MOD: &'static str = "min_f32";
diff --git a/src/tensor_ops/min_to/min_to.cu b/src/tensor_ops/min_to/min_to.cu
index ffd860729..8ba24c158 100644
--- a/src/tensor_ops/min_to/min_to.cu
+++ b/src/tensor_ops/min_to/min_to.cu
@@ -1,23 +1,5 @@
 #include "cuda_utils.cuh"
 
-// atomicMax is not implemented for floats,
-// solution copied https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
-__device__ __forceinline__ float atomicMinf(float * addr, float value) {
-    if (signbit(value)) {
-        return __uint_as_float(atomicMax((unsigned int *)addr, __float_as_uint(value)));
-    } else {
-        return __int_as_float(atomicMin((int *)addr, __float_as_int(value)));
-    }
-}
-
-__device__ __forceinline__ double atomicMinf(double * addr, double value) {
-    if (signbit(value)) {
-        return __longlong_as_double(atomicMax((unsigned long long int *)addr, __double_as_longlong(value)));
-    } else {
-        return __longlong_as_double(atomicMin((long long int *)addr, __double_as_longlong(value)));
-    }
-}
-
 // Efficiently computes the min of each chunk in "data" of size chunk_len, and
 // stores the minimums in out[i / chunk_len]
 template<typename T>
@@ -140,5 +122,6 @@ extern "C" __global__ void BWD( \
     min_to_bwd(numel, num_dims, elems_per_thread, info, inp, grad_inp, out, grad_out); \
 }
 
+MIN(__half, min_to_fwd_f16, min_to_bwd_f16);
 MIN(float, min_to_fwd_f32, min_to_bwd_f32);
 MIN(double, min_to_fwd_f64, min_to_bwd_f64);
diff --git a/src/tensor_ops/min_to/mod.rs b/src/tensor_ops/min_to/mod.rs
index 690c78de3..f392c4c8a 100644
--- a/src/tensor_ops/min_to/mod.rs
+++ b/src/tensor_ops/min_to/mod.rs
@@ -90,7 +90,9 @@ mod tests {
     #[test]
     fn test_min_axis_0_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 1.0, 2.0], [3.0, -2.0, 2.0]]);
+        let t = dev
+            .tensor([[1.0, 1.0, 2.0], [3.0, -2.0, 2.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().min::<Rank1<3>, _>();
         assert_close_to_literal!(r, [1.0, -2.0, 2.0]);
         let g = r.exp().mean().backward();
@@ -103,7 +105,9 @@
     #[test]
     fn test_min_axis_1_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 1.0, 2.0], [3.0, -2.0, 2.0]]);
+        let t = dev
+            .tensor([[1.0, 1.0, 2.0], [3.0, -2.0, 2.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().min::<Rank1<2>, _>();
         assert_close_to_literal!(r, [1.0, -2.0]);
         let g = r.sum().backward();
@@ -113,7 +117,7 @@
     #[test]
     fn test_min_axes_3d_to_1d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.sample_normal::<Rank3<2, 3, 4>>();
+        let t: Tensor<Rank3<2, 3, 4>, TestDtype, _> = dev.sample_normal();
         let r = t.leaky_trace().min::<Rank1<4>, _>();
         let r2 = t.leaky_trace().min::<Rank2<3, 4>, _>().min::<Rank1<4>, _>();
         assert_close_to_tensor!(r, r2);
@@ -125,8 +129,9 @@
     #[test]
     fn test_min_negative_zero() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> =
-            dev.tensor([[-0.0, 0.0], [0.0, -0.0], [-1.0, -0.0], [-1.0, 0.0]]);
+        let t = dev
+            .tensor([[-0.0, 0.0], [0.0, -0.0], [-1.0, -0.0], [-1.0, 0.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().min::<_, Axis<1>>();
         assert_close_to_literal!(r, [-0.0, -0.0, -1.0, -1.0]);
         let g = r.sum().backward();
diff --git a/src/tensor_ops/minimum/cuda_kernel.rs b/src/tensor_ops/minimum/cuda_kernel.rs
index 4368149f7..deb9a8f70 100644
--- a/src/tensor_ops/minimum/cuda_kernel.rs
+++ b/src/tensor_ops/minimum/cuda_kernel.rs
@@ -5,6 +5,15 @@ unsafe impl cudarc::driver::DeviceRepr for super::MinimumKernelOp {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/minimum.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_binary!(
+    Min,
+    half::f16,
+    PTX,
+    "minimum_fwd_f16",
+    "minimum_bwd_lhs_f16",
+    "minimum_bwd_rhs_f16"
+);
 cuda_binary!(
     Min,
     f32,
diff --git a/src/tensor_ops/minimum/minimum.cu b/src/tensor_ops/minimum/minimum.cu
index 058b999e6..8417bd4e0 100644
--- a/src/tensor_ops/minimum/minimum.cu
+++ b/src/tensor_ops/minimum/minimum.cu
@@ -17,6 +17,12 @@ __device__ T op_dfdy(T x, T y) {
     return (x < y) ? 0.0 : ((x > y) ? 1.0 : 0.5);
 }
 
+BINARY_OP(__half, minimum_fwd_f16, minimum_bwd_lhs_f16, minimum_bwd_rhs_f16, MinimumKernelOp,
+        op_f(x, y),
+        op_dfdx(x, y),
+        op_dfdy(x, y)
+)
+
 BINARY_OP(float, minimum_fwd_f32, minimum_bwd_lhs_f32, minimum_bwd_rhs_f32, MinimumKernelOp,
         op_f(x, y),
         op_dfdx(x, y),
diff --git a/src/tensor_ops/minimum/mod.rs b/src/tensor_ops/minimum/mod.rs
index 529d7b792..c0d766082 100644
--- a/src/tensor_ops/minimum/mod.rs
+++ b/src/tensor_ops/minimum/mod.rs
@@ -53,8 +53,12 @@ mod tests {
     #[test]
     fn test_minimum() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-1.0, 0.0, 1.0], [3.0, 4.0, -5.0]]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([[0.0, 0.0, -1.0], [3.0, -4.0, 5.0]]);
+        let a = dev
+            .tensor([[-1.0, 0.0, 1.0], [3.0, 4.0, -5.0]])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.0, 0.0, -1.0], [3.0, -4.0, 5.0]])
+            .to_dtype::<TestDtype>();
 
         let result = a.leaky_trace().minimum(b.clone());
         assert_close_to_literal!(result, [[-1., 0., -1.], [3., -4., -5.]]);
diff --git a/src/tensor_ops/mul/binary_mul.cu b/src/tensor_ops/mul/binary_mul.cu
index d14056e64..881722e20 100644
--- a/src/tensor_ops/mul/binary_mul.cu
+++ b/src/tensor_ops/mul/binary_mul.cu
@@ -2,6 +2,11 @@
 struct BinaryMulKernalOp {};
 
+BINARY_OP(__half, bmul_fwd_f16, bmul_bwd_lhs_f16, bmul_bwd_rhs_f16, BinaryMulKernalOp,
+        x * y,
+        y,
+        x)
+
 BINARY_OP(float, bmul_fwd_f32, bmul_bwd_lhs_f32, bmul_bwd_rhs_f32, BinaryMulKernalOp,
         x * y,
         y,
diff --git a/src/tensor_ops/mul/cuda_kernel.rs b/src/tensor_ops/mul/cuda_kernel.rs
index 9f008f828..9eca6a4fe 100644
--- a/src/tensor_ops/mul/cuda_kernel.rs
+++ b/src/tensor_ops/mul/cuda_kernel.rs
@@ -1,6 +1,8 @@
 use super::{BinaryMulKernelOp as Binary, ScalarMulKernelOp as Scalar};
 use crate::tensor_ops::cuda_kernels::{cuda_binary, cuda_unary};
 
+#[cfg(feature = "f16")]
+unsafe impl cudarc::driver::DeviceRepr for Scalar<half::f16> {}
 unsafe impl cudarc::driver::DeviceRepr for Scalar<f32> {}
 unsafe impl cudarc::driver::DeviceRepr for Scalar<f64> {}
 unsafe impl cudarc::driver::DeviceRepr for Binary {}
@@ -8,8 +10,19 @@ unsafe impl cudarc::driver::DeviceRepr for Binary {}
 const SCALAR_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/scalar_mul.ptx"));
 const BINARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/binary_mul.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(const_df() Scalar<half::f16>, half::f16, SCALAR_PTX, "smul_fwd_f16", "smul_bwd_f16");
 cuda_unary!(const_df() Scalar<f32>, f32, SCALAR_PTX, "smul_fwd_f32", "smul_bwd_f32");
 cuda_unary!(const_df() Scalar<f64>, f64, SCALAR_PTX, "smul_fwd_f64", "smul_bwd_f64");
+#[cfg(feature = "f16")]
+cuda_binary!(
+    Binary,
+    half::f16,
+    BINARY_PTX,
+    "bmul_fwd_f16",
+    "bmul_bwd_lhs_f16",
+    "bmul_bwd_rhs_f16"
+);
 cuda_binary!(
     Binary,
     f32,
diff --git a/src/tensor_ops/mul/mod.rs b/src/tensor_ops/mul/mod.rs
index ff45b2c47..909bbd460 100644
--- a/src/tensor_ops/mul/mod.rs
+++ b/src/tensor_ops/mul/mod.rs
@@ -68,6 +68,16 @@ impl<S: Shape, E: Dtype, D: UnaryKernel<ScalarMulKernelOp<E>, E>, T: Tape<E, D>>
     }
 }
 
+#[cfg(feature = "f16")]
+impl<S: Shape, D: UnaryKernel<ScalarMulKernelOp<half::f16>, half::f16>, T: Tape<half::f16, D>>
+    TryMul<f32> for Tensor<S, half::f16, D, T>
+{
+    fn try_mul(self, rhs: f32) -> Result<Self, Self::Err> {
+        let scalar = half::f16::from_f32(rhs);
+        try_unary_op(ScalarMulKernelOp { scalar }, self)
+    }
+}
+
 impl<S: Shape, E: Dtype, D, T: Tape<E, D>, Rhs> std::ops::Mul<Rhs> for Tensor<S, E, D, T>
 where
@@ -85,8 +95,8 @@ mod tests {
     #[test]
     fn test_mul_0d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor(2.0);
-        let b: Tensor<_, TestDtype, _> = dev.tensor(3.0);
+        let a = dev.tensor(2.0).to_dtype::<TestDtype>();
+        let b = dev.tensor(3.0).to_dtype::<TestDtype>();
 
         let r = a.leaky_trace() * b.clone();
         assert_close_to_literal!(r, 6.0);
@@ -98,8 +108,8 @@ mod tests {
     #[test]
     fn test_mul_1d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([1.0, -1.0, 0.0]);
+        let a = dev.tensor([1.0, 2.0, 3.0]).to_dtype::<TestDtype>();
+        let b = dev.tensor([1.0, -1.0, 0.0]).to_dtype::<TestDtype>();
 
         let r = a.leaky_trace() * b.clone();
         assert_close_to_literal!(r, [1.0, -2.0, 0.0]);
@@ -111,10 +121,12 @@
     #[test]
     fn test_mul_2d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> =
-            dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]);
-        let b: Tensor<_, TestDtype, _> =
-            dev.tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]);
+        let a = dev
+            .tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]])
+            .to_dtype::<TestDtype>();
 
         let r = a.leaky_trace() * b.clone();
         assert_close_to_literal!(
@@ -144,7 +156,7 @@
     #[test]
     fn test_scalar_mul_0d() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor(1.0);
+        let x = dev.tensor(1.0).to_dtype::<TestDtype>();
         let r = x.leaky_trace() * 0.5;
         assert_close_to_literal!(r, 0.5);
         let g = r.exp().backward();
@@ -154,7 +166,7 @@
     #[test]
     fn test_scalar_mul_1d() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([0.0, 1.0, 2.0]);
+        let x = dev.tensor([0.0, 1.0, 2.0]).to_dtype::<TestDtype>();
         let r = x.leaky_trace() * 0.5;
         assert_close_to_literal!(r, [0.0, 0.5, 1.0]);
         let g = r.exp().sum().backward();
@@ -164,7 +176,7 @@
     #[test]
     fn test_scalar_mul_2d() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([[1.0; 2]; 3]);
+        let x = dev.tensor([[1.0; 2]; 3]).to_dtype::<TestDtype>();
         let r = x.leaky_trace() * 0.5;
         assert_close_to_literal!(r, [[0.5; 2]; 3]);
         let g = r.exp().sum().backward();
diff --git a/src/tensor_ops/mul/scalar_mul.cu b/src/tensor_ops/mul/scalar_mul.cu
index 498decf7c..0062eb1cf 100644
--- a/src/tensor_ops/mul/scalar_mul.cu
+++ b/src/tensor_ops/mul/scalar_mul.cu
@@ -5,6 +5,10 @@ struct ScalarMulKernelOp {
     F scalar;
 };
 
+UNARY_OP(__half, smul_fwd_f16, smul_bwd_f16, ScalarMulKernelOp<__half>,
+        x * op.scalar,
+        op.scalar);
+
 UNARY_OP(float, smul_fwd_f32, smul_bwd_f32, ScalarMulKernelOp<float>,
         x * op.scalar,
         op.scalar);
diff --git a/src/tensor_ops/nans_to/cuda_kernel.rs b/src/tensor_ops/nans_to/cuda_kernel.rs
index 1fd8574bd..dae060434 100644
--- a/src/tensor_ops/nans_to/cuda_kernel.rs
+++ b/src/tensor_ops/nans_to/cuda_kernel.rs
@@ -1,10 +1,20 @@
 use super::NansToKernelOp as NansTo;
 use crate::tensor_ops::cuda_kernels::cuda_unary;
 
+#[cfg(feature = "f16")]
+unsafe impl cudarc::driver::DeviceRepr for NansTo<half::f16> {}
 unsafe impl cudarc::driver::DeviceRepr for NansTo<f32> {}
 unsafe impl cudarc::driver::DeviceRepr for NansTo<f64> {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/nans_to.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(
+    NansTo<half::f16>,
+    half::f16,
+    PTX,
+    "nans_to_fwd_f16",
+    "nans_to_bwd_f16"
+);
 cuda_unary!(NansTo<f32>, f32, PTX, "nans_to_fwd_f32", "nans_to_bwd_f32");
 cuda_unary!(NansTo<f64>, f64, PTX, "nans_to_fwd_f64", "nans_to_bwd_f64");
diff --git a/src/tensor_ops/nans_to/mod.rs b/src/tensor_ops/nans_to/mod.rs
index fa98308da..2909238df 100644
--- a/src/tensor_ops/nans_to/mod.rs
+++ b/src/tensor_ops/nans_to/mod.rs
@@ -24,19 +24,19 @@ pub struct NansToKernelOp<E>(E);
 /// ```
 pub fn nans_to<S: Shape, E: Dtype, D: UnaryKernel<NansToKernelOp<E>, E>, T: Tape<E, D>>(
     t: Tensor<S, E, D, T>,
-    value: impl Into<E>,
+    value: impl Into<f64>,
 ) -> Tensor<S, E, D, T> {
     t.nans_to(value)
 }
 
 impl<S: Shape, E: Dtype, D: UnaryKernel<NansToKernelOp<E>, E>, T: Tape<E, D>> Tensor<S, E, D, T> {
     /// See [nans_to]
-    pub fn nans_to(self, value: impl Into<E>) -> Self {
+    pub fn nans_to(self, value: impl Into<f64>) -> Self {
         self.try_nans_to(value).unwrap()
     }
     /// See [nans_to]
-    pub fn try_nans_to(self, value: impl Into<E>) -> Result<Self, D::Err> {
-        let value = value.into();
+    pub fn try_nans_to(self, value: impl Into<f64>) -> Result<Self, D::Err> {
+        let value = E::from_f64(value.into()).unwrap();
         try_unary_op(NansToKernelOp(value), self)
     }
 }
@@ -48,7 +48,9 @@ mod tests {
     #[test]
     fn test_nans_1d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([1.0, TestDtype::NAN, -TestDtype::NAN, 4.0]);
+        let t = dev
+            .tensor([1.0, f64::NAN, -f64::NAN, 4.0])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().nans_to(0.0);
         assert_close_to_literal!(r, [1.0, 0.0, 0.0, 4.0]);
         // NOTE: .exp() so we cover case where nans_to() needs to use result grad
diff --git a/src/tensor_ops/nans_to/nans_to.cu b/src/tensor_ops/nans_to/nans_to.cu
index 9d27d1f10..6842b6d13 100644
--- a/src/tensor_ops/nans_to/nans_to.cu
+++ b/src/tensor_ops/nans_to/nans_to.cu
@@ -5,11 +5,15 @@ struct NansToKernelOp {
     F x;
 };
 
+UNARY_OP(__half, nans_to_fwd_f16, nans_to_bwd_f16, NansToKernelOp<__half>,
+        isnang(x) ? op.x : x,
+        isnang(x) ? 0.0 : 1.0)
+
 UNARY_OP(float, nans_to_fwd_f32, nans_to_bwd_f32, NansToKernelOp<float>,
-        isnan(x) ? op.x : x,
-        isnan(x) ? 0.0 : 1.0)
+        isnang(x) ? op.x : x,
+        isnang(x) ? 0.0 : 1.0)
 
 UNARY_OP(double, nans_to_fwd_f64, nans_to_bwd_f64, NansToKernelOp<double>,
-        isnan(x) ? op.x : x,
-        isnan(x) ? 0.0 : 1.0)
+        isnang(x) ? op.x : x,
+        isnang(x) ? 0.0 : 1.0)
\ No newline at end of file
diff --git a/src/tensor_ops/negate/cuda_kernel.rs b/src/tensor_ops/negate/cuda_kernel.rs
index 752d841cc..a6065e555 100644
--- a/src/tensor_ops/negate/cuda_kernel.rs
+++ b/src/tensor_ops/negate/cuda_kernel.rs
@@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for NegateKernelOp {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/negate.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(const_df() NegateKernelOp, half::f16, PTX, "negate_fwd_f16", "negate_bwd_f16");
 cuda_unary!(const_df() NegateKernelOp, f32, PTX, "negate_fwd_f32", "negate_bwd_f32");
 cuda_unary!(const_df() NegateKernelOp, f64, PTX, "negate_fwd_f64", "negate_bwd_f64");
diff --git a/src/tensor_ops/negate/mod.rs b/src/tensor_ops/negate/mod.rs
index 39fe4529d..bc36bb95b 100644
--- a/src/tensor_ops/negate/mod.rs
+++ b/src/tensor_ops/negate/mod.rs
@@ -51,7 +51,7 @@ mod tests {
     #[test]
     fn test_1d_neg() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([-2.0, 0.0, 5.0]);
+        let a = dev.tensor([-2.0, 0.0, 5.0]).to_dtype::<TestDtype>();
         let r = -(a.leaky_trace());
         assert_close_to_literal!(r, [2.0, 0.0, -5.0]);
         // NOTE: .exp() so we can make sure neg is using result grad properly
diff --git a/src/tensor_ops/negate/negate.cu b/src/tensor_ops/negate/negate.cu
index 701e0403b..f522cf93b 100644
--- a/src/tensor_ops/negate/negate.cu
+++ b/src/tensor_ops/negate/negate.cu
@@ -2,6 +2,10 @@
 struct NegateKernelOp {};
 
+UNARY_OP(__half, negate_fwd_f16, negate_bwd_f16, NegateKernelOp,
+        -x,
+        -1.0)
+
 UNARY_OP(float, negate_fwd_f32, negate_bwd_f32, NegateKernelOp,
         -x,
         -1.0)
diff --git a/src/tensor_ops/normalize.rs b/src/tensor_ops/normalize.rs
index 2da3db0ff..3e929dcf5 100644
--- a/src/tensor_ops/normalize.rs
+++ b/src/tensor_ops/normalize.rs
@@ -17,14 +17,14 @@ use super::{BroadcastTo, Device, MeanTo, TryAdd, TryDiv, TrySub};
 /// ```
 pub fn normalize<Ax: Axes, S: Shape + ReduceShape<Ax>, E: Dtype, D: Device<E>, T: Tape<E, D>>(
     t: Tensor<S, E, D, T>,
-    epsilon: impl Into<E>,
+    epsilon: impl Into<f64>,
 ) -> Tensor<S, E, D, T> {
     t.normalize::<Ax>(epsilon)
 }
 
 impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Tensor<S, E, D, T> {
     /// See [normalize]
-    pub fn normalize<Ax: Axes>(self, epsilon: impl Into<E>) -> Self
+    pub fn normalize<Ax: Axes>(self, epsilon: impl Into<f64>) -> Self
     where
         S: ReduceShape<Ax>,
     {
@@ -34,7 +34,7 @@
     /// See [normalize]
     pub fn try_normalize<Ax: Axes>(
         self,
-        epsilon: impl Into<E>,
+        epsilon: impl Into<f64>,
     ) -> Result<Self, <D as HasErr>::Err>
     where
         S: ReduceShape<Ax>,
@@ -46,7 +46,7 @@
             .retaped::<T>()
             .try_square()?
             .try_mean::<_, Ax>()?
-            .try_add(epsilon.into())?
+            .try_add(E::from_f64(epsilon.into()).unwrap())?
             .try_sqrt()?;
         centered.try_div(std.try_broadcast_like(&shape)?)
     }
@@ -60,7 +60,7 @@ mod tests {
     #[test]
    fn test_1d_normalize_axis_last() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([-2.0, 0.0, 5.0]);
+        let a = dev.tensor([-2.0, 0.0, 5.0]).to_dtype::<TestDtype>();
         let r = a.leaky_trace().normalize(1e-5);
         assert_close_to_literal!(&r, [-1.0190487, -0.3396829, 1.3587316]);
         // NOTE: .exp() so we can make sure normalize is using result grad properly
@@ -71,7 +71,9 @@
     #[test]
     fn test_2d_normalize_axis_last() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, 0.0, 5.0], [1.0, 2.0, 3.0]]);
+        let a = dev
+            .tensor([[-2.0, 0.0, 5.0], [1.0, 2.0, 3.0]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().normalize::<Axis<1>>(1e-5);
         assert_close_to_literal!(
             r,
@@ -93,7 +95,9 @@
     #[test]
     fn test_2d_normalize_axis_first() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, 0.0], [1.0, 2.0], [4.0, 5.0]]);
+        let a = dev
+            .tensor([[-2.0, 0.0], [1.0, 2.0], [4.0, 5.0]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().normalize::<Axis<0>>(1e-5);
         assert_close_to_literal!(
             r,
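NOTE: with epsilon crossing the API as f64 and narrowed by E::from_f64, the same normalize call works unchanged for any dtype. A usage sketch under the f16 feature (assuming a Cpu device; this mirrors the tests above rather than quoting them):

    use dfdx::prelude::*;

    fn main() {
        let dev: Cpu = Default::default();
        let a = dev.tensor([-2.0, 0.0, 5.0]).to_dtype::<half::f16>();
        // The literal is converted into the tensor's dtype inside try_normalize.
        let r = a.normalize(1e-5);
        assert!(r.array().iter().all(|v| v.is_finite()));
    }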
diff --git a/src/tensor_ops/pool2d/cuda_kernel.rs b/src/tensor_ops/pool2d/cuda_kernel.rs
index b38d1f209..d750fdfb1 100644
--- a/src/tensor_ops/pool2d/cuda_kernel.rs
+++ b/src/tensor_ops/pool2d/cuda_kernel.rs
@@ -74,6 +74,25 @@ macro_rules! pool_impl {
     };
 }
 
+#[cfg(feature = "f16")]
+pool_impl!(
+    AvgPool2DKernel<half::f16>,
+    "avg_pool2d_fwd_f16",
+    "avg_pool2d_bwd_f16"
+);
+#[cfg(feature = "f16")]
+pool_impl!(
+    MaxPool2DKernel<half::f16>,
+    "max_pool2d_fwd_f16",
+    "max_pool2d_bwd_f16"
+);
+#[cfg(feature = "f16")]
+pool_impl!(
+    MinPool2DKernel<half::f16>,
+    "min_pool2d_fwd_f16",
+    "min_pool2d_bwd_f16"
+);
+
 pool_impl!(
     AvgPool2DKernel<f32>,
     "avg_pool2d_fwd_f32",
diff --git a/src/tensor_ops/pool2d/mod.rs b/src/tensor_ops/pool2d/mod.rs
index bf98f9d84..666f23e65 100644
--- a/src/tensor_ops/pool2d/mod.rs
+++ b/src/tensor_ops/pool2d/mod.rs
@@ -201,7 +201,9 @@ mod tests {
     #[test]
     fn test_pool2d_3d_max2d_eq_grads() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([[[1.0, 1., 0.5, 0.2], [0.2, 0.2, 0.5, 1.2]]]);
+        let x = dev
+            .tensor([[[1.0, 1., 0.5, 0.2], [0.2, 0.2, 0.5, 1.2]]])
+            .to_dtype::<TestDtype>();
         let r = x.leaky_trace().max_pool2d::<2, 1, 0>();
         assert_close_to_literal!(r, [[[1., 1., 1.2]]]);
         let g = r.sum().backward();
@@ -211,7 +213,9 @@
     #[test]
     fn test_pool2d_3d_min2d_eq_grads() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([[[1., 1., 0.5, 0.2], [0.2, 0.2, 0.5, 1.2]]]);
+        let x = dev
+            .tensor([[[1., 1., 0.5, 0.2], [0.2, 0.2, 0.5, 1.2]]])
+            .to_dtype::<TestDtype>();
         let r = x.leaky_trace().min_pool2d::<2, 1, 0>();
         assert_close_to_literal!(r, [[[0.2, 0.2, 0.2]]]);
         let g = r.sum().backward();
diff --git a/src/tensor_ops/pool2d/pool2d.cu b/src/tensor_ops/pool2d/pool2d.cu
index 8550c2b6f..a39d86d11 100644
--- a/src/tensor_ops/pool2d/pool2d.cu
+++ b/src/tensor_ops/pool2d/pool2d.cu
@@ -53,7 +53,9 @@ __device__ void avg_pool2d_fwd(
         }
     }
 
-    tmp /= static_cast<T>(op.kernel * op.kernel);
+    double num_f64 = op.kernel * op.kernel;
+    T num = num_f64;
+    tmp /= num;
     out[i] = tmp;
 }
 
@@ -105,7 +107,9 @@ __device__ void avg_pool2d_bwd(
         }
     }
 
-    grad_inp[i] += tmp / static_cast<T>(op.kernel * op.kernel);
+    double num_f64 = op.kernel * op.kernel;
+    T num = num_f64;
+    grad_inp[i] += tmp / num;
 }
 
 template<typename T>
@@ -330,6 +334,22 @@ extern "C" __global__ void bwd( \
     bwd_FN(op, inp_strides, out_strides, inp, grad_inp, out, grad_out); \
 }
 
+POOL_OP(
+    __half,
+    avg_pool2d_fwd_f16, avg_pool2d_bwd_f16,
+    avg_pool2d_fwd, avg_pool2d_bwd
+);
+POOL_OP(
+    __half,
+    min_pool2d_fwd_f16, min_pool2d_bwd_f16,
+    min_pool2d_fwd, min_pool2d_bwd
+);
+POOL_OP(
+    __half,
+    max_pool2d_fwd_f16, max_pool2d_bwd_f16,
+    max_pool2d_fwd, max_pool2d_bwd
+);
+
 POOL_OP(
     float,
     avg_pool2d_fwd_f32, avg_pool2d_bwd_f32,
diff --git a/src/tensor_ops/pow/cuda_kernel.rs b/src/tensor_ops/pow/cuda_kernel.rs
index 918d90f79..792a3bf04 100644
--- a/src/tensor_ops/pow/cuda_kernel.rs
+++ b/src/tensor_ops/pow/cuda_kernel.rs
@@ -6,11 +6,21 @@ use crate::{
 };
 use std::borrow::Cow;
 
+#[cfg(feature = "f16")]
+unsafe impl cudarc::driver::DeviceRepr for super::PowfKernelOp<half::f16> {}
 unsafe impl cudarc::driver::DeviceRepr for super::PowfKernelOp<f32> {}
 unsafe impl cudarc::driver::DeviceRepr for super::PowfKernelOp<f64> {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/pow.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(
+    PowfKernelOp<half::f16>,
+    half::f16,
+    PTX,
+    "pow_fwd_f16",
+    "pow_bwd_f16"
+);
 cuda_unary!(PowfKernelOp<f32>, f32, PTX, "pow_fwd_f32", "pow_bwd_f32");
 cuda_unary!(PowfKernelOp<f64>, f64, PTX, "pow_fwd_f64", "pow_bwd_f64");
diff --git a/src/tensor_ops/pow/mod.rs b/src/tensor_ops/pow/mod.rs
index 6cdc24580..ecef22110 100644
--- a/src/tensor_ops/pow/mod.rs
+++ b/src/tensor_ops/pow/mod.rs
@@ -23,19 +23,19 @@ pub struct PowfKernelOp<E>(E);
 /// ```
 pub fn powf<S: Shape, E: Dtype, D: UnaryKernel<PowfKernelOp<E>, E>, T: Tape<E, D>>(
     t: Tensor<S, E, D, T>,
-    exponent: impl Into<E>,
+    exponent: impl Into<f64>,
 ) -> Tensor<S, E, D, T> {
     t.powf(exponent)
 }
 
 impl<S: Shape, E: Dtype, D: UnaryKernel<PowfKernelOp<E>, E>, T: Tape<E, D>> Tensor<S, E, D, T> {
     /// See [powf]
-    pub fn powf(self, exponent: impl Into<E>) -> Self {
+    pub fn powf(self, exponent: impl Into<f64>) -> Self {
         self.try_powf(exponent).unwrap()
     }
     /// See [powf]
-    pub fn try_powf(self, exponent: impl Into<E>) -> Result<Self, D::Err> {
-        let exponent = exponent.into();
+    pub fn try_powf(self, exponent: impl Into<f64>) -> Result<Self, D::Err> {
+        let exponent = E::from_f64(exponent.into()).unwrap();
         try_unary_op(PowfKernelOp(exponent), self)
     }
 }
@@ -72,49 +72,55 @@ mod tests {
     #[test]
     fn test_powf_positive() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let t = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().powf(3.5);
         let r_array = r.array();
         assert!(r_array[0].is_nan());
         assert!(r_array[1].is_nan());
-        assert_close!(r_array[2], 0.0);
-        assert_close!(r_array[3], 1.0);
-        assert_close!(r_array[4], 11.313708);
+        assert_close!(r_array[2], NumCast::from(0.0).unwrap());
+        assert_close!(r_array[3], NumCast::from(1.0).unwrap());
+        assert_close!(r_array[4], NumCast::from(11.313708).unwrap());
 
         let g = r.sum().backward();
         let grad = g.get(&t).array();
         assert!(grad[0].is_nan());
         assert!(grad[1].is_nan());
-        assert_close!(grad[2], 0.0);
-        assert_close!(grad[3], 3.5);
-        assert_close!(grad[4], 19.79899);
+        assert_close!(grad[2], NumCast::from(0.0).unwrap());
+        assert_close!(grad[3], NumCast::from(3.5).unwrap());
+        assert_close!(grad[4], NumCast::from(19.79899).unwrap());
     }
 
     #[test]
     fn test_powf_negative() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let t = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().powf(-1.2);
         let r_array = r.array();
         assert!(r_array[0].is_nan());
         assert!(r_array[1].is_nan());
         assert_close!(r_array[2], TestDtype::INFINITY);
-        assert_close!(r_array[3], 1.0);
-        assert_close!(r_array[4], 0.43527526);
+        assert_close!(r_array[3], NumCast::from(1.0).unwrap());
+        assert_close!(r_array[4], NumCast::from(0.43527526).unwrap());
 
         let g = r.sum().backward();
         let grad = g.get(&t).array();
         assert!(grad[0].is_nan());
         assert!(grad[1].is_nan());
         assert_close!(grad[2], TestDtype::NEG_INFINITY);
-        assert_close!(grad[3], -1.2);
-        assert_close!(grad[4], -0.26116517);
+        assert_close!(grad[3], NumCast::from(-1.2).unwrap());
+        assert_close!(grad[4], NumCast::from(-0.26116517).unwrap());
     }
 
     #[test]
     fn test_powi_positive() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let t = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().powi(3);
         assert_close_to_literal!(r, [-8., -1., 0., 1., 8.]);
         let g = r.sum().backward();
@@ -124,7 +130,9 @@
     #[test]
     fn test_powi_negative() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let t = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().powi(-3);
         assert_close_to_literal!(r, [-0.125, -1.0, f64::INFINITY, 1.0, 0.125]);
         let g = r.sum().backward();
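NOTE: the powf tests above can no longer compare against plain literals, since under test-f16 the array elements are half::f16; NumCast::from rounds the literal into TestDtype first. The assertion pattern, reduced to a standalone sketch:

    use num_traits::NumCast;

    /// Compare a dtype-generic value to an f64 literal by first rounding the
    /// literal into E (what the tests' NumCast::from(...).unwrap() does).
    fn close_to<E: NumCast + Copy + Into<f64>>(got: E, want: f64, tol: f64) -> bool {
        let want_e: E = NumCast::from(want).unwrap();
        (Into::<f64>::into(got) - Into::<f64>::into(want_e)).abs() <= tol
    }

    fn main() {
        let x = half::f16::from_f32(11.3125); // nearest f16 to 11.313708
        assert!(close_to(x, 11.313708, 1e-2));
    }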
one); +} + +UNARY_OP(__half, pow_fwd_f16, pow_bwd_f16, PowFKernelOp<__half>, + powg(x, op.rhs), + pow_bwd(op, x)) + UNARY_OP(float, pow_fwd_f32, pow_bwd_f32, PowFKernelOp, - powf(x, op.rhs), - op.rhs * powf(x, op.rhs - 1.0)) + powg(x, op.rhs), + pow_bwd(op, x)) UNARY_OP(double, pow_fwd_f64, pow_bwd_f64, PowFKernelOp, - pow(x, op.rhs), - op.rhs * pow(x, op.rhs - 1.0)) - \ No newline at end of file + powg(x, op.rhs), + pow_bwd(op, x)) diff --git a/src/tensor_ops/prelu.rs b/src/tensor_ops/prelu.rs index 1b0e4f891..485b9766b 100644 --- a/src/tensor_ops/prelu.rs +++ b/src/tensor_ops/prelu.rs @@ -91,8 +91,12 @@ mod tests { #[test] fn test_prelu() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); - let y: Tensor<_, TestDtype, _> = dev.tensor([0.05, 0.05, 0.05, 0.05, 0.05]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); + let y = dev + .tensor([0.05, 0.05, 0.05, 0.05, 0.05]) + .to_dtype::(); let r = x.leaky_trace().prelu(y.clone()); assert_close_to_literal!(r, [-0.1, -0.05, 0.0, 1.0, 2.0]); // NOTE: call .exp() to make sure we cover cases where .prelu() uses the result's gradient diff --git a/src/tensor_ops/recip/cuda_kernel.rs b/src/tensor_ops/recip/cuda_kernel.rs index 66b539b2f..145fc0eae 100644 --- a/src/tensor_ops/recip/cuda_kernel.rs +++ b/src/tensor_ops/recip/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for RecipKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/recip.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(df(f(x)) RecipKernelOp, half::f16, PTX, "recip_fwd_f16", "recip_bwd_f16"); cuda_unary!(df(f(x)) RecipKernelOp, f32, PTX, "recip_fwd_f32", "recip_bwd_f32"); cuda_unary!(df(f(x)) RecipKernelOp, f64, PTX, "recip_fwd_f64", "recip_bwd_f64"); diff --git a/src/tensor_ops/recip/mod.rs b/src/tensor_ops/recip/mod.rs index fa738b3a5..78eb28792 100644 --- a/src/tensor_ops/recip/mod.rs +++ b/src/tensor_ops/recip/mod.rs @@ -43,7 +43,9 @@ mod tests { #[test] fn test_recip() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = x.leaky_trace().recip(); assert_close_to_literal!(r, [-0.5, -1.0, f64::INFINITY, 1.0, 0.5]); let g = r.mean().backward(); diff --git a/src/tensor_ops/recip/recip.cu b/src/tensor_ops/recip/recip.cu index c40fa7871..3f1b3bb99 100644 --- a/src/tensor_ops/recip/recip.cu +++ b/src/tensor_ops/recip/recip.cu @@ -2,14 +2,20 @@ struct RecipKernelOp {}; +UNARY_OP( + __half, recip_fwd_f16, recip_bwd_f16, RecipKernelOp, + recipg(x), + -y * y +) + UNARY_OP( float, recip_fwd_f32, recip_bwd_f32, RecipKernelOp, - 1 / x, + recipg(x), -y * y ) UNARY_OP( double, recip_fwd_f64, recip_bwd_f64, RecipKernelOp, - 1 / x, + recipg(x), -y * y ) diff --git a/src/tensor_ops/relu/cuda_kernel.rs b/src/tensor_ops/relu/cuda_kernel.rs index f4d00f633..13a6fc80c 100644 --- a/src/tensor_ops/relu/cuda_kernel.rs +++ b/src/tensor_ops/relu/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for ReLUKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/relu.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(ReLUKernelOp, half::f16, PTX, "relu_fwd_f16", "relu_bwd_f16"); cuda_unary!(ReLUKernelOp, f32, PTX, "relu_fwd_f32", "relu_bwd_f32"); cuda_unary!(ReLUKernelOp, f64, PTX, "relu_fwd_f64", "relu_bwd_f64"); diff --git a/src/tensor_ops/relu/mod.rs b/src/tensor_ops/relu/mod.rs index cc7952a39..c1ac134db 100644 --- 
diff --git a/src/tensor_ops/relu/cuda_kernel.rs b/src/tensor_ops/relu/cuda_kernel.rs
index f4d00f633..13a6fc80c 100644
--- a/src/tensor_ops/relu/cuda_kernel.rs
+++ b/src/tensor_ops/relu/cuda_kernel.rs
@@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for ReLUKernelOp {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/relu.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(ReLUKernelOp, half::f16, PTX, "relu_fwd_f16", "relu_bwd_f16");
 cuda_unary!(ReLUKernelOp, f32, PTX, "relu_fwd_f32", "relu_bwd_f32");
 cuda_unary!(ReLUKernelOp, f64, PTX, "relu_fwd_f64", "relu_bwd_f64");
diff --git a/src/tensor_ops/relu/mod.rs b/src/tensor_ops/relu/mod.rs
index cc7952a39..c1ac134db 100644
--- a/src/tensor_ops/relu/mod.rs
+++ b/src/tensor_ops/relu/mod.rs
@@ -46,7 +46,9 @@ mod tests {
     #[test]
     fn test_relu() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let x = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = x.leaky_trace().relu();
         assert_close_to_literal!(r, [0.0, 0.0, 0.0, 1.0, 2.0]);
         // NOTE: call .exp() to make sure we cover cases where .relu() uses the result's gradient
diff --git a/src/tensor_ops/relu/relu.cu b/src/tensor_ops/relu/relu.cu
index 079eeda3d..33dcbd20e 100644
--- a/src/tensor_ops/relu/relu.cu
+++ b/src/tensor_ops/relu/relu.cu
@@ -2,11 +2,27 @@
 struct ReLUKernelOp {};
 
+template<typename T>
+__device__ __forceinline__ T relu_fwd(T x) {
+    T zero = 0.0;
+    return maxg(x, zero);
+}
+
+template<typename T>
+__device__ __forceinline__ T relu_bwd(T x) {
+    T zero = 0.0;
+    T one = 1.0;
+    return x > zero ? one : zero;
+}
+
+UNARY_OP(__half, relu_fwd_f16, relu_bwd_f16, ReLUKernelOp,
+        relu_fwd(x),
+        relu_bwd(x))
+
 UNARY_OP(float, relu_fwd_f32, relu_bwd_f32, ReLUKernelOp,
-        fmaxf(x, 0.0),
-        x > 0.0 ? 1.0 : 0.0)
+        relu_fwd(x),
+        relu_bwd(x))
 
 UNARY_OP(double, relu_fwd_f64, relu_bwd_f64, ReLUKernelOp,
-        fmax(x, 0.0),
-        x > 0.0 ? 1.0 : 0.0)
-    
\ No newline at end of file
+        relu_fwd(x),
+        relu_bwd(x))
diff --git a/src/tensor_ops/reshape_to/cuda_kernel.rs b/src/tensor_ops/reshape_to/cuda_kernel.rs
index 53041bf2a..082234f39 100644
--- a/src/tensor_ops/reshape_to/cuda_kernel.rs
+++ b/src/tensor_ops/reshape_to/cuda_kernel.rs
@@ -21,6 +21,10 @@ impl super::ReshapeKernel for Cuda {
             let src = FWD_KERNEL.replace("$T", E::NAME);
             let opts = CompileOptions {
                 arch: Some(env!("CUDA_COMPUTE_CAP")),
+                include_paths: vec![
+                    env!("CUDA_INCLUDE_DIR").to_string(),
+                    env!("OUT_DIR").to_string(),
+                ],
                 ..Default::default()
             };
             let ptx = compile_ptx_with_opts(src, opts).unwrap();
@@ -64,6 +68,10 @@
             let src = BWD_KERNEL.replace("$T", E::NAME);
             let opts = CompileOptions {
                 arch: Some(env!("CUDA_COMPUTE_CAP")),
+                include_paths: vec![
+                    env!("CUDA_INCLUDE_DIR").to_string(),
+                    env!("OUT_DIR").to_string(),
+                ],
                 ..Default::default()
             };
             let ptx = compile_ptx_with_opts(src, opts).unwrap();
@@ -101,20 +109,7 @@ typedef long int intptr_t;
 typedef int intptr_t;
 #endif
 
-__device__ unsigned int get_strided_index(
-    unsigned int idx,
-    const size_t num_dims,
-    const size_t *dims,
-    const size_t *strides
-) {
-    unsigned int strided_i = 0;
-    for (unsigned int d = 0; d < num_dims; d++) {
-        unsigned int dim_idx = num_dims - 1 - d;
-        strided_i += (idx % dims[dim_idx]) * strides[dim_idx];
-        idx /= dims[dim_idx];
-    }
-    return strided_i;
-}
+#include \"cuda_utils.cuh\"
 
 extern \"C\" __global__ void reshape_fwd(
     const size_t numel,
@@ -148,20 +143,7 @@ typedef long int intptr_t;
 typedef int intptr_t;
 #endif
 
-__device__ unsigned int get_strided_index(
-    unsigned int idx,
-    const size_t num_dims,
-    const size_t *dims,
-    const size_t *strides
-) {
-    unsigned int strided_i = 0;
-    for (unsigned int d = 0; d < num_dims; d++) {
-        unsigned int dim_idx = num_dims - 1 - d;
-        strided_i += (idx % dims[dim_idx]) * strides[dim_idx];
-        idx /= dims[dim_idx];
-    }
-    return strided_i;
-}
+#include \"cuda_utils.cuh\"
 
 extern \"C\" __global__ void reshape_bwd(
     const size_t numel,
TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]); + let a = dev + .tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) + .to_dtype::(); let b = a.leaky_trace().reshape::>(); assert_close_to_literal!(b, [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]); let g = b.exp().mean().backward(); @@ -192,7 +194,9 @@ mod tests { #[test] fn test_1d_reshape_non_contiguous() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]); + let a = dev + .tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) + .to_dtype::(); let b = a .leaky_trace() .permute::, _>() @@ -211,7 +215,8 @@ mod tests { #[test] fn test_reshape_broadcasted() { let dev: TestDevice = Default::default(); - let a: Tensor, TestDtype, _> = dev.tensor([1., 2., 3.]).broadcast(); + let a: Tensor, TestDtype, _> = + dev.tensor([1., 2., 3.]).to_dtype::().broadcast(); let b: Tensor, TestDtype, _> = a.clone().reshape(); #[cfg(feature = "cuda")] @@ -219,19 +224,21 @@ mod tests { assert_eq!(b.data.len(), 6); assert_eq!(a.as_vec(), b.as_vec()); - assert_eq!(b.array(), [[1., 2.], [3., 1.], [2., 3.]]); + assert_close_to_literal!(b, [[1., 2.], [3., 1.], [2., 3.]]); } #[test] fn test_contiguous() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]); + let a = dev + .tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) + .to_dtype::(); let b1 = a.clone().contiguous(); assert_eq!(a.strides, b1.strides); - let b2: Tensor<_, TestDtype, _> = a.permute::, _>().contiguous(); + let b2 = a.permute::, _>().contiguous(); assert_eq!(b2.strides, [2, 1]); } } diff --git a/src/tensor_ops/roll/cuda_kernel.rs b/src/tensor_ops/roll/cuda_kernel.rs index 92667463a..2592fa372 100644 --- a/src/tensor_ops/roll/cuda_kernel.rs +++ b/src/tensor_ops/roll/cuda_kernel.rs @@ -12,6 +12,10 @@ const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/roll.ptx")); trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const FNS: &'static [&'static str] = &["roll_fwd_f16", "roll_bwd_f16"]; +} impl HasCudaKernel for Cuda { const FNS: &'static [&'static str] = &["roll_fwd_f32", "roll_bwd_f32"]; } diff --git a/src/tensor_ops/roll/mod.rs b/src/tensor_ops/roll/mod.rs index 59ff4d5a1..67dd3bff5 100644 --- a/src/tensor_ops/roll/mod.rs +++ b/src/tensor_ops/roll/mod.rs @@ -93,7 +93,9 @@ mod tests { #[test] fn test_roll_3d_axis_2() { let dev: TestDevice = Default::default(); - let t: Tensor, TestDtype, _> = dev.tensor([-0.3, -0.15, 0.0, 0.15, 0.2]); + let t = dev + .tensor([-0.3, -0.15, 0.0, 0.15, 0.2]) + .to_dtype::(); let y = t .leaky_trace() .broadcast::, _>() @@ -109,7 +111,9 @@ mod tests { #[test] fn test_roll_3d_first_two_axes() { let dev: TestDevice = Default::default(); - let t: Tensor, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0, 4.0, 5.0]); + let t = dev + .tensor([1.0, 2.0, 3.0, 4.0, 5.0]) + .to_dtype::(); let y0 = t .leaky_trace() .broadcast::, _>() diff --git a/src/tensor_ops/roll/roll.cu b/src/tensor_ops/roll/roll.cu index af810ed75..375e73b32 100644 --- a/src/tensor_ops/roll/roll.cu +++ b/src/tensor_ops/roll/roll.cu @@ -105,5 +105,6 @@ extern "C" __global__ void BWD( \ const TY *grad_out \ ) { roll_bwd(op, num_dims, numel, dims, inp_strides, out_strides, grad_inp, grad_out); } +ROLL(__half, roll_fwd_f16, roll_bwd_f16); ROLL(float, roll_fwd_f32, roll_bwd_f32); ROLL(double, roll_fwd_f64, roll_bwd_f64); diff --git a/src/tensor_ops/select_and_gather/cuda_kernel.rs 
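The roll, select, and gather additions all follow the same registration scheme: a C macro stamps out extern "C" kernels once per dtype with _f16/_f32/_f64 name suffixes, and the Rust HasCudaKernel impls refer to those symbols by name. A simplified, hypothetical miniature of the pattern (modeled on the fill_with_* kernels, not the actual ROLL macro):

#include <cstddef>
#include "cuda_fp16.h"

// Hypothetical miniature of the per-dtype macro pattern used by roll.cu,
// select.cu, gather.cu, etc.: one instantiation per dtype, suffixed symbols.
#define FILL_OP(TYPENAME, FWD) \
extern "C" __global__ void FWD(TYPENAME *buf, TYPENAME value, const size_t numel) { \
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
    if (i < numel) { buf[i] = value; } \
}

FILL_OP(__half, demo_fill_f16);
FILL_OP(float, demo_fill_f32);
FILL_OP(double, demo_fill_f64);

Adding a dtype then means one extra macro line on the CUDA side and one extra HasCudaKernel impl naming the new symbols on the Rust side, which is exactly the shape of the f16 hunks in this patch.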
b/src/tensor_ops/select_and_gather/cuda_kernel.rs index 5f1997644..0243a5ef9 100644 --- a/src/tensor_ops/select_and_gather/cuda_kernel.rs +++ b/src/tensor_ops/select_and_gather/cuda_kernel.rs @@ -187,6 +187,17 @@ macro_rules! impl_cuda_kernels { }; } +#[cfg(feature = "f16")] +impl_cuda_kernels!( + half::f16, + "gather_f16", + "gather_fwd_f16", + "gather_bwd_f16", + "select_f16", + "select_fwd_f16", + "select_bwd_f16" +); + impl_cuda_kernels!( f32, "gather_f32", diff --git a/src/tensor_ops/select_and_gather/gather.cu b/src/tensor_ops/select_and_gather/gather.cu index 1e9b369d3..747c033a4 100644 --- a/src/tensor_ops/select_and_gather/gather.cu +++ b/src/tensor_ops/select_and_gather/gather.cu @@ -127,5 +127,6 @@ extern "C" __global__ void BWD( \ gather_bwd(numel, grad_inp, inp_num_dims, inp_dims, inp_strides, idx, idx_num_dims, idx_dims, idx_strides, grad_out, out_num_dims); \ } +GATHER(__half, gather_fwd_f16, gather_bwd_f16); GATHER(float, gather_fwd_f32, gather_bwd_f32); GATHER(double, gather_fwd_f64, gather_bwd_f64); diff --git a/src/tensor_ops/select_and_gather/mod.rs b/src/tensor_ops/select_and_gather/mod.rs index 104a47ccb..fe7d0b41b 100644 --- a/src/tensor_ops/select_and_gather/mod.rs +++ b/src/tensor_ops/select_and_gather/mod.rs @@ -194,8 +194,7 @@ impl, T: Tape> GatherTo #[cfg(test)] mod tests { use super::*; - use crate::tensor_ops::*; - use crate::tests::*; + use crate::{tensor_ops::*, tests::*}; #[test] #[should_panic = "dimension 0 not the same"] @@ -269,7 +268,9 @@ mod tests { let t_array = t.array(); assert_eq!(r.array(), t_array[0]); let g = r.exp().backward(); - assert_eq!(g.get(&t).array(), [t_array[0].exp(), 0.0, 0.0, 0.0, 0.0]); + let mut expected = [TestDtype::zero(); 5]; + expected[0] = t_array[0].exp(); + assert_eq!(g.get(&t).array(), expected); } #[test] @@ -284,10 +285,10 @@ mod tests { g.get(&t).array(), [ t_array[0].exp(), - 2.0 * (t_array[1]).exp(), - 0.0, + t_array[1].exp() + t_array[1].exp(), + TestDtype::zero(), t_array[3].exp(), - 0.0 + TestDtype::zero() ] ); } @@ -323,7 +324,9 @@ mod tests { #[test] fn test_select_2d_axis_0() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-1.0, -2.0, -3.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0], [-1.0, -2.0, -3.0]]) + .to_dtype::(); let r = t.leaky_trace().select(dev.tensor(0)); assert_close_to_literal!(r, [1.0, 2.0, 3.0]); let g = r.mean().backward(); @@ -333,7 +336,9 @@ mod tests { #[test] fn test_select_2d_axis_1() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-1.0, -2.0, -3.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0], [-1.0, -2.0, -3.0]]) + .to_dtype::(); let r = t.leaky_trace().select(dev.tensor([1, 1])); assert_close_to_literal!(r, [2.0, -2.0]); let g = r.mean().backward(); @@ -343,7 +348,7 @@ mod tests { #[test] fn test_select_2d_broadcasted() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); + let t = dev.tensor([1.0, 2.0, 3.0]).to_dtype::(); let r = t .leaky_trace() .broadcast::, _>() @@ -356,7 +361,7 @@ mod tests { #[test] fn test_gather_2d_broadcasted() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); + let t = dev.tensor([1.0, 2.0, 3.0]).to_dtype::(); let idx: Tensor, usize, _> = dev.tensor([[0, 1], [1, 2]]); let r: Tensor, _, _, _> = t.leaky_trace().broadcast::, _>().gather(idx); @@ -391,12 +396,10 @@ mod tests { let g = r.exp().mean().backward(); let sub_g = 
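Worth calling out for the gather_f16 registration above: gather's backward is a scatter-add, where several output gradients may target the same input element, which is why this PR also ships an atomicAdd(__half*, __half) fallback in compatibility.cuh further down. A stripped-down sketch of the pattern, not the actual gather_bwd:

#include <cstddef>
#include "cuda_fp16.h"

// Many threads may share the same idx[i], so the accumulate must be atomic;
// for __half on pre-sm_70 devices this resolves to the CAS shim below.
template<typename T>
__global__ void scatter_add(T *grad_inp, const size_t *idx, const T *grad_out, size_t numel) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < numel) {
        atomicAdd(grad_inp + idx[i], grad_out[i]);
    }
}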
dev.tensor(sub_t).exp() / 8.0; let sub_g = sub_g.array(); + let z = TestDtype::zero(); assert_close!( g.get(&t).array(), - [ - [[0.0; 4], sub_g[0], [0.0; 4]], - [[0.0; 4], [0.0; 4], sub_g[1]], - ] + [[[z; 4], sub_g[0], [z; 4]], [[z; 4], [z; 4], sub_g[1]],] ); } @@ -414,18 +417,19 @@ let g = r.exp().mean().backward(); let sub_g = dev.tensor(sub_t).exp() / 6.0; let sub_g = sub_g.array(); + let z = TestDtype::zero(); assert_close!( g.get(&t).array(), [ [ - [0.0, 0.0, sub_g[0][0], 0.0], - [0.0, 0.0, 0.0, sub_g[0][1]], - [0.0, 0.0, sub_g[0][2], 0.0], + [z, z, sub_g[0][0], z], + [z, z, z, sub_g[0][1]], + [z, z, sub_g[0][2], z], ], [ - [0.0, sub_g[1][0], 0.0, 0.0], - [0.0, sub_g[1][1], 0.0, 0.0], - [sub_g[1][2], 0.0, 0.0, 0.0], + [z, sub_g[1][0], z, z], + [z, sub_g[1][1], z, z], + [sub_g[1][2], z, z, z], ], ] ); diff --git a/src/tensor_ops/select_and_gather/select.cu b/src/tensor_ops/select_and_gather/select.cu index 3f790e88a..21242c0d6 100644 --- a/src/tensor_ops/select_and_gather/select.cu +++ b/src/tensor_ops/select_and_gather/select.cu @@ -117,5 +117,6 @@ extern "C" __global__ void BWD( \ select_bwd(numel, grad_inp, inp_num_dims, inp_dims, inp_strides, idx, idx_num_dims, idx_dims, idx_strides, grad_out, out_dims, out_strides); \ } +SELECT(__half, select_fwd_f16, select_bwd_f16); SELECT(float, select_fwd_f32, select_bwd_f32); SELECT(double, select_fwd_f64, select_bwd_f64) diff --git a/src/tensor_ops/sigmoid/cuda_kernel.rs b/src/tensor_ops/sigmoid/cuda_kernel.rs index b379f47df..6d3b55110 100644 --- a/src/tensor_ops/sigmoid/cuda_kernel.rs +++ b/src/tensor_ops/sigmoid/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for Sigmoid {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/sigmoid.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(df(f(x)) Sigmoid, half::f16, PTX, "sigmoid_fwd_f16", "sigmoid_bwd_f16"); cuda_unary!(df(f(x)) Sigmoid, f32, PTX, "sigmoid_fwd_f32", "sigmoid_bwd_f32"); cuda_unary!(df(f(x)) Sigmoid, f64, PTX, "sigmoid_fwd_f64", "sigmoid_bwd_f64"); diff --git a/src/tensor_ops/sigmoid/mod.rs b/src/tensor_ops/sigmoid/mod.rs index d789c8afe..6e3a0661c 100644 --- a/src/tensor_ops/sigmoid/mod.rs +++ b/src/tensor_ops/sigmoid/mod.rs @@ -45,7 +45,9 @@ mod tests { #[test] fn test_sigmoid() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::<TestDtype>(); let r = x.leaky_trace().sigmoid(); assert_close_to_literal!(r, [0.11920292, 0.26894143, 0.5, 0.7310586, 0.880797]); let g = r.mean().backward(); diff --git a/src/tensor_ops/sigmoid/sigmoid.cu b/src/tensor_ops/sigmoid/sigmoid.cu index d867b7773..0f9a8882b 100644 --- a/src/tensor_ops/sigmoid/sigmoid.cu +++ b/src/tensor_ops/sigmoid/sigmoid.cu @@ -1,15 +1,28 @@ #include "unary_op_macros.cuh" -#define SIGMOID_f32(X) (1.0 / (1.0 + expf(-X))) -#define SIGMOID_f64(X) (1.0 / (1.0 + exp(-X))) - struct SigmoidKernelOp {}; +template<typename T> +__device__ __forceinline__ T sigmoid_fwd(T x) { + T one = 1.0; + return one / (one + expg(-x)); +} + +template<typename T> +__device__ __forceinline__ T sigmoid_bwd(T y) { + T one = 1.0; + return y * (one - y); +} + +UNARY_OP(__half, sigmoid_fwd_f16, sigmoid_bwd_f16, SigmoidKernelOp, + sigmoid_fwd(x), + sigmoid_bwd(y)) + UNARY_OP(float, sigmoid_fwd_f32, sigmoid_bwd_f32, SigmoidKernelOp, - SIGMOID_f32(x), - y * (1.0 - y)) + sigmoid_fwd(x), + sigmoid_bwd(y)) UNARY_OP(double, sigmoid_fwd_f64, sigmoid_bwd_f64, SigmoidKernelOp, - SIGMOID_f64(x), - y * (1.0 - y)) + sigmoid_fwd(x), 
+ sigmoid_bwd(y)) \ No newline at end of file diff --git a/src/tensor_ops/sin/cuda_kernel.rs b/src/tensor_ops/sin/cuda_kernel.rs index 97af74db2..9fd33010c 100644 --- a/src/tensor_ops/sin/cuda_kernel.rs +++ b/src/tensor_ops/sin/cuda_kernel.rs @@ -4,5 +4,13 @@ unsafe impl cudarc::driver::DeviceRepr for super::SinKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/sin.ptx")); +#[cfg(feature = "f16")] +cuda_unary!( + super::SinKernelOp, + half::f16, + PTX, + "sin_fwd_f16", + "sin_bwd_f16" +); cuda_unary!(super::SinKernelOp, f32, PTX, "sin_fwd_f32", "sin_bwd_f32"); cuda_unary!(super::SinKernelOp, f64, PTX, "sin_fwd_f64", "sin_bwd_f64"); diff --git a/src/tensor_ops/sin/mod.rs b/src/tensor_ops/sin/mod.rs index 2fa7d9334..5a3fe2f05 100644 --- a/src/tensor_ops/sin/mod.rs +++ b/src/tensor_ops/sin/mod.rs @@ -46,7 +46,9 @@ mod tests { #[test] fn test_sin() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = x.leaky_trace().sin(); assert_close_to_literal!(r, [-0.9092974, -0.84147096, 0.0, 0.84147096, 0.9092974]); let g = r.mean().backward(); diff --git a/src/tensor_ops/sin/sin.cu b/src/tensor_ops/sin/sin.cu index 1110c6884..168fc85b8 100644 --- a/src/tensor_ops/sin/sin.cu +++ b/src/tensor_ops/sin/sin.cu @@ -2,11 +2,15 @@ struct SinKernelOp {}; +UNARY_OP(__half, sin_fwd_f16, sin_bwd_f16, SinKernelOp, + sing(x), + cosg(x)) + UNARY_OP(float, sin_fwd_f32, sin_bwd_f32, SinKernelOp, - sinf(x), - cosf(x)) + sing(x), + cosg(x)) UNARY_OP(double, sin_fwd_f64, sin_bwd_f64, SinKernelOp, - sin(x), - cos(x)) + sing(x), + cosg(x)) \ No newline at end of file diff --git a/src/tensor_ops/slice/cuda_kernel.rs b/src/tensor_ops/slice/cuda_kernel.rs index 88db9284f..58f9a681e 100644 --- a/src/tensor_ops/slice/cuda_kernel.rs +++ b/src/tensor_ops/slice/cuda_kernel.rs @@ -25,6 +25,12 @@ macro_rules! 
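The sigmoid rewrite above leans on the identity sigmoid'(x) = y * (1 - y) with y = sigmoid(x), which is why the backward macro argument is sigmoid_bwd(y): the kernel reuses the forward output instead of recomputing the exponential. A quick host-side check of the identity against a finite difference:

#include <cmath>
#include <cstdio>

int main() {
    double x = 0.7, h = 1e-6;
    double y = 1.0 / (1.0 + std::exp(-x));
    double yh = 1.0 / (1.0 + std::exp(-(x + h)));
    // Analytic y * (1 - y) should agree with the finite difference to ~1e-6.
    std::printf("analytic %.8f vs numeric %.8f\n", y * (1.0 - y), (yh - y) / h);
}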
has_kernels { has_kernels!(u8, u16, u32, u64, usize, i8, i16, i32, i64, isize, bool); +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "slice_f16"; + const FNS: &'static [&'static str] = &["slice_fwd_f16", "slice_bwd_f16"]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "slice_f32"; const FNS: &'static [&'static str] = &["slice_fwd_f32", "slice_bwd_f32"]; diff --git a/src/tensor_ops/slice/mod.rs b/src/tensor_ops/slice/mod.rs index 09adf7190..b5b172a2c 100644 --- a/src/tensor_ops/slice/mod.rs +++ b/src/tensor_ops/slice/mod.rs @@ -89,12 +89,14 @@ mod tests { #[test] fn test_slice() { let dev = TestDevice::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([ - [1., 2., 3., 4.], - [5., 6., 7., 8.], - [9., 10., 11., 12.], - [13., 14., 15., 16.], - ]); + let a = dev + .tensor([ + [1., 2., 3., 4.], + [5., 6., 7., 8.], + [9., 10., 11., 12.], + [13., 14., 15., 16.], + ]) + .to_dtype::(); let b: Tensor, _, _> = a.clone().slice((2.., 2..)).realize().unwrap(); assert_close_to_literal!(b, [[11., 12.], [15., 16.]]); @@ -124,7 +126,10 @@ mod tests { #[test] fn test_slice_broadcast_top() { let dev = TestDevice::default(); - let a: Tensor, TestDtype, _> = dev.tensor([1., 2., 3., 4.]).broadcast(); + let a = dev + .tensor([1., 2., 3., 4.]) + .to_dtype::() + .broadcast::, _>(); let b: Tensor, _, _> = a.clone().slice((..3, ..)).realize().unwrap(); assert_close_to_literal!(b, [[1., 2., 3., 4.]; 3]); @@ -142,7 +147,10 @@ mod tests { #[test] fn test_slice_broadcast_bottom() { let dev = TestDevice::default(); - let a: Tensor, TestDtype, _> = dev.tensor([1., 2., 3., 4.]).broadcast(); + let a: Tensor, TestDtype, _> = dev + .tensor([1., 2., 3., 4.]) + .to_dtype::() + .broadcast(); let b: Tensor, _, _> = a.clone().slice((1..3, ..)).realize().unwrap(); assert_close_to_literal!(b, [[2.; 5], [3.; 5]]); @@ -160,12 +168,14 @@ mod tests { #[test] fn test_slice_backward() { let dev = TestDevice::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([ - [1., 2., 3., 4.], - [5., 6., 7., 8.], - [9., 10., 11., 12.], - [13., 14., 15., 16.], - ]); + let a = dev + .tensor([ + [1., 2., 3., 4.], + [5., 6., 7., 8.], + [9., 10., 11., 12.], + [13., 14., 15., 16.], + ]) + .to_dtype::(); let b: Tensor, _, _, _> = a.leaky_trace().slice((2.., 2..)).realize().unwrap(); assert_close_to_literal!(b, [[11., 12.], [15., 16.]]); diff --git a/src/tensor_ops/slice/slice.cu b/src/tensor_ops/slice/slice.cu index 5a520cc25..cb4f8325a 100644 --- a/src/tensor_ops/slice/slice.cu +++ b/src/tensor_ops/slice/slice.cu @@ -68,6 +68,7 @@ extern "C" __global__ void BWD( \ slice_bwd(numel, num_dims, dims, strides, offset, grad_inp, grad_out); \ } +SLICE(__half, slice_fwd_f16, slice_bwd_f16); SLICE(float, slice_fwd_f32, slice_bwd_f32); SLICE(double, slice_fwd_f64, slice_bwd_f64); SLICE_FWD(uint8_t, slice_fwd_u8); diff --git a/src/tensor_ops/softmax.rs b/src/tensor_ops/softmax.rs index 125f5a0d4..18cf28721 100644 --- a/src/tensor_ops/softmax.rs +++ b/src/tensor_ops/softmax.rs @@ -113,13 +113,17 @@ mod tests { #[test] fn test_softmax_1d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let a = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = a.leaky_trace().softmax(); assert_close_to_literal!( r, [0.011656232, 0.031684924, 0.086128555, 0.23412168, 0.6364087] ); - let l = r * dev.tensor([0.0, 0.0, 1.0, 0.0, 0.0]); + let l = r * dev + .tensor([0.0, 0.0, 1.0, 0.0, 0.0]) + .to_dtype::(); assert_close_to_literal!(l, [0.0, 
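One reason the softmax tests below matter for the f16 path: __half saturates just above 65504, so exponentials should be taken on max-shifted inputs to stay finite. Purely as an illustration of that trick (a hypothetical single-thread kernel, not dfdx's implementation):

#include <cstddef>

// Hypothetical single-thread row kernel: subtract the row max before exp so
// the exponentials stay representable; a real kernel would parallelize this.
extern "C" __global__ void softmax_row(const float *inp, float *out, size_t n) {
    float m = inp[0];
    for (size_t i = 1; i < n; i++) m = fmaxf(m, inp[i]);
    float denom = 0.0f;
    for (size_t i = 0; i < n; i++) denom += expf(inp[i] - m);
    for (size_t i = 0; i < n; i++) out[i] = expf(inp[i] - m) / denom;
}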
0.0, 0.086128555, 0.0, 0.0]); let g = l.mean().backward(); assert_close_to_literal!( @@ -137,7 +141,9 @@ mod tests { #[test] fn test_softmax_2d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]); + let a = dev + .tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]) + .to_dtype::(); let r = a.leaky_trace().softmax::>(); assert_close_to_literal!( r, @@ -146,7 +152,9 @@ mod tests { [0.002355633, 0.047314156, 0.9503302], ] ); - let l = r * dev.tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]); + let l = r * dev + .tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + .to_dtype::(); assert_close_to_literal!(l, [[0.09003058, 0.0, 0.0], [0.0, 0.047314156, 0.0]]); let g = l.mean().backward(); assert_close_to_literal!( @@ -161,7 +169,9 @@ mod tests { #[test] fn test_softmax_2d_0th_axis() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]); + let a = dev + .tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]) + .to_dtype::(); let r = a.leaky_trace().softmax::>(); assert_close_to_literal!( r, @@ -170,7 +180,9 @@ mod tests { [0.95257413, 0.9933072, 0.9990892], ] ); - let l = r * dev.tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]); + let l = r * dev + .tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + .to_dtype::(); assert_close_to_literal!(l, [[0.047425874, 0.0, 0.0], [0.0, 0.9933072, 0.0]]); let g = l.mean().backward(); assert_close_to_literal!( diff --git a/src/tensor_ops/sqrt/cuda_kernel.rs b/src/tensor_ops/sqrt/cuda_kernel.rs index 9990a67b4..6bd0ea39c 100644 --- a/src/tensor_ops/sqrt/cuda_kernel.rs +++ b/src/tensor_ops/sqrt/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for SqrtKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/sqrt.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(df(f(x)) SqrtKernelOp, half::f16, PTX, "sqrt_fwd_f16", "sqrt_bwd_f16"); cuda_unary!(df(f(x)) SqrtKernelOp, f32, PTX, "sqrt_fwd_f32", "sqrt_bwd_f32"); cuda_unary!(df(f(x)) SqrtKernelOp, f64, PTX, "sqrt_fwd_f64", "sqrt_bwd_f64"); diff --git a/src/tensor_ops/sqrt/mod.rs b/src/tensor_ops/sqrt/mod.rs index 254b89885..bbe03cb9a 100644 --- a/src/tensor_ops/sqrt/mod.rs +++ b/src/tensor_ops/sqrt/mod.rs @@ -45,13 +45,24 @@ mod tests { #[test] fn test_sqrt() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-1.0, 0.0, 1.0, 4.0]); + let x = dev.tensor([-1.0, 0.0, 1.0, 4.0]).to_dtype::(); let r = x.leaky_trace().sqrt(); - assert!(r.array()[0].is_nan()); - assert_eq!(r.array()[1..], [0.0, 1.0, 2.0]); + let r_array = r.array(); + assert!(r_array[0].is_nan()); + assert_eq!( + &r_array[1..], + [0.0, 1.0, 2.0] + .map(NumCast::from) + .map(Option::::unwrap) + ); let g = r.mean().backward(); let g = g.get(&x).array(); assert!(g[0].is_nan()); - assert_eq!(g[1..], [TestDtype::INFINITY, 0.5 / 4.0, 0.25 / 4.0]); + assert_eq!( + &g[1..], + [f64::INFINITY, 0.5 / 4.0, 0.25 / 4.0] + .map(NumCast::from) + .map(Option::::unwrap) + ); } } diff --git a/src/tensor_ops/sqrt/sqrt.cu b/src/tensor_ops/sqrt/sqrt.cu index f3db1c0fe..21e87ac5f 100644 --- a/src/tensor_ops/sqrt/sqrt.cu +++ b/src/tensor_ops/sqrt/sqrt.cu @@ -2,11 +2,15 @@ struct SqrtKernelOp {}; +UNARY_OP(__half, sqrt_fwd_f16, sqrt_bwd_f16, SqrtKernelOp, + sqrtg(x), + recipg(y + y)) + UNARY_OP(float, sqrt_fwd_f32, sqrt_bwd_f32, SqrtKernelOp, - sqrtf(x), - 1 / (y + y)) + sqrtg(x), + recipg(y + y)) UNARY_OP(double, sqrt_fwd_f64, sqrt_bwd_f64, SqrtKernelOp, - sqrt(x), - 1 / (y + y)) + sqrtg(x), + recipg(y + 
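The sqrt backward above follows the same output-reuse pattern: d/dx sqrt(x) = 1/(2*sqrt(x)) = 1/(y + y), spelled recipg(y + y) because __half has no implicit division by an integer literal. The identity in isolation, with recipg again re-declared locally for illustration:

#include "cuda_fp16.h"

__device__ __forceinline__ float recipg(float a) { return 1.0 / a; }
__device__ __forceinline__ __half recipg(__half a) { __half one = 1.0; return one / a; }

// d/dx sqrt(x) = 1 / (2 * sqrt(x)) = 1 / (y + y), with y the forward output.
template<typename T>
__device__ __forceinline__ T sqrt_bwd(T y) {
    return recipg(y + y);
}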
y)) \ No newline at end of file diff --git a/src/tensor_ops/square/cuda_kernel.rs b/src/tensor_ops/square/cuda_kernel.rs index b85cef0cc..4f8a887a3 100644 --- a/src/tensor_ops/square/cuda_kernel.rs +++ b/src/tensor_ops/square/cuda_kernel.rs @@ -5,5 +5,13 @@ unsafe impl cudarc::driver::DeviceRepr for SquareKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/square.ptx")); +#[cfg(feature = "f16")] +cuda_unary!( + SquareKernelOp, + half::f16, + PTX, + "square_fwd_f16", + "square_bwd_f16" +); cuda_unary!(SquareKernelOp, f32, PTX, "square_fwd_f32", "square_bwd_f32"); cuda_unary!(SquareKernelOp, f64, PTX, "square_fwd_f64", "square_bwd_f64"); diff --git a/src/tensor_ops/square/mod.rs b/src/tensor_ops/square/mod.rs index 36838290d..d7361b12b 100644 --- a/src/tensor_ops/square/mod.rs +++ b/src/tensor_ops/square/mod.rs @@ -45,7 +45,9 @@ mod tests { #[test] fn test_square() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = x.leaky_trace().square(); assert_close_to_literal!(r, [4.0, 1.0, 0.0, 1.0, 4.0]); let g = r.mean().backward(); diff --git a/src/tensor_ops/square/square.cu b/src/tensor_ops/square/square.cu index a272451da..85318f107 100644 --- a/src/tensor_ops/square/square.cu +++ b/src/tensor_ops/square/square.cu @@ -2,11 +2,15 @@ struct SquareKernelOp {}; +UNARY_OP(__half, square_fwd_f16, square_bwd_f16, SquareKernelOp, + x * x, + x + x) + UNARY_OP(float, square_fwd_f32, square_bwd_f32, SquareKernelOp, x * x, - 2.0 * x) + x + x) UNARY_OP(double, square_fwd_f64, square_bwd_f64, SquareKernelOp, x * x, - 2.0 * x) + x + x) \ No newline at end of file diff --git a/src/tensor_ops/stack/cuda_kernel.rs b/src/tensor_ops/stack/cuda_kernel.rs index b3ecf10a9..a3508f502 100644 --- a/src/tensor_ops/stack/cuda_kernel.rs +++ b/src/tensor_ops/stack/cuda_kernel.rs @@ -60,6 +60,7 @@ impl super::StackKernel for Cuda { let src = BWD_KERNEL.replace("$Ty", E::NAME); let opts = CompileOptions { arch: Some(env!("CUDA_COMPUTE_CAP")), + include_paths: vec![env!("CUDA_INCLUDE_DIR").to_string()], ..Default::default() }; let ptx = compile_ptx_with_opts(src, opts).unwrap(); @@ -81,6 +82,7 @@ impl super::StackKernel for Cuda { } const BWD_KERNEL: &str = " +#include \"cuda_fp16.h\" extern \"C\" __global__ void stack_bwd(const size_t numel, const $Ty *inp, $Ty *out) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < numel) { out[i] += inp[i]; } diff --git a/src/tensor_ops/stddev_to.rs b/src/tensor_ops/stddev_to.rs index 5b7c03da1..5c2811c64 100644 --- a/src/tensor_ops/stddev_to.rs +++ b/src/tensor_ops/stddev_to.rs @@ -15,7 +15,7 @@ pub trait StddevTo: HasErr + HasShape { /// let r = t.stddev::, _>(0.0); // or `stddev::<_, Axis<1>>(0.0)` /// assert_eq!(r.array(), [0.6666667_f32.sqrt(), 6.0_f32.sqrt()]); /// ``` - fn stddev(self, epsilon: impl Into) -> Self::WithShape + fn stddev(self, epsilon: impl Into) -> Self::WithShape where Self::Shape: HasAxes + ReduceShapeTo, { @@ -24,7 +24,7 @@ pub trait StddevTo: HasErr + HasShape { /// Fallible version of [StddevTo::stddev] fn try_stddev( self, - epsilon: impl Into, + epsilon: impl Into, ) -> Result, Self::Err> where Self::Shape: HasAxes + ReduceShapeTo; @@ -33,12 +33,14 @@ pub trait StddevTo: HasErr + HasShape { impl, T: Tape> StddevTo for Tensor { fn try_stddev( self, - epsilon: impl Into, + epsilon: impl Into, ) -> Result, Self::Err> where Self::Shape: HasAxes + ReduceShapeTo, { - 
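The square change above from 2.0 * x to x + x is more than style: a 2.0 literal is a double, which silently promotes the f32 kernel's arithmetic to double and does not compile against __half at all, while x + x is exact and cheap for every dtype:

// d/dx (x * x) = 2x, written without any floating-point literal so the same
// text instantiates cleanly for __half, float, and double.
template<typename T>
__device__ __forceinline__ T square_bwd(T x) {
    return x + x;
}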
self.try_var()?.try_add(epsilon.into())?.try_sqrt() + self.try_var()? + .try_add(E::from_f64(epsilon.into()).unwrap())? + .try_sqrt() } } @@ -50,7 +52,9 @@ mod tests { #[test] fn test_std_axis_0_2d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]) + .to_dtype::(); let r = t.leaky_trace().stddev::, _>(1e-8); assert_close_to_literal!(r, [0.5, 0.0001, 1.0, 3.0]); let g = r.mean().backward(); @@ -63,7 +67,9 @@ mod tests { #[test] fn test_std_axis_1_2d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]) + .to_dtype::(); let r = t.leaky_trace().stddev::, _>(0.0); assert_close_to_literal!(r, [1.118034, 3.7666297]); let g = r.mean().backward(); diff --git a/src/tensor_ops/sub/binary_sub.cu b/src/tensor_ops/sub/binary_sub.cu index 2512bbd6d..44b93302d 100644 --- a/src/tensor_ops/sub/binary_sub.cu +++ b/src/tensor_ops/sub/binary_sub.cu @@ -2,6 +2,11 @@ struct BinarySubKernelOp {}; +BINARY_OP(__half, bsub_fwd_f16, bsub_bwd_lhs_f16, bsub_bwd_rhs_f16, BinarySubKernelOp, + x - y, + 1.0, + -1.0) + BINARY_OP(float, bsub_fwd_f32, bsub_bwd_lhs_f32, bsub_bwd_rhs_f32, BinarySubKernelOp, x - y, 1.0, diff --git a/src/tensor_ops/sub/cuda_kernel.rs b/src/tensor_ops/sub/cuda_kernel.rs index 033ab4150..60cb86ebd 100644 --- a/src/tensor_ops/sub/cuda_kernel.rs +++ b/src/tensor_ops/sub/cuda_kernel.rs @@ -1,6 +1,8 @@ use super::{BinarySubKernelOp as Binary, ScalarSubKernelOp as Scalar}; use crate::tensor_ops::cuda_kernels::{cuda_binary, cuda_unary}; +#[cfg(feature = "f16")] +unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Binary {} @@ -8,8 +10,19 @@ unsafe impl cudarc::driver::DeviceRepr for Binary {} const SCALAR_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/scalar_sub.ptx")); const BINARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/binary_sub.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(const_df() Scalar, half::f16, SCALAR_PTX, "ssub_fwd_f16", "ssub_bwd_f16"); cuda_unary!(const_df() Scalar, f32, SCALAR_PTX, "ssub_fwd_f32", "ssub_bwd_f32"); cuda_unary!(const_df() Scalar, f64, SCALAR_PTX, "ssub_fwd_f64", "ssub_bwd_f64"); +#[cfg(feature = "f16")] +cuda_binary!( + const_df() Binary, + half::f16, + BINARY_PTX, + "bsub_fwd_f16", + "bsub_bwd_lhs_f16", + "bsub_bwd_rhs_f16" +); cuda_binary!( const_df() Binary, f32, diff --git a/src/tensor_ops/sub/mod.rs b/src/tensor_ops/sub/mod.rs index a54c8bd70..30e4de5a4 100644 --- a/src/tensor_ops/sub/mod.rs +++ b/src/tensor_ops/sub/mod.rs @@ -69,6 +69,16 @@ impl, E>, T: Tape> } } +#[cfg(feature = "f16")] +impl, half::f16>, T: Tape> + TrySub for Tensor +{ + fn try_sub(self, rhs: f32) -> Result { + let scalar = half::f16::from_f32(rhs); + try_unary_op(ScalarSubKernelOp { scalar }, self) + } +} + impl, Rhs> std::ops::Sub for Tensor where @@ -90,8 +100,8 @@ mod tests { fn test_sub_0d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor(1.0); - let b: Tensor<_, TestDtype, _> = dev.tensor(1.0); + let a = dev.tensor(1.0).to_dtype::(); + let b = dev.tensor(1.0).to_dtype::(); let r = b.leaky_trace() - a.clone(); assert_close_to_literal!(r, 0.0); @@ -103,8 +113,8 @@ mod tests { 
#[test] fn test_sub_1d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); - let b: Tensor<_, TestDtype, _> = dev.tensor([1.0, -1.0, 0.0]); + let a = dev.tensor([1.0, 2.0, 3.0]).to_dtype::(); + let b = dev.tensor([1.0, -1.0, 0.0]).to_dtype::(); let r = b.leaky_trace() - a.clone(); assert_close_to_literal!(r, [0.0, -3.0, -3.0]); @@ -116,10 +126,12 @@ mod tests { #[test] fn test_sub_2d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = - dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]); - let b: Tensor<_, TestDtype, _> = - dev.tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]); + let a = dev + .tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]) + .to_dtype::(); + let b = dev + .tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]) + .to_dtype::(); let r = b.leaky_trace() - a.clone(); assert_close_to_literal!(r, [[-0.1371, 0.2136, 0.2259], [0.2601, -0.3328, -0.7954]]); @@ -131,7 +143,7 @@ mod tests { #[test] fn test_scalar_sub_0d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor(0.0); + let x = dev.tensor(0.0).to_dtype::(); let r = x.leaky_trace() - 1.0; assert_close_to_literal!(r, -1.0); let g = r.exp().backward(); @@ -141,7 +153,7 @@ mod tests { #[test] fn test_scalar_sub_1d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([0.0, 1.0, 2.0]); + let x = dev.tensor([0.0, 1.0, 2.0]).to_dtype::(); let r = x.leaky_trace() - 1.0; assert_close_to_literal!(r, [-1.0, 0.0, 1.0]); let g = r.exp().sum().backward(); @@ -151,7 +163,7 @@ mod tests { #[test] fn test_scalar_sub_2d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([[0.0; 2]; 3]); + let x = dev.tensor([[0.0; 2]; 3]).to_dtype::(); let r = x.leaky_trace() - 1.0; assert_close_to_literal!(r, [[-1.0; 2]; 3]); let g = r.exp().sum().backward(); diff --git a/src/tensor_ops/sub/scalar_sub.cu b/src/tensor_ops/sub/scalar_sub.cu index 67c334dc4..5dc49a334 100644 --- a/src/tensor_ops/sub/scalar_sub.cu +++ b/src/tensor_ops/sub/scalar_sub.cu @@ -5,6 +5,10 @@ struct ScalarSubKernelOp { F scalar; }; +UNARY_OP(__half, ssub_fwd_f16, ssub_bwd_f16, ScalarSubKernelOp<__half>, + x - op.scalar, + 1.0); + UNARY_OP(float, ssub_fwd_f32, ssub_bwd_f32, ScalarSubKernelOp, x - op.scalar, 1.0); diff --git a/src/tensor_ops/sum_to/cuda_kernel.rs b/src/tensor_ops/sum_to/cuda_kernel.rs index 69a8eda44..fc007b29a 100644 --- a/src/tensor_ops/sum_to/cuda_kernel.rs +++ b/src/tensor_ops/sum_to/cuda_kernel.rs @@ -15,6 +15,12 @@ trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "sum_f16"; + const FNS: &'static [&'static str] = &["sum_to_fwd_f16", "sum_to_bwd_f16"]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "sum_f32"; const FNS: &'static [&'static str] = &["sum_to_fwd_f32", "sum_to_bwd_f32"]; diff --git a/src/tensor_ops/sum_to/mod.rs b/src/tensor_ops/sum_to/mod.rs index a000c32e1..095f8eaf9 100644 --- a/src/tensor_ops/sum_to/mod.rs +++ b/src/tensor_ops/sum_to/mod.rs @@ -86,7 +86,7 @@ mod tests { #[test] fn test_sum_1d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); + let t = dev.tensor([1.0, 2.0, 3.0]).to_dtype::(); let r = t.leaky_trace().sum::(); let e = 6.0f64; assert_close_to_literal!(r, e); @@ -98,7 +98,9 @@ mod tests { #[test] fn test_sum_axis_0_2d() { let 
dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]) + .to_dtype::<TestDtype>(); let r = t.leaky_trace().sum::<Rank1<3>, _>(); let e = [-1.0f64, 6.0, -3.0]; assert_close_to_literal!(r, e); @@ -109,7 +111,9 @@ #[test] fn test_sum_axis_1_2d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]) + .to_dtype::<TestDtype>(); let r = t.leaky_trace().sum::<Rank1<2>, _>(); let e = [6.0f64, -4.0]; assert_close_to_literal!(r, e); @@ -144,7 +148,7 @@ #[test] fn test_sum_chunking() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0; 100]; 60]); + let t = dev.tensor([[1.0; 100]; 60]).to_dtype::<TestDtype>(); let r = t.leaky_trace().sum::<Rank1<60>, _>(); assert_close_to_literal!(r, [100.0; 60]); let g = r.sum().backward(); @@ -154,7 +158,7 @@ #[test] fn test_sum_reduce_to_more_than_physical_elements() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); + let a = dev.tensor([1.0, 2.0, 3.0]).to_dtype::<TestDtype>(); let b = a.broadcast::<Rank3<4, 2, 3>, _>(); let c = b.sum::<Rank2<4, 3>, _>(); assert_close_to_literal!(c, [[2.0, 4.0, 6.0]; 4]); diff --git a/src/tensor_ops/sum_to/sum_to.cu b/src/tensor_ops/sum_to/sum_to.cu index 776afa37d..d0c9c7f43 100644 --- a/src/tensor_ops/sum_to/sum_to.cu +++ b/src/tensor_ops/sum_to/sum_to.cu @@ -77,5 +77,6 @@ extern "C" __global__ void BWD( \ sum_to_bwd(numel, num_dims, elems_per_thread, info, grad_inp, grad_out); \ } +SUM(__half, sum_to_fwd_f16, sum_to_bwd_f16); SUM(float, sum_to_fwd_f32, sum_to_bwd_f32); SUM(double, sum_to_fwd_f64, sum_to_bwd_f64); diff --git a/src/tensor_ops/tanh/cuda_kernel.rs b/src/tensor_ops/tanh/cuda_kernel.rs index a2e325110..c4ce875ae 100644 --- a/src/tensor_ops/tanh/cuda_kernel.rs +++ b/src/tensor_ops/tanh/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for TanhKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/tanh.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(df(f(x)) TanhKernelOp, half::f16, PTX, "tanh_fwd_f16", "tanh_bwd_f16"); cuda_unary!(df(f(x)) TanhKernelOp, f32, PTX, "tanh_fwd_f32", "tanh_bwd_f32"); cuda_unary!(df(f(x)) TanhKernelOp, f64, PTX, "tanh_fwd_f64", "tanh_bwd_f64"); diff --git a/src/tensor_ops/tanh/mod.rs b/src/tensor_ops/tanh/mod.rs index 09d8e0317..09c67e53e 100644 --- a/src/tensor_ops/tanh/mod.rs +++ b/src/tensor_ops/tanh/mod.rs @@ -45,7 +45,9 @@ mod tests { #[test] fn test_tanh() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::<TestDtype>(); let r = x.leaky_trace().tanh(); assert_close_to_literal!(r, [-0.9640276, -0.7615942, 0., 0.7615942, 0.9640276]); let g = r.mean().backward(); diff --git a/src/tensor_ops/tanh/tanh.cu b/src/tensor_ops/tanh/tanh.cu index b40716b30..0a50c1e01 100644 --- a/src/tensor_ops/tanh/tanh.cu +++ b/src/tensor_ops/tanh/tanh.cu @@ -2,11 +2,21 @@ struct TanhKernelOp {}; +template<typename T> +__device__ __forceinline__ T tanh_bwd(T y) { + T one = 1.0; + return one - y * y; +} + +UNARY_OP(__half, tanh_fwd_f16, tanh_bwd_f16, TanhKernelOp, + tanhg(x), + tanh_bwd(y)) + UNARY_OP(float, tanh_fwd_f32, tanh_bwd_f32, TanhKernelOp, - tanhf(x), - 1 - y * y) + tanhg(x), + tanh_bwd(y)) UNARY_OP(double, tanh_fwd_f64, tanh_bwd_f64, TanhKernelOp, - tanh(x), 
- 1 - y * y) + tanhg(x), + tanh_bwd(y)) \ No newline at end of file diff --git a/src/tensor_ops/to_dtype/cuda_kernel.rs b/src/tensor_ops/to_dtype/cuda_kernel.rs index ba4a2246a..b5597a422 100644 --- a/src/tensor_ops/to_dtype/cuda_kernel.rs +++ b/src/tensor_ops/to_dtype/cuda_kernel.rs @@ -4,7 +4,7 @@ use crate::{ }; use cudarc::{ driver::{DeviceSlice, LaunchAsync}, - nvrtc::compile_ptx, + nvrtc::{compile_ptx_with_opts, CompileOptions}, types::CudaTypeName, }; @@ -14,6 +14,7 @@ typedef long int intptr_t; #else typedef int intptr_t; #endif +#include \"cuda_fp16.h\" extern \"C\" __global__ void kernel(const size_t n, const $Src *inp, $Dst *out) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { out[i] = inp[i]; } @@ -26,7 +27,12 @@ impl super::ToDtypeKernel = dev.tensor( - [[[ - [1., 2., 3., 4., 5., 6.], - [1., 2., 3., 4., 5., 6.], - [1., 2., 3., 4., 5., 6.], - [1., 2., 3., 4., 5., 6.], - [1., 2., 3., 4., 5., 6.], - ]; 4]; 3], - ); + let t = dev + .tensor( + [[[ + [1., 2., 3., 4., 5., 6.], + [1., 2., 3., 4., 5., 6.], + [1., 2., 3., 4., 5., 6.], + [1., 2., 3., 4., 5., 6.], + [1., 2., 3., 4., 5., 6.], + ]; 4]; 3], + ) + .to_dtype::(); assert_close_to_literal!( t.clone().lower_tri(None), [[[ diff --git a/src/tensor_ops/upscale2d/cuda_kernel.rs b/src/tensor_ops/upscale2d/cuda_kernel.rs index 2cc7e9049..19bba285c 100644 --- a/src/tensor_ops/upscale2d/cuda_kernel.rs +++ b/src/tensor_ops/upscale2d/cuda_kernel.rs @@ -25,6 +25,16 @@ trait HasCudaKernel { const FWD: &'static str; const BWD: &'static str; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const FWD: &'static str = "nearest_upscale2d_fwd_f16"; + const BWD: &'static str = "nearest_upscale2d_bwd_f16"; +} +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const FWD: &'static str = "bilinear_upscale2d_fwd_f16"; + const BWD: &'static str = "bilinear_upscale2d_bwd_f16"; +} impl HasCudaKernel for Cuda { const FWD: &'static str = "nearest_upscale2d_fwd_f32"; const BWD: &'static str = "nearest_upscale2d_bwd_f32"; diff --git a/src/tensor_ops/upscale2d/mod.rs b/src/tensor_ops/upscale2d/mod.rs index 8e60acd10..ea3a0e599 100644 --- a/src/tensor_ops/upscale2d/mod.rs +++ b/src/tensor_ops/upscale2d/mod.rs @@ -254,7 +254,9 @@ mod tests { fn test_upscale2d_nearest_even() { let dev = TestDevice::default(); - let x = dev.tensor([[[1.0, 0.0], [2.0, 3.0]]]); + let x = dev + .tensor([[[1.0, 0.0], [2.0, 3.0]]]) + .to_dtype::(); let y = x.leaky_trace().upscale2d::<4, 4, _>(NearestNeighbor); assert_close_to_literal!( y, @@ -277,7 +279,9 @@ mod tests { fn test_upscale2d_nearest_uneven() { let dev = TestDevice::default(); - let x = dev.tensor([[[1.0, 0.0, 2.0], [2.0, 3.0, 4.0]]]); + let x = dev + .tensor([[[1.0, 0.0, 2.0], [2.0, 3.0, 4.0]]]) + .to_dtype::(); let y = x.leaky_trace().upscale2d::<2, 7, _>(NearestNeighbor); assert_close_to_literal!( y, @@ -301,24 +305,23 @@ mod tests { fn test_upscale2d_nearest_batched() { let dev = TestDevice::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]); + let x = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + .to_dtype::(); let x: Tensor, _, _> = [x.clone(), x.clone(), x].stack(); let x: Tensor, _, _> = [x.clone(), x.clone(), x.clone(), x.clone(), x].stack(); let y = x.leaky_trace().upscale2d::<5, 6, _>(NearestNeighbor); - let y_array = y.array(); - for img in y_array { - assert_eq!( - img, - [[ - [1., 1., 2., 2., 3., 3.], - [1., 1., 2., 2., 3., 3.], - [1., 1., 2., 2., 3., 3.], - [4., 4., 5., 5., 6., 6.], - [4., 4., 5., 5., 6., 6.] 
- ]; 3] - ); - } + assert_close_to_literal!( + y, + [[[ + [1., 1., 2., 2., 3., 3.], + [1., 1., 2., 2., 3., 3.], + [1., 1., 2., 2., 3., 3.], + [4., 4., 5., 5., 6., 6.], + [4., 4., 5., 5., 6., 6.] + ]; 3]; 5] + ); let grads = y.exp().mean().backward(); assert_close_to_literal!( @@ -335,7 +338,9 @@ mod tests { fn test_upscale2d_bilinear_even() { let dev = TestDevice::default(); - let x = dev.tensor([[[1.0, 0.0], [2.0, 3.0]]]); + let x = dev + .tensor([[[1.0, 0.0], [2.0, 3.0]]]) + .to_dtype::(); let y = x.leaky_trace().upscale2d::<4, 4, _>(Bilinear); assert_close_to_literal!( y, @@ -358,7 +363,9 @@ mod tests { fn test_upscale2d_bilinear_uneven() { let dev = TestDevice::default(); - let x = dev.tensor([[[1.0, 0.0, 2.0], [2.0, 3.0, 4.0]]]); + let x = dev + .tensor([[[1.0, 0.0, 2.0], [2.0, 3.0, 4.0]]]) + .to_dtype::(); let y = x.leaky_trace().upscale2d::<2, 7, _>(Bilinear); assert_close_to_literal!( y, @@ -382,7 +389,9 @@ mod tests { fn test_bilinear_upscale2d_batched() { let dev = TestDevice::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]); + let x = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + .to_dtype::(); let x: Tensor, _, _> = [x.clone(), x.clone(), x].stack(); let x: Tensor, _, _> = [x.clone(), x.clone(), x.clone(), x.clone(), x].stack(); diff --git a/src/tensor_ops/upscale2d/upscale2d.cu b/src/tensor_ops/upscale2d/upscale2d.cu index 49ad1b91a..2953960ea 100644 --- a/src/tensor_ops/upscale2d/upscale2d.cu +++ b/src/tensor_ops/upscale2d/upscale2d.cu @@ -106,9 +106,11 @@ __device__ void bilinear_upscale2d_fwd( inp += b * inp_strides[0] + c * inp_strides[1]; - T ll = inp[y0 * inp_strides[2] + x0 * inp_strides[3]] * (1-hs) * (1-ws); - T lh = inp[y0 * inp_strides[2] + x1 * inp_strides[3]] * (1-hs) * ws; - T hl = inp[y1 * inp_strides[2] + x0 * inp_strides[3]] * hs * (1-ws); + T one = 1.0; + + T ll = inp[y0 * inp_strides[2] + x0 * inp_strides[3]] * (one-hs) * (one-ws); + T lh = inp[y0 * inp_strides[2] + x1 * inp_strides[3]] * (one-hs) * ws; + T hl = inp[y1 * inp_strides[2] + x0 * inp_strides[3]] * hs * (one-ws); T hh = inp[y1 * inp_strides[2] + x1 * inp_strides[3]] * hs * ws; out[i] = ll + lh + hl + hh; @@ -150,9 +152,11 @@ __device__ void bilinear_upscale2d_bwd( grad_inp += b * inp_strides[0] + c * inp_strides[1]; - atomicAdd(grad_inp + y0 * inp_strides[2] + x0 * inp_strides[3], go * (1-hs) * (1-ws)); - atomicAdd(grad_inp + y0 * inp_strides[2] + x1 * inp_strides[3], go * (1-hs) * ws); - atomicAdd(grad_inp + y1 * inp_strides[2] + x0 * inp_strides[3], go * hs * (1-ws)); + const T one = 1.0; + + atomicAdd(grad_inp + y0 * inp_strides[2] + x0 * inp_strides[3], go * (one-hs) * (one-ws)); + atomicAdd(grad_inp + y0 * inp_strides[2] + x1 * inp_strides[3], go * (one-hs) * ws); + atomicAdd(grad_inp + y1 * inp_strides[2] + x0 * inp_strides[3], go * hs * (one-ws)); atomicAdd(grad_inp + y1 * inp_strides[2] + x1 * inp_strides[3], go * hs * ws); } @@ -175,13 +179,19 @@ extern "C" __global__ void bwd( \ } UPSCALE_OP( - float, - nearest_upscale2d_fwd_f32, nearest_upscale2d_bwd_f32, + __half, + nearest_upscale2d_fwd_f16, nearest_upscale2d_bwd_f16, nearest_upscale2d_fwd, nearest_upscale2d_bwd ); UPSCALE_OP( - double, - nearest_upscale2d_fwd_f64, nearest_upscale2d_bwd_f64, + __half, + bilinear_upscale2d_fwd_f16, bilinear_upscale2d_bwd_f16, + bilinear_upscale2d_fwd, bilinear_upscale2d_bwd +); + +UPSCALE_OP( + float, + nearest_upscale2d_fwd_f32, nearest_upscale2d_bwd_f32, nearest_upscale2d_fwd, nearest_upscale2d_bwd ); UPSCALE_OP( @@ -189,6 +199,11 @@ UPSCALE_OP( 
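The bilinear hunks above introduce a local one so the four corner weights stay in T rather than promoting to double. Those weights, (1-hs)(1-ws), (1-hs)ws, hs(1-ws), and hs*ws, always sum to one, so the forward pass is a convex blend of the neighbors and the backward pass scatters go with exactly the same weights. A host-side sanity check with hypothetical fractional offsets:

#include <cstdio>

int main() {
    // Hypothetical fractional offsets within a source cell.
    double hs = 0.25, ws = 0.6;
    double w[4] = {(1 - hs) * (1 - ws), (1 - hs) * ws, hs * (1 - ws), hs * ws};
    // The four corner weights always sum to exactly 1.
    std::printf("sum = %f\n", w[0] + w[1] + w[2] + w[3]);
}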
bilinear_upscale2d_fwd_f32, bilinear_upscale2d_bwd_f32, bilinear_upscale2d_fwd, bilinear_upscale2d_bwd ); +UPSCALE_OP( + double, + nearest_upscale2d_fwd_f64, nearest_upscale2d_bwd_f64, + nearest_upscale2d_fwd, nearest_upscale2d_bwd +); UPSCALE_OP( double, bilinear_upscale2d_fwd_f64, bilinear_upscale2d_bwd_f64, diff --git a/src/tensor_ops/utilities/binary_op_macros.cuh b/src/tensor_ops/utilities/binary_op_macros.cuh index 0212c215f..9878c3239 100644 --- a/src/tensor_ops/utilities/binary_op_macros.cuh +++ b/src/tensor_ops/utilities/binary_op_macros.cuh @@ -68,8 +68,9 @@ extern "C" __global__ void BACKWARD_LHS( \ tmp_i /= dims[d]; \ } \ unsigned int lhs_i = i / chunk_len; \ - TYPENAME x = lhs ? lhs[lhs_i] : 0; \ - TYPENAME y = rhs ? rhs[rhs_i] : 0; \ + TYPENAME zero = 0.0; \ + TYPENAME x = lhs ? lhs[lhs_i] : zero; \ + TYPENAME y = rhs ? rhs[rhs_i] : zero; \ TYPENAME go = grad_out[out_i]; \ \ TYPENAME dfdx = (DFDX); \ @@ -107,8 +108,9 @@ extern "C" __global__ void BACKWARD_RHS( \ } \ unsigned int rhs_i = i / chunk_len; \ \ - TYPENAME x = lhs ? lhs[lhs_i] : 0; \ - TYPENAME y = rhs ? rhs[rhs_i] : 0; \ + TYPENAME zero = 0.0; \ + TYPENAME x = lhs ? lhs[lhs_i] : zero; \ + TYPENAME y = rhs ? rhs[rhs_i] : zero; \ TYPENAME go = grad_out[out_i]; \ \ TYPENAME dfdy = (DFDY); \ diff --git a/src/tensor_ops/utilities/compatibility.cuh b/src/tensor_ops/utilities/compatibility.cuh new file mode 100644 index 000000000..93195a7f4 --- /dev/null +++ b/src/tensor_ops/utilities/compatibility.cuh @@ -0,0 +1,171 @@ +#include "cuda_fp16.h" + +// Table showing which features are supported on which compute capability +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications + +// FIXME: the minimum compute capabilities are just guesses since the table is not specific enough + +#if __CUDA_ARCH__ < 600 +__device__ __forceinline__ __half __hmax(__half a, __half b) { + return __float2half(fmaxf(__half2float(a), __half2float(b))); +} +__device__ __forceinline__ __half __hmin(__half a, __half b) { + return __float2half(fminf(__half2float(a), __half2float(b))); +} +#endif + +#if __CUDA_ARCH__ < 700 +__device__ __forceinline__ __half __hmax_nan(__half a, __half b) { + return __hisnan(a) ? a : (__hisnan(b) ? b : __hmax(a, b)); +} +__device__ __forceinline__ __half __hmin_nan(__half a, __half b) { + return __hisnan(a) ? a : (__hisnan(b) ? b : __hmin(a, b)); +} +#endif + +#if __CUDA_ARCH__ < 600 +// Copied from https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions +__device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif + + +#if __CUDA_ARCH__ < 700 +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd +// The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. 
+// Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119 +__device__ __half atomicAdd(__half *address, __half val) { + unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + bool unaligned = (size_t) address & 2; + do { + assumed = old; + unsigned int hsum; + hsum = unaligned ? (old >> 16) : (old & 0xffff); + hsum = __half_as_ushort(__ushort_as_half(hsum) + val); + old = atomicCAS(address_as_ui, assumed, + unaligned ? (old & 0xffff) | (hsum << 16) : (old & 0xffff0000) | hsum + ); + + } while (assumed != old); + return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff)); +} +#endif + + +__device__ __forceinline__ __half atomicMaxf(__half* address, __half val) { +#if __CUDA_ARCH__ < 700 + // On older GPUs we do not have access to atomicCAS for shorts, so we have to do some trickery. + // Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119 + unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + bool unaligned = (size_t) address & 2; + do { + assumed = old; + unsigned int hmax; + hmax = unaligned ? (old >> 16) : (old & 0xffff); + hmax = __half_as_ushort(__hmax_nan(val, __ushort_as_half(hmax))); + old = atomicCAS(address_as_ui, assumed, + unaligned ? (old & 0xffff) | (hmax << 16) : (old & 0xffff0000) | hmax + ); + + } while (assumed != old); + return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff)); +#else + // Based on https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions + unsigned short int* casted_address = (unsigned short int*)address; + unsigned short int old = *casted_address; + unsigned short int assumed; + do { + assumed = old; + old = atomicCAS(casted_address, assumed, __half_as_ushort(__hmax_nan(val, __ushort_as_half(assumed)))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __ushort_as_half(old); +#endif +} + +// atomicMax is not implemented for floats, +// solution copied https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ float atomicMaxf(float * addr, float value) { + if (signbit(value)) { + return __uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value))); + } else { + return __int_as_float(atomicMax((int *)addr, __float_as_int(value))); + } +} + +__device__ __forceinline__ double atomicMaxf(double * addr, double value) { + if (signbit(value)) { + return __longlong_as_double(atomicMin((unsigned long long int *)addr, __double_as_longlong(value))); + } else { + return __longlong_as_double(atomicMax((long long int *)addr, __double_as_longlong(value))); + } +} + + +__device__ __forceinline__ __half atomicMinf(__half* address, __half val) { +#if __CUDA_ARCH__ < 700 + // On older GPUs we do not have access to atomicCAS for shorts, so we have to do some trickery. + // Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119 + unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + bool unaligned = (size_t) address & 2; + do { + assumed = old; + unsigned int hmin; + hmin = unaligned ? 
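A small usage sketch for the emulated half-precision atomicAdd above, the accumulation primitive the f16 backward kernels and sum_to depend on. This is a hypothetical demo; on compute capability 7.0 and higher the native intrinsic is used instead:

#include <cstdio>
#include "cuda_fp16.h"

extern "C" __global__ void accumulate_ones(__half *acc) {
    atomicAdd(acc, __float2half(1.0f));
}

int main() {
    __half *acc;
    cudaMallocManaged(&acc, sizeof(__half));
    *acc = __float2half(0.0f);
    accumulate_ones<<<4, 64>>>(acc); // 256 increments; 256 is exact in f16
    cudaDeviceSynchronize();
    std::printf("%f\n", __half2float(*acc));
    cudaFree(acc);
}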
(old >> 16) : (old & 0xffff); + hmin = __half_as_ushort(__hmin_nan(val, __ushort_as_half(hmin))); + old = atomicCAS(address_as_ui, assumed, + unaligned ? (old & 0xffff) | (hmin << 16) : (old & 0xffff0000) | hmin + ); + + } while (assumed != old); + return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff)); +#else + // Based on https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions + unsigned short int* casted_address = (unsigned short int*)address; + unsigned short int old = *casted_address; + unsigned short int assumed; + do { + assumed = old; + old = atomicCAS(casted_address, assumed, __half_as_ushort(__hmin_nan(val, __ushort_as_half(assumed)))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __ushort_as_half(old); +#endif +} + +// atomicMin is not implemented for floats, +// solution copied https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ float atomicMinf(float * addr, float value) { + if (signbit(value)) { + return __uint_as_float(atomicMax((unsigned int *)addr, __float_as_uint(value))); + } else { + return __int_as_float(atomicMin((int *)addr, __float_as_int(value))); + } +} + +__device__ __forceinline__ double atomicMinf(double * addr, double value) { + if (signbit(value)) { + return __longlong_as_double(atomicMax((unsigned long long int *)addr, __double_as_longlong(value))); + } else { + return __longlong_as_double(atomicMin((long long int *)addr, __double_as_longlong(value))); + } +} \ No newline at end of file diff --git a/src/tensor_ops/utilities/cuda_utils.cuh b/src/tensor_ops/utilities/cuda_utils.cuh index 0ebe1470a..5915107f8 100644 --- a/src/tensor_ops/utilities/cuda_utils.cuh +++ b/src/tensor_ops/utilities/cuda_utils.cuh @@ -1,4 +1,5 @@ #include "cuda_fp16.h" +#include "compatibility.cuh" __device__ unsigned int get_strided_index( unsigned int idx, @@ -93,6 +94,14 @@ __device__ void chunk_sum( } } +extern "C" __global__ void fill_with_f16(__half *buf, __half value, const size_t numel) { + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= numel) { + return; + } + buf[i] = value; +} + extern "C" __global__ void fill_with_f32(float *buf, float value, const size_t numel) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= numel) { @@ -109,21 +118,43 @@ extern "C" __global__ void fill_with_f64(double *buf, double value, const size_t buf[i] = value; } + +__device__ __forceinline__ bool isnang(float a) { return isnan(a); } +__device__ __forceinline__ bool isnang(double a) { return isnan(a); } +__device__ __forceinline__ bool isnang(__half a) { return __hisnan(a); } +__device__ __forceinline__ float recipg(float a) { return 1.0 / a; } +__device__ __forceinline__ double recipg(double a) { return 1.0 / a; } +__device__ __forceinline__ __half recipg(__half a) { __half one = 1.0; return one / a; } +__device__ __forceinline__ float cosg(float a) { return cosf(a); } +__device__ __forceinline__ double cosg(double a) { return cos(a); } +__device__ __forceinline__ __half cosg(__half a) { return hcos(a); } +__device__ __forceinline__ float sing(float a) { return sinf(a); } +__device__ __forceinline__ double sing(double a) { return sin(a); } +__device__ __forceinline__ __half sing(__half a) { return hsin(a); } __device__ __forceinline__ float sqrtg(float a) { return sqrtf(a); } __device__ __forceinline__ double sqrtg(double a) { return sqrt(a); } +__device__ __forceinline__ __half 
sqrtg(__half a) { return hsqrt(a); } __device__ __forceinline__ float powg(float a, float b) { return powf(a, b); } __device__ __forceinline__ double powg(double a, double b) { return pow(a, b); } __device__ __forceinline__ __half powg(__half a, __half b) { return __float2half(powf(__half2float(a), __half2float(b))); } __device__ __forceinline__ float tanhg(float a) { return tanhf(a); } __device__ __forceinline__ double tanhg(double a) { return tanh(a); } __device__ __forceinline__ __half tanhg(__half a) { return __float2half(tanhf(__half2float(a))); } __device__ __forceinline__ float maxg(float a, float b) { return fmaxf(a, b); } __device__ __forceinline__ double maxg(double a, double b) { return fmax(a, b); } __device__ __forceinline__ __half maxg(__half a, __half b) { return __hmax_nan(a, b); } __device__ __forceinline__ float ming(float a, float b) { return fminf(a, b); } __device__ __forceinline__ double ming(double a, double b) { return fmin(a, b); } __device__ __forceinline__ __half ming(__half a, __half b) { return __hmin_nan(a, b); } __device__ __forceinline__ float logg(float a) { return logf(a); } __device__ __forceinline__ double logg(double a) { return log(a); } __device__ __forceinline__ __half logg(__half a) { return hlog(a); } __device__ __forceinline__ float expg(float a) { return expf(a); } __device__ __forceinline__ double expg(double a) { return exp(a); } __device__ __forceinline__ __half expg(__half a) { return hexp(a); } __device__ __forceinline__ float absg(float a) { return fabsf(a); } __device__ __forceinline__ double absg(double a) { return fabs(a); } __device__ __forceinline__ __half absg(__half a) { return __habs(a); } __device__ __forceinline__ float copysigng(float a, float b) { return copysignf(a, b); } __device__ __forceinline__ double copysigng(double a, double b) { return copysign(a, b); } __device__ __forceinline__ __half copysigng(__half a, __half b) { return __float2half(copysignf(__half2float(a), __half2float(b))); } diff --git a/src/tensor_ops/utilities/device.rs b/src/tensor_ops/utilities/device.rs index c209bd9fd..e20cee99d 100644 --- a/src/tensor_ops/utilities/device.rs +++ b/src/tensor_ops/utilities/device.rs @@ -102,9 +102,14 @@ pub trait Device<E: Dtype>: { } +#[cfg(feature = "f16")] +impl Device<half::f16> for crate::tensor::Cpu {} impl Device<f32> for crate::tensor::Cpu {} impl Device<f64> for crate::tensor::Cpu {} +#[cfg(all(feature = "cuda", feature = "f16"))] +impl Device<half::f16> for crate::tensor::Cuda {} #[cfg(feature = "cuda")] impl Device<f32> for crate::tensor::Cuda {} diff --git a/src/tensor_ops/utilities/unary_op_macros.cuh b/src/tensor_ops/utilities/unary_op_macros.cuh index e86322e61..4fafc7a13 100644 --- a/src/tensor_ops/utilities/unary_op_macros.cuh +++ b/src/tensor_ops/utilities/unary_op_macros.cuh @@ -1,3 +1,5 @@ +#include "cuda_utils.cuh" + #define LONG_UNARY_OP(TYPENAME, FORWARD, BACKWARD, OP_STRUCT, FUNC, DERIVATIVE) \ extern "C" __global__ void FORWARD( \ const OP_STRUCT op, \ @@ -26,8 +28,9 @@ extern "C" __global__ void BACKWARD( \ return; \ } \ \ - TYPENAME x = inp ? inp[i] : 0; \ - TYPENAME y = out ? out[i] : 0; \ + TYPENAME zero = 0.0; \ + TYPENAME x = inp ? inp[i] : zero; \ + TYPENAME y = out ? 
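The *g suffix seen throughout cuda_utils.cuh is what lets one macro body serve all three dtypes: each op resolves by overload to the best implementation, a native __half intrinsic where one exists (hsin, hexp, hsqrt) or a float round-trip where none does (powg, tanhg). Extending the convention would look like this hypothetical log2 trio:

#include "cuda_fp16.h"

// Hypothetical new entry following the cuda_utils.cuh overload convention.
__device__ __forceinline__ float log2g(float a) { return log2f(a); }
__device__ __forceinline__ double log2g(double a) { return log2(a); }
__device__ __forceinline__ __half log2g(__half a) { return hlog2(a); }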
out[i] : zero; \ TYPENAME dx; \ DERIVATIVE \ grad_inp[i] += dx * grad_out[i]; \ diff --git a/src/tensor_ops/var_to.rs b/src/tensor_ops/var_to.rs index 423cf8b59..8dd5f003d 100644 --- a/src/tensor_ops/var_to.rs +++ b/src/tensor_ops/var_to.rs @@ -48,7 +48,9 @@ mod tests { #[test] fn test_var_axis_0_2d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]) + .to_dtype::<TestDtype>(); let r = t.leaky_trace().var::<Rank1<4>, _>(); assert_close_to_literal!(r, [0.25, 0.0, 1.0, 9.0]); let g = r.mean().backward(); @@ -61,7 +63,9 @@ #[test] fn test_var_axis_1_2d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]) + .to_dtype::<TestDtype>(); let r = t.leaky_trace().var::<Rank1<2>, _>(); assert_close_to_literal!(r, [1.25, 14.1875]); let g = r.mean().backward();
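Finally, a back-of-envelope note on why these tests compare with assert_close_to_literal once f16 is in the mix: half has 10 explicit mantissa bits, so machine epsilon is 2^-10 (about 9.8e-4), and a handful of chained ops lands near the 1e-2 default tolerance the test harness uses for half::f16:

#include <cstdio>

int main() {
    // f16: 10 explicit mantissa bits -> machine epsilon 2^-10.
    double eps = 1.0 / (1 << 10);
    std::printf("f16 eps ~ %g; ten chained ops ~ %g\n", eps, 10 * eps);
}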