diff --git a/Cargo.toml b/Cargo.toml index 9c161cfe8..f1bad8cef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,10 +33,11 @@ matrixmultiply = { version = "0.3.2", default-features = false, optional = true zip = { version = "0.6.2", default-features = false, optional = true } cblas-sys = { version = "0.1.4", default-features = false, optional = true } libc = { version = "0.2", default-features = false, optional = true } -cudarc = { version = "0.9.7", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc"] } +cudarc = { git = "https://github.com/coreylowman/cudarc", branch = "dfdx-half", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc", "f16"] } num-traits = { version = "0.2.15", default-features = false } safetensors = { version = "0.3", default-features = false, optional = true } memmap2 = { version = "0.5", default-features = false, optional = true } +half = { git = "https://github.com/starkat99/half-rs.git", branch = "main", optional = true, features = ["num-traits", "rand_distr"] } [dev-dependencies] tempfile = "3.3.0" @@ -48,7 +49,7 @@ glob = { version = "0.3.1", optional = true } [features] default = ["std", "fast-alloc", "cpu-par-matmul"] -nightly = [] +nightly = ["half?/use-intrinsics"] std = ["cudarc?/std", "matrixmultiply?/std", "rand_distr/std_math"] fast-alloc = ["std"] @@ -61,9 +62,12 @@ cpu-mkl-matmul = ["dep:cblas-sys", "dep:libc"] cuda = ["dep:cudarc", "dep:glob"] cudnn = ["cuda", "cudarc?/cudnn"] +f16 = ["dep:half"] + numpy = ["dep:zip", "std"] safetensors = ["dep:safetensors", "std", "dep:memmap2"] +test-f16 = ["f16"] test-f64 = [] test-integrations = [] ci-check = ["cudarc?/ci-check"] diff --git a/build.rs b/build.rs index 3860f5fb0..826c3e9ce 100644 --- a/build.rs +++ b/build.rs @@ -4,6 +4,9 @@ fn main() { // If on nightly, enable "nightly" feature maybe_enable_nightly(); + #[cfg(feature = "cuda")] + cuda::set_include_dir(); + #[cfg(feature = "cuda")] cuda::build_ptx(); @@ -25,6 +28,52 @@ fn maybe_enable_nightly() { #[cfg(feature = "cuda")] mod cuda { + pub fn set_include_dir() { + // NOTE: copied from cudarc build.rs. + // We can't actually set a env!() value from another crate, + // so we have to do that here. 
+
+        use std::path::PathBuf;
+
+        let env_vars = [
+            "CUDA_PATH",
+            "CUDA_ROOT",
+            "CUDA_TOOLKIT_ROOT_DIR",
+            "CUDNN_LIB",
+        ];
+        #[allow(unused)]
+        let env_vars = env_vars
+            .into_iter()
+            .map(std::env::var)
+            .filter_map(Result::ok)
+            .map(Into::<PathBuf>::into);
+
+        let roots = [
+            "/usr",
+            "/usr/local/cuda",
+            "/opt/cuda",
+            "/usr/lib/cuda",
+            "C:/Program Files/NVIDIA GPU Computing Toolkit",
+            "C:/CUDA",
+        ];
+        #[allow(unused)]
+        let roots = roots.into_iter().map(Into::<PathBuf>::into);
+
+        #[cfg(feature = "ci-check")]
+        let root: PathBuf = "ci".into();
+
+        #[cfg(not(feature = "ci-check"))]
+        let root = env_vars
+            .chain(roots)
+            .find(|path| path.join("include").join("cuda.h").is_file())
+            .unwrap();
+
+        println!(
+            "cargo:rustc-env=CUDA_INCLUDE_DIR={}",
+            root.join("include").display()
+        );
+    }
+
     pub fn build_ptx() {
         let out_dir = std::env::var("OUT_DIR").unwrap();
         let kernel_paths: Vec<std::path::PathBuf> = glob::glob("src/**/*.cu")
@@ -38,6 +87,10 @@ mod cuda {
 
         for path in &mut include_directories {
             println!("cargo:rerun-if-changed={}", path.display());
+            let destination =
+                std::format!("{out_dir}/{}", path.file_name().unwrap().to_str().unwrap());
+            println!("cargo:rerun-if-changed={}", destination);
+            std::fs::copy(path.clone(), destination).unwrap();
             // remove the filename from the path so it's just the directory
             path.pop();
         }
@@ -130,6 +183,8 @@ mod cuda {
                 .args(["--output-directory", &out_dir])
                 .args(&include_options)
                 .arg(p)
+                .stdout(std::process::Stdio::piped())
+                .stderr(std::process::Stdio::piped())
                 .spawn()
                 .unwrap()
         })
@@ -139,7 +194,9 @@ mod cuda {
         let output = child.wait_with_output().unwrap();
         assert!(
             output.status.success(),
-            "nvcc error while compiling {kernel_path:?}: {output:?}",
+            "nvcc error while compiling {kernel_path:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
+            String::from_utf8_lossy(&output.stdout),
+            String::from_utf8_lossy(&output.stderr)
         );
     }
diff --git a/src/lib.rs b/src/lib.rs
index 0aef06d65..60915a9d6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -241,6 +241,7 @@ pub fn keep_denormals() {
 
 #[cfg(test)]
 pub(crate) mod tests {
+    pub use num_traits::{Float, FromPrimitive, NumCast, Zero};
 
     #[cfg(not(feature = "cuda"))]
     pub type TestDevice = crate::tensor::Cpu;
@@ -248,9 +249,15 @@ pub(crate) mod tests {
     #[cfg(feature = "cuda")]
     pub type TestDevice = crate::tensor::Cuda;
 
-    #[cfg(not(feature = "test-f64"))]
+    #[cfg(all(feature = "test-f64", feature = "test-f16"))]
+    compile_error!("f64 and f16 cannot be tested at the same time");
+
+    #[cfg(all(not(feature = "test-f16"), not(feature = "test-f64")))]
     pub type TestDtype = f32;
 
+    #[cfg(feature = "test-f16")]
+    pub type TestDtype = half::f16;
+
     #[cfg(feature = "test-f64")]
     pub type TestDtype = f64;
 
@@ -275,6 +282,19 @@ pub(crate) mod tests {
         }
     }
 
+    #[cfg(feature = "f16")]
+    impl AssertClose for half::f16 {
+        type Elem = Self;
+        const DEFAULT_TOLERANCE: Self::Elem = half::f16::from_f32_const(1e-2);
+        fn get_far_pair(&self, rhs: &Self, tolerance: Self) -> Option<(Self, Self)> {
+            if num_traits::Float::abs(self - rhs) > tolerance {
+                Some((*self, *rhs))
+            } else {
+                None
+            }
+        }
+    }
+
     impl AssertClose for f32 {
         type Elem = f32;
         const DEFAULT_TOLERANCE: Self::Elem = 1e-6;
@@ -349,12 +369,9 @@ pub(crate) mod tests {
     macro_rules! assert_close_to_literal {
         ($Lhs:expr, $Rhs:expr) => {{
             let lhs = $Lhs.array();
+            let rhs = $Rhs.ndmap(|x| num_traits::FromPrimitive::from_f64(x).unwrap());
             let tol = AssertClose::get_default_tol(&lhs);
-            let far_pair = AssertClose::get_far_pair(
-                &lhs,
-                &$Rhs.ndmap(|x| num_traits::FromPrimitive::from_f64(x).unwrap()),
-                tol,
-            );
+            let far_pair = AssertClose::get_far_pair(&lhs, &rhs, tol);
             if let Some((l, r)) = far_pair {
                 panic!("lhs != rhs | {l} != {r}");
             }
@@ -411,5 +428,6 @@ pub(crate) mod tests {
             }
         }};
     }
+    pub(crate) use assert_close;
 }
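Note on the test plumbing above: with `test-f16` enabled every test runs at half precision, which is why `AssertClose` for `half::f16` uses the loose `1e-2` default tolerance. A minimal standalone sketch of why that tolerance is needed (assumes only the `half` crate; not part of the PR itself):

```rust
use half::f16;

fn main() {
    // f16 carries ~3 decimal digits: the spacing between representable
    // values near 1.0 is 2^-10 ≈ 9.8e-4, so a 1e-4 perturbation vanishes.
    let x = f16::from_f32(1.0) + f16::from_f32(1e-4);
    assert_eq!(x, f16::from_f32(1.0));
    // Hence comparisons against f32-derived literals need a ~1e-2 tolerance.
}
```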
diff --git a/src/losses.rs b/src/losses.rs
index e09ef67fc..5da12f2a3 100644
--- a/src/losses.rs
+++ b/src/losses.rs
@@ -47,7 +47,7 @@ pub fn mae_loss<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>>(
 pub fn huber_loss<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>>(
     pred: Tensor<S, E, D, T>,
     targ: Tensor<S, E, D>,
-    delta: impl Into<E>,
+    delta: impl Into<f64>,
 ) -> Tensor<Rank0, E, D, T> {
     pred.huber_error(targ, delta).mean()
 }
@@ -62,10 +62,10 @@ pub fn huber_loss<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>>(
 pub fn smooth_l1_loss<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>>(
     pred: Tensor<S, E, D, T>,
     targ: Tensor<S, E, D>,
-    delta: impl Into<E>,
+    delta: impl Into<f64>,
 ) -> Tensor<Rank0, E, D, T> {
-    let delta = delta.into();
-    huber_loss(pred, targ, delta) / delta
+    let delta: f64 = delta.into();
+    huber_loss(pred, targ, delta) / E::from_f64(delta).unwrap()
 }
 
 /// [Cross entropy loss](https://en.wikipedia.org/wiki/Cross_entropy#Cross-entropy_loss_function_and_logistic_regression).
@@ -132,10 +132,12 @@ mod tests {
     #[test]
     fn test_mse() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> =
-            dev.tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048]);
-        let y: Tensor<_, TestDtype, _> =
-            dev.tensor([-0.90954804, -1.0193185, -0.39221755, 2.2524886, 1.3035554]);
+        let x = dev
+            .tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048])
+            .to_dtype::<TestDtype>();
+        let y = dev
+            .tensor([-0.90954804, -1.0193185, -0.39221755, 2.2524886, 1.3035554])
+            .to_dtype::<TestDtype>();
         let loss = mse_loss(x.leaky_trace(), y);
         assert_close_to_literal!(loss, 1.0846305);
         let g = loss.backward();
@@ -148,10 +150,12 @@ mod tests {
     #[test]
     fn test_mae() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> =
-            dev.tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048]);
-        let y: Tensor<_, TestDtype, _> =
-            dev.tensor([-0.90954804, -1.0193186, -0.39221755, 2.2524886, 1.3035554]);
+        let x = dev
+            .tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048])
+            .to_dtype::<TestDtype>();
+        let y = dev
+            .tensor([-0.90954804, -1.0193186, -0.39221755, 2.2524886, 1.3035554])
+            .to_dtype::<TestDtype>();
         let loss = mae_loss(x.leaky_trace(), y);
         assert_close_to_literal!(loss, 0.9042107);
         let g = loss.backward();
@@ -161,14 +165,18 @@ mod tests {
     #[test]
     fn test_soft_cross_entropy() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.01322946, 0.7367754, -0.8874471, 0.6997109, 0.98312855],
-            [-0.19822043, 1.192167, -0.7495395, -1.5733303, -1.4898887],
-        ]);
-        let y: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.3180433, 0.15164024, 0.2352255, 0.08821669, 0.20687431],
-            [0.15627657, 0.29779273, 0.10897867, 0.2879545, 0.14899758],
-        ]);
+        let x = dev
+            .tensor([
+                [0.01322946, 0.7367754, -0.8874471, 0.6997109, 0.98312855],
+                [-0.19822043, 1.192167, -0.7495395, -1.5733303, -1.4898887],
+            ])
+            .to_dtype::<TestDtype>();
+        let y = dev
+            .tensor([
+                [0.3180433, 0.15164024, 0.2352255, 0.08821669, 0.20687431],
+                [0.15627657, 0.29779273, 0.10897867, 0.2879545, 0.14899758],
+            ])
+            .to_dtype::<TestDtype>();
         let loss = cross_entropy_with_logits_loss(x.leaky_trace(), y.clone());
         assert_close_to_literal!(loss, 1.9889611);
         let g = loss.backward();
@@ -191,13 +199,14 @@ mod tests {
     #[test]
     fn test_hard_crossentropy() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> =
-            dev.tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048]);
+        let x = dev
+            .tensor([0.87248087, -0.24252531, -1.0060949, 1.155084, 1.5545048])
+            .to_dtype::<TestDtype>();
         let losses = [1.5655229, 2.680529, 3.444099, 1.2829198, 0.883499];
         for i in 0..5 {
             let mut targ = [0.0; 5];
             targ[i] = 1.0;
-            let y = dev.tensor(targ);
+            let y = dev.tensor(targ).to_dtype::<TestDtype>();
             let loss = cross_entropy_with_logits_loss(x.leaky_trace(), y.clone());
             assert_close_to_literal!(loss, losses[i]);
         }
@@ -206,20 +215,24 @@ mod tests {
     #[test]
     fn test_kl_div() {
         let dev: TestDevice = Default::default();
-        let logits: Tensor<_, TestDtype, _> = dev.tensor([
-            [-0.2354, 0.4408, 0.9688],
-            [-0.2187, -0.3451, -1.5473],
-            [0.7420, 0.7186, 1.0785],
-            [-1.2231, 0.2536, 0.3489],
-            [-0.9163, -0.2289, 0.2576],
-        ]);
-        let targ: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.3178, 0.5344, 0.1479],
-            [0.1915, 0.6178, 0.1907],
-            [0.4834, 0.1789, 0.3377],
-            [0.5809, 0.3623, 0.0568],
-            [0.0166, 0.8512, 0.1322],
-        ]);
+        let logits = dev
+            .tensor([
+                [-0.2354, 0.4408, 0.9688],
+                [-0.2187, -0.3451, -1.5473],
+                [0.7420, 0.7186, 1.0785],
+                [-1.2231, 0.2536, 0.3489],
+                [-0.9163, -0.2289, 0.2576],
+            ])
+            .to_dtype::<TestDtype>();
+        let targ = dev
+            .tensor([
+                [0.3178, 0.5344, 0.1479],
+                [0.1915, 0.6178, 0.1907],
+                [0.4834, 0.1789, 0.3377],
+                [0.5809, 0.3623, 0.0568],
+                [0.0166, 0.8512, 0.1322],
+            ])
+            .to_dtype::<TestDtype>();
         let loss = kl_div_with_logits_loss(logits.leaky_trace(), targ);
         assert_close_to_literal!(loss, 0.40656143);
         let g = loss.backward();
@@ -238,16 +251,20 @@ mod tests {
     #[test]
     fn test_bce() {
         let dev: TestDevice = Default::default();
-        let logit: Tensor<_, TestDtype, _> = dev.tensor([
-            [-0.4092005, -0.6706018, 0.9201696],
-            [-1.6583557, 1.6978683, -1.4827578],
-            [-0.9571696, -1.0971526, 0.8801755],
-        ]);
-        let prob: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.365251, 0.8322099, 0.482717],
-            [0.168392, 0.7987092, 0.1177533],
-            [0.7026833, 0.5563793, 0.6429267],
-        ]);
+        let logit = dev
+            .tensor([
+                [-0.4092005, -0.6706018, 0.9201696],
+                [-1.6583557, 1.6978683, -1.4827578],
+                [-0.9571696, -1.0971526, 0.8801755],
+            ])
+            .to_dtype::<TestDtype>();
+        let prob = dev
+            .tensor([
+                [0.365251, 0.8322099, 0.482717],
+                [0.168392, 0.7987092, 0.1177533],
+                [0.7026833, 0.5563793, 0.6429267],
+            ])
+            .to_dtype::<TestDtype>();
         let loss = binary_cross_entropy_with_logits_loss(logit.leaky_trace(), prob.clone());
         assert_close_to_literal!(loss, 0.7045728);
 
@@ -275,9 +292,10 @@ mod tests {
     #[test]
     fn test_bce_wide_range() {
         let dev: TestDevice = Default::default();
-        let logit: Tensor<_, TestDtype, _> =
-            dev.tensor([[100.0; 3], [-100.0; 3], [-1.0, 0.0, 1.0]]);
-        let targ: Tensor<_, TestDtype, _> = dev.tensor([[0.0, 0.5, 1.0]; 3]);
+        let logit = dev
+            .tensor([[100.0; 3], [-100.0; 3], [-1.0, 0.0, 1.0]])
+            .to_dtype::<TestDtype>();
+        let targ = dev.tensor([[0.0, 0.5, 1.0]; 3]).to_dtype::<TestDtype>();
 
         let loss = binary_cross_entropy_with_logits_loss(logit.leaky_trace(), targ.clone());
         assert_close_to_literal!(loss, 33.479964);
@@ -306,16 +324,20 @@ mod tests {
     #[test]
     fn test_huber_loss() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([
-            [1.0095837, -1.0026205, -0.1126093, -0.1539351, -0.3688708],
-            [2.6373475, 0.6761999, -1.3586733, 0.486154, -0.6206786],
-            [-1.2967702, -0.1273358, 1.3558478, 0.0787393, 1.0921133],
-        ]);
-        let y: Tensor<_, TestDtype, _> = dev.tensor([
-            [1.2569424, -1.2246597, 0.7995769, 0.0339246, -0.3688708],
-            [1.472675, 0.8260061, 0.7839395, -0.0541475, -0.6206786],
-            [-2.0449343, 1.8117315, 1.7505344, -1.2522424, 1.0921133],
-        ]);
+        let x = dev
+            .tensor([
+                [1.0095837, -1.0026205, -0.1126093, -0.1539351, -0.3688708],
+                [2.6373475, 0.6761999, -1.3586733, 0.486154, -0.6206786],
+                [-1.2967702, -0.1273358, 1.3558478, 0.0787393, 1.0921133],
+            ])
+            .to_dtype::<TestDtype>();
+        let y = dev
+            .tensor([
+                [1.2569424, -1.2246597, 0.7995769, 0.0339246, -0.3688708],
+                [1.472675, 0.8260061, 0.7839395, -0.0541475, -0.6206786],
+                [-2.0449343, 1.8117315, 1.7505344, -1.2522424, 1.0921133],
+            ])
+            .to_dtype::<TestDtype>();
 
         let loss = huber_loss(x.leaky_trace(), y.clone(), 0.5);
         assert_close_to_literal!(loss, 0.24506615);
@@ -342,16 +364,20 @@ mod tests {
     #[test]
     fn test_smooth_l1_loss() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([
-            [1.0095837, -1.0026205, -0.1126093, -0.1539351, -0.3688708],
-            [2.6373475, 0.6761999, -1.3586733, 0.486154, -0.6206786],
-            [-1.2967702, -0.1273358, 1.3558478, 0.0787393, 1.0921133],
-        ]);
-        let y: Tensor<_, TestDtype, _> = dev.tensor([
-            [1.2569424, -1.2246597, 0.7995769, 0.0339246, -0.3688708],
-            [1.472675, 0.8260061, 0.7839395, -0.0541475, -0.6206786],
-            [-2.0449343, 1.8117315, 1.7505344, -1.2522424, 1.0921133],
-        ]);
+        let x = dev
+            .tensor([
+                [1.0095837, -1.0026205, -0.1126093, -0.1539351, -0.3688708],
+                [2.6373475, 0.6761999, -1.3586733, 0.486154, -0.6206786],
+                [-1.2967702, -0.1273358, 1.3558478, 0.0787393, 1.0921133],
+            ])
+            .to_dtype::<TestDtype>();
+        let y = dev
+            .tensor([
+                [1.2569424, -1.2246597, 0.7995769, 0.0339246, -0.3688708],
+                [1.472675, 0.8260061, 0.7839395, -0.0541475, -0.6206786],
+                [-2.0449343, 1.8117315, 1.7505344, -1.2522424, 1.0921133],
+            ])
+            .to_dtype::<TestDtype>();
 
         let loss = smooth_l1_loss(x.leaky_trace(), y.clone(), 0.5);
         assert_close_to_literal!(loss, 0.4901323);
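The `delta: impl Into<f64>` change above is the pattern used throughout this PR: hyperparameters are carried as `f64` and narrowed to the tensor dtype `E` only where they enter the math. A self-contained sketch of that conversion (assumes the `half` crate with its `num-traits` feature; `scale_by_delta` is an illustrative helper, not dfdx API):

```rust
use num_traits::FromPrimitive;

// Mirrors `huber_loss(..) / E::from_f64(delta).unwrap()` in smooth_l1_loss.
fn scale_by_delta<E: FromPrimitive + std::ops::Div<Output = E>>(x: E, delta: f64) -> E {
    x / E::from_f64(delta).unwrap()
}

fn main() {
    assert_eq!(scale_by_delta(1.0f32, 0.5), 2.0);
    let h = half::f16::from_f32(1.0);
    assert_eq!(scale_by_delta(h, 0.5), half::f16::from_f32(2.0));
}
```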
diff --git a/src/nn/batchnorm1d.rs b/src/nn/batchnorm1d.rs
index f72c9a591..e1d0674ab 100644
--- a/src/nn/batchnorm1d.rs
+++ b/src/nn/batchnorm1d.rs
@@ -1,5 +1,4 @@
 use crate::{shapes::*, tensor::*, tensor_ops::*};
-use num_traits::FromPrimitive;
 
 use super::{
     batchnorm2d::{infer_fwd, train_fwd},
@@ -66,11 +65,11 @@ pub struct BatchNorm1D<const C: usize, E: Dtype, D: Device<E>> {
     /// Spatial variance that is updated during training. Defaults to 1.0
     pub running_var: Tensor<Rank1<C>, E, D>,
     /// Added to variance before taking sqrt for numerical stability. Defaults to 1e-5
-    pub epsilon: E,
+    pub epsilon: f64,
     /// Controls exponential moving average of running stats.Defaults to 0.1
     ///
     /// `running_stat * (1.0 - momentum) + stat * momentum`.
-    pub momentum: E,
+    pub momentum: f64,
 }
 
 impl<const C: usize, E: Dtype, D: Device<E>> BatchNorm1D<C, E, D> {
@@ -206,8 +205,8 @@ impl<const C: usize, E: Dtype, D: Device<E>> TensorCollection<E, D> for BatchNor
                 bias,
                 running_mean,
                 running_var,
-                epsilon: V::E2::from_f32(1e-5).unwrap(),
-                momentum: V::E2::from_f32(0.1).unwrap(),
+                epsilon: 1e-5,
+                momentum: 0.1,
             },
         )
     }
diff --git a/src/nn/batchnorm2d.rs b/src/nn/batchnorm2d.rs
index 120d5b04e..996ad11a7 100644
--- a/src/nn/batchnorm2d.rs
+++ b/src/nn/batchnorm2d.rs
@@ -25,20 +25,20 @@ pub fn train_fwd<const C: usize, S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D
     mean: &mut Tensor<Rank1<C>, E, D>,
     scale: &Tensor<Rank1<C>, E, D>,
     bias: &Tensor<Rank1<C>, E, D>,
-    epsilon: E,
-    momentum: E,
+    epsilon: f64,
+    momentum: f64,
 ) -> Result<Tensor<S, E, D, T>, D::Err>
 where
     S: HasAxes<Ax> + ReduceShapeTo<Rank1<C>, Ax>,
 {
-    let n = E::from_usize(<S as HasAxes<Ax>>::size(x.shape())).unwrap();
+    let n = f64::from_usize(<S as HasAxes<Ax>>::size(x.shape())).unwrap();
     let shape = *x.shape();
 
     // compute statistics for updating running stats later - on tape
     let mean_chan = x.retaped::<T>().try_mean::<Rank1<C>, _>()?;
 
     // update statistics since we are training - off tape
-    mean.try_axpy(E::ONE - momentum, &mean_chan, momentum)?;
+    mean.try_axpy(1.0 - momentum, &mean_chan, momentum)?;
 
     let centered = x.try_sub(mean_chan.try_broadcast_like(&shape)?)?;
 
@@ -48,10 +48,12 @@ where
         .try_mean::<Rank1<C>, _>()?;
 
     // NOTE: uses unbiased variance in running estimate
-    var.try_axpy(E::ONE - momentum, &var_chan, momentum * n / (n - E::ONE))?;
+    var.try_axpy(1.0 - momentum, &var_chan, momentum * n / (n - 1.0))?;
 
     // statistics for normalizing - on tape
-    let std = var_chan.try_add(epsilon)?.try_sqrt()?;
+    let std = var_chan
+        .try_add(E::from_f64(epsilon).unwrap())?
+        .try_sqrt()?;
 
     // record broadcast of scale & bias - on tape
     let scale = scale
@@ -71,7 +73,7 @@ pub fn infer_fwd<const C: usize, S: Shape, E: Dtype, D: Device<E>, Ax: Axes>(
     mean: &Tensor<Rank1<C>, E, D>,
     scale: &Tensor<Rank1<C>, E, D>,
     bias: &Tensor<Rank1<C>, E, D>,
-    epsilon: E,
+    epsilon: f64,
 ) -> Result<Tensor<S, E, D>, D::Err>
 where
     Rank1<C>: BroadcastShapeTo<S, Ax>,
@@ -79,7 +81,10 @@ where
     let shape = *x.shape();
 
     // statistics for normalizing
-    let std = var.clone().try_add(epsilon)?.try_sqrt()?;
+    let std = var
+        .clone()
+        .try_add(E::from_f64(epsilon).unwrap())?
+        .try_sqrt()?;
 
     let scale = scale.clone().try_div(std)?.try_broadcast_like(&shape)?;
 
@@ -134,11 +139,11 @@ pub struct BatchNorm2D<const C: usize, E: Dtype, D: Device<E>> {
     /// Spatial variance that is updated during training. Defaults to 1.0
     pub running_var: Tensor<Rank1<C>, E, D>,
     /// Added to variance before taking sqrt for numerical stability. Defaults to 1e-5
-    pub epsilon: E,
+    pub epsilon: f64,
     /// Controls exponential moving average of running stats.Defaults to 0.1
     ///
     /// `running_stat * (1.0 - momentum) + stat * momentum`.
-    pub momentum: E,
+    pub momentum: f64,
 }
 
 impl<const C: usize, E: Dtype, D: Device<E>> BatchNorm2D<C, E, D> {
@@ -273,8 +278,8 @@ impl<const C: usize, E: Dtype, D: Device<E>> TensorCollection<E, D> for BatchNor
                 bias,
                 running_mean,
                 running_var,
-                epsilon: V::E2::from_f32(1e-5).unwrap(),
-                momentum: V::E2::from_f32(0.1).unwrap(),
+                epsilon: 1e-5,
+                momentum: 0.1,
             },
         )
     }
diff --git a/src/nn/conv.rs b/src/nn/conv.rs
index 942ad80e9..07214ec30 100644
--- a/src/nn/conv.rs
+++ b/src/nn/conv.rs
@@ -185,7 +185,10 @@ mod tests {
         let out = m.forward(dev.sample_normal::>().leaky_trace());
         let g = out.square().mean().backward();
 
-        assert_ne!(g.get(&m.weight).array(), [[[[0.0; 3]; 3]; 2]; 4]);
+        assert_ne!(
+            g.get(&m.weight).array(),
+            [[[[TestDtype::zero(); 3]; 3]; 2]; 4]
+        );
 
         opt.update(&mut m, &g).expect("unused params");
 
diff --git a/src/nn/convtrans.rs b/src/nn/convtrans.rs
index 12f708e79..f338f2ebb 100644
--- a/src/nn/convtrans.rs
+++ b/src/nn/convtrans.rs
@@ -185,7 +185,10 @@ mod tests {
         let out = m.forward(dev.sample_normal::>().leaky_trace());
         let g = out.square().mean().backward();
 
-        assert_ne!(g.get(&m.weight).array(), [[[[0.0; 3]; 3]; 2]; 4]);
+        assert_ne!(
+            g.get(&m.weight).array(),
+            [[[[TestDtype::zero(); 3]; 3]; 2]; 4]
+        );
 
         opt.update(&mut m, &g).expect("unused params");
 
diff --git a/src/nn/dropout.rs b/src/nn/dropout.rs
index 87aa1a07f..0ed7d5183 100644
--- a/src/nn/dropout.rs
+++ b/src/nn/dropout.rs
@@ -74,7 +74,7 @@ impl<const N: usize, S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> ModuleMut
         input: Tensor<S, E, D, T>,
     ) -> Result<Self::Output, D::Err> {
-        input.try_dropout(E::ONE / E::from_usize(N).unwrap())
+        input.try_dropout(1.0 / N as f64)
     }
 }
 
@@ -150,7 +150,7 @@ impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> ModuleMut
         &mut self,
         input: Tensor<S, E, D, T>,
     ) -> Result<Self::Output, D::Err> {
-        input.try_dropout(E::from_f32(self.p).unwrap())
+        input.try_dropout(self.p)
     }
 }
 
diff --git a/src/nn/ema.rs b/src/nn/ema.rs
index 5b2d15f6d..c5643e6fb 100644
--- a/src/nn/ema.rs
+++ b/src/nn/ema.rs
@@ -2,10 +2,10 @@ use super::tensor_collection::*;
 
 use crate::{shapes::*, tensor::*, tensor_ops::Device};
 
-struct ModelEMAOp<E> {
-    decay: E,
+struct ModelEMAOp {
+    decay: f64,
 }
-impl<E: Dtype, D: Device<E>> TensorVisitor<E, D> for ModelEMAOp<E> {
+impl<E: Dtype, D: Device<E>> TensorVisitor<E, D> for ModelEMAOp {
     type Viewer = (ViewTensorMut, ViewTensorRef);
     type Err = D::Err;
     type E2 = E;
@@ -17,7 +17,7 @@ impl<E: Dtype, D: Device<E>> TensorVisitor<E, D> for ModelEMAOp {
         (dst, src): (&mut Tensor<S, E, D>, &Tensor<S, E, D>),
     ) -> Result<Option<Tensor<S, Self::E2, D>>, Self::Err> {
         if opts.do_gradient_update {
-            dst.try_axpy(self.decay, src, E::ONE - self.decay)?;
+            dst.try_axpy(self.decay, src, 1.0 - self.decay)?;
         }
         Ok(None)
     }
@@ -42,11 +42,12 @@ pub trait ModelEMA<E: Dtype, D: Device<E>>: TensorCollection<E, D> {
     ///
     /// **Only updates trainable parameters**. For example, batch normalization
     /// running parameters are not updated.
-    fn ema(&mut self, other: &Self, decay: E) {
+    fn ema(&mut self, other: &Self, decay: impl Into<f64>) {
         self.try_ema(other, decay).unwrap();
     }
 
-    fn try_ema(&mut self, other: &Self, decay: E) -> Result<(), D::Err> {
+    fn try_ema(&mut self, other: &Self, decay: impl Into<f64>) -> Result<(), D::Err> {
+        let decay = decay.into();
         let mut op = ModelEMAOp { decay };
         Self::iter_tensors(&mut RecursiveWalker {
             m: (self, other),
@@ -75,7 +76,7 @@ mod tests {
         ema1.1 .1.running_var.fill_with_distr(distr);
         let ema0 = ema1.clone();
 
-        let decay: TestDtype = 0.5;
+        let decay = 0.5;
         ema1.ema(&model, decay);
         // check that batchnorm running params aren't updated
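With `decay: impl Into<f64>`, callers pass a plain literal no matter what dtype the model holds. A hedged call-site fragment (model construction elided; `model` and `ema_model` stand for any matching `TensorCollection` pair):

```rust
// `0.999` is accepted as f64 for f16/f32/f64 models alike.
ema_model.ema(&model, 0.999);
// fallible variant:
ema_model.try_ema(&model, 0.999)?;
```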
diff --git a/src/nn/embedding.rs b/src/nn/embedding.rs
index 0a109b95b..ec576dac9 100644
--- a/src/nn/embedding.rs
+++ b/src/nn/embedding.rs
@@ -122,7 +122,7 @@ mod tests {
     use super::*;
     use crate::tests::*;
 
-    const W: [[TestDtype; 5]; 2] = [
+    const W: [[f64; 5]; 2] = [
         [-0.3458893, -0.30371523, -0.3712057, 0.14303583, -0.0268966],
         [0.11733949, 0.14059687, -0.10670426, -0.09373143, 0.18974298],
     ];
@@ -131,9 +131,9 @@ mod tests {
     fn test_embedding_initialize() {
         let dev: TestDevice = Default::default();
         let m = dev.build_module::<Embedding<2000, 1>, TestDtype>();
-        let bound = 1.0 / (2000.0.sqrt());
+        let bound: TestDtype = NumCast::from(1.0 / (2000.0.sqrt())).unwrap();
         for v in m.weight.as_vec() {
-            assert!(-bound <= v && v <= bound && v != 0.0);
+            assert!(-bound <= v && v <= bound && v != TestDtype::zero());
         }
     }
 
@@ -143,7 +143,8 @@ mod tests {
 
         let model = Embedding {
             weight: dev.tensor(W),
-        };
+        }
+        .to_dtype::<TestDtype>();
 
         let x = dev.tensor([0, 0, 1]);
         let y = model.forward(x.leaky_trace());
diff --git a/src/nn/generalized_residual.rs b/src/nn/generalized_residual.rs
index 0acc3789a..9000c5f11 100644
--- a/src/nn/generalized_residual.rs
+++ b/src/nn/generalized_residual.rs
@@ -106,9 +106,10 @@ mod tests {
         let dev: TestDevice = Default::default();
 
         type Model = GeneralizedResidual<Linear<2, 2>, Linear<2, 2>>;
-        let model = dev.build_module::<Model, TestDtype>();
+        let model = dev.build_module::<Model, f32>().to_dtype::<TestDtype>();
 
-        let x = dev.sample_normal::<Rank1<2>>();
+        let x: Tensor<Rank1<2>, f32, _> = dev.sample_normal();
+        let x = x.to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
 
         #[rustfmt::skip]
diff --git a/src/nn/impl_module_for_tuples.rs b/src/nn/impl_module_for_tuples.rs
index c267b7566..1dd2cbdd6 100644
--- a/src/nn/impl_module_for_tuples.rs
+++ b/src/nn/impl_module_for_tuples.rs
@@ -108,7 +108,7 @@ mod tests {
     fn test_2_tuple_update() {
         let dev: TestDevice = Default::default();
         type Model = (Linear<2, 3>, Linear<3, 4>);
-        let mut model = Model::build_on_device(&dev);
+        let mut model = dev.build_module::<Model, TestDtype>();
         assert_ne!(model.0.weight.array(), [[0.0; 2]; 3]);
         assert_ne!(model.0.bias.array(), [0.0; 3]);
         assert_ne!(model.1.weight.array(), [[0.0; 3]; 4]);
diff --git a/src/nn/layer_norm.rs b/src/nn/layer_norm.rs
index f7559a7a3..3c2f25581 100644
--- a/src/nn/layer_norm.rs
+++ b/src/nn/layer_norm.rs
@@ -1,5 +1,4 @@
 use crate::{shapes::*, tensor::*, tensor_ops::*};
-use num_traits::FromPrimitive;
 
 use super::*;
 
@@ -40,7 +39,7 @@ where
 pub struct LayerNorm1D<const M: usize, E: Dtype, D: Device<E>> {
     pub gamma: Tensor<Rank1<M>, E, D>,
     pub beta: Tensor<Rank1<M>, E, D>,
-    pub epsilon: E,
+    pub epsilon: f64,
 }
 
 impl<const M: usize, E: Dtype, D: Device<E>> NonMutableModule for LayerNorm1D<M, E, D> {}
 
@@ -69,7 +68,7 @@ impl<const M: usize, E: Dtype, D: Device<E>> TensorCollection<E, D> for LayerNor
             |(gamma, beta)| LayerNorm1D {
                 gamma,
                 beta,
-                epsilon: V::E2::from_f32(1e-5).unwrap(),
+                epsilon: 1e-5,
             },
         )
     }
@@ -126,19 +125,19 @@ mod tests {
         let dev: TestDevice = Default::default();
         let mut m = dev.build_module::<LayerNorm1D<5>, TestDtype>();
 
-        assert_eq!(m.gamma.array(), [1.0; 5]);
-        assert_eq!(m.beta.array(), [0.0; 5]);
+        assert_close_to_literal!(m.gamma, [1.0; 5]);
+        assert_close_to_literal!(m.beta, [0.0; 5]);
 
         m.gamma = dev.sample_normal();
         m.beta = dev.sample_normal();
 
-        assert_ne!(m.gamma.array(), [1.0; 5]);
-        assert_ne!(m.beta.array(), [0.0; 5]);
+        assert_ne!(m.gamma.array(), [TestDtype::ONE; 5]);
+        assert_ne!(m.beta.array(), [TestDtype::default(); 5]);
 
         m.reset_params();
 
-        assert_eq!(m.gamma.array(), [1.0; 5]);
-        assert_eq!(m.beta.array(), [0.0; 5]);
+        assert_close_to_literal!(m.gamma, [1.0; 5]);
+        assert_close_to_literal!(m.beta, [0.0; 5]);
     }
 
     #[test]
diff --git a/src/nn/linear.rs b/src/nn/linear.rs
index 3e951ccd8..c76a30a29 100644
--- a/src/nn/linear.rs
+++ b/src/nn/linear.rs
@@ -154,11 +154,11 @@ mod tests {
     use super::*;
     use crate::tests::*;
 
-    const W: [[TestDtype; 5]; 2] = [
+    const W: [[f64; 5]; 2] = [
         [-0.3458893, -0.30371523, -0.3712057, 0.14303583, -0.0268966],
         [0.11733949, 0.14059687, -0.10670426, -0.09373143, 0.18974298],
     ];
-    const B: [TestDtype; 2] = [0.3765365, -0.290717];
+    const B: [f64; 2] = [0.3765365, -0.290717];
 
     #[test]
     fn test_linear_ondevice() {
@@ -173,13 +173,12 @@ mod tests {
     fn test_linear_initialize() {
         let dev: TestDevice = Default::default();
         let m = dev.build_module::<Linear<2000, 1>, TestDtype>();
-        let bound: TestDtype = 1.0 / 2000.0;
-        let bound = bound.sqrt();
+        let bound: TestDtype = NumCast::from((1.0 / 2000.0f64).sqrt()).unwrap();
         for v in m.weight.as_vec() {
-            assert!(-bound <= v && v <= bound && v != 0.0);
+            assert!(-bound <= v && v <= bound);
         }
         for v in m.bias.as_vec() {
-            assert!(-bound <= v && v <= bound && v != 0.0);
+            assert!(-bound <= v && v <= bound);
         }
     }
 
@@ -190,9 +189,12 @@ mod tests {
         let model = Linear {
             weight: dev.tensor(W),
             bias: dev.tensor(B),
-        };
+        }
+        .to_dtype::<TestDtype>();
 
-        let x = dev.tensor([-0.8808001, 2.4185333, 2.2478335, 0.0565211, 2.031299]);
+        let x = dev
+            .tensor([-0.8808001, 2.4185333, 2.2478335, 0.0565211, 2.031299])
+            .to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(y, [-0.93430865, 0.08624211]);
 
@@ -214,13 +216,16 @@ mod tests {
         let model = Linear {
             weight: dev.tensor(W),
             bias: dev.tensor(B),
-        };
-
-        let x = dev.tensor([
-            [-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643],
-            [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705],
-            [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455],
-        ]);
+        }
+        .to_dtype::<TestDtype>();
+
+        let x = dev
+            .tensor([
+                [-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643],
+                [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705],
+                [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455],
+            ])
+            .to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(
             y,
@@ -249,13 +254,14 @@ mod tests {
         let model = Linear {
             weight: dev.tensor(W),
             bias: dev.tensor(B),
-        };
+        }
+        .to_dtype::<TestDtype>();
 
         #[rustfmt::skip]
         let x = dev.tensor([
             [[-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643], [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705], [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455]],
             [[1.2879219, 0.70150787, -1.6746868, 1.7261779, -0.94021803], [-2.6883178, 2.9369607, 2.9256766, 0.27559614, -0.17530347], [0.17499207, -0.11440835, 0.16627812, -0.91773695, 1.1128315]],
-        ]);
+        ]).to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(
             y,
diff --git a/src/nn/repeated.rs b/src/nn/repeated.rs
index dc8a0acb5..63cec4529 100644
--- a/src/nn/repeated.rs
+++ b/src/nn/repeated.rs
@@ -97,8 +97,11 @@ mod tests {
         let m = dev.build_module::<Model, TestDtype>();
 
         for i in 0..5 {
-            assert_ne!(m.modules[i].0.weight.array(), [[0.0; 3]; 3]);
-            assert_ne!(m.modules[i].0.bias.array(), [0.0; 3]);
+            assert_ne!(
+                m.modules[i].0.weight.array(),
+                [[TestDtype::default(); 3]; 3]
+            );
+            assert_ne!(m.modules[i].0.bias.array(), [TestDtype::default(); 3]);
         }
     }
 
diff --git a/src/nn/residual.rs b/src/nn/residual.rs
index 40c6f3ac2..fbf7c6c78 100644
--- a/src/nn/residual.rs
+++ b/src/nn/residual.rs
@@ -71,17 +71,20 @@ mod tests {
     fn test_residual_reset() {
         let dev: TestDevice = Default::default();
         let model = dev.build_module::<Residual<Linear<2, 5>>, TestDtype>();
-        assert_ne!(model.0.weight.array(), [[0.0; 2]; 5]);
-        assert_ne!(model.0.bias.array(), [0.0; 5]);
+        assert_ne!(model.0.weight.array(), [[TestDtype::default(); 2]; 5]);
+        assert_ne!(model.0.bias.array(), [TestDtype::default(); 5]);
     }
 
     #[test]
     fn test_residual_gradients() {
         let dev: TestDevice = Default::default();
 
-        let model = <Residual<Linear<2, 5>>>::build_on_device(&dev);
+        let model = dev
+            .build_module::<Residual<Linear<2, 5>>, f32>()
+            .to_dtype::<TestDtype>();
 
-        let x: Tensor<Rank2<4, 2>, f32, TestDevice> = dev.sample_normal();
+        let x: Tensor<Rank2<4, 2>, f32, _> = dev.sample_normal();
+        let x = x.to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
 
         #[rustfmt::skip]
diff --git a/src/nn/split_into.rs b/src/nn/split_into.rs
index fa8343f48..b3d7a2370 100644
--- a/src/nn/split_into.rs
+++ b/src/nn/split_into.rs
@@ -109,8 +109,8 @@ mod tests {
         let gr = right.mean().backward();
         let l = left.retaped::<NoneTape>();
         let gl = left.mean().backward();
-        assert_ne!(gl.get(&l).array(), [0.0; 1]);
-        assert_ne!(gr.get(&r).array(), [0.0; 1]);
+        assert_ne!(gl.get(&l).array(), [TestDtype::zero(); 1]);
+        assert_ne!(gr.get(&r).array(), [TestDtype::zero(); 1]);
     }
 
     #[test]
diff --git a/src/nn/unbiased_linear.rs b/src/nn/unbiased_linear.rs
index 239ea5799..d94acf047 100644
--- a/src/nn/unbiased_linear.rs
+++ b/src/nn/unbiased_linear.rs
@@ -96,7 +96,7 @@ mod tests {
     use super::*;
     use crate::tests::*;
 
-    const W: [[TestDtype; 5]; 2] = [
+    const W: [[f64; 5]; 2] = [
         [-0.3458893, -0.30371523, -0.3712057, 0.14303583, -0.0268966],
         [0.11733949, 0.14059687, -0.10670426, -0.09373143, 0.18974298],
     ];
@@ -116,10 +116,9 @@ mod tests {
     fn test_unbiased_linear_initialize() {
         let dev: TestDevice = Default::default();
         let m = dev.build_module::<UnbiasedLinear<2000, 1>, TestDtype>();
-        let bound: TestDtype = 1.0 / 2000.0;
-        let bound = bound.sqrt();
+        let bound: TestDtype = NumCast::from((1.0 / 2000.0).sqrt()).unwrap();
         for v in m.weight.as_vec() {
-            assert!(-bound <= v && v <= bound && v != 0.0);
+            assert!(-bound <= v && v <= bound && v != TestDtype::zero());
         }
     }
 
@@ -129,9 +128,12 @@ mod tests {
 
         let model = UnbiasedLinear {
             weight: dev.tensor(W),
-        };
+        }
+        .to_dtype::<TestDtype>();
 
-        let x = dev.tensor([-0.8808001, 2.4185333, 2.2478335, 0.0565211, 2.031299]);
+        let x = dev
+            .tensor([-0.8808001, 2.4185333, 2.2478335, 0.0565211, 2.031299])
+            .to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(y, [-1.3108451, 0.37695912]);
 
@@ -151,13 +153,16 @@ mod tests {
 
         let model = UnbiasedLinear {
             weight: dev.tensor(W),
-        };
-
-        let x = dev.tensor([
-            [-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643],
-            [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705],
-            [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455],
-        ]);
+        }
+        .to_dtype::<TestDtype>();
+
+        let x = dev
+            .tensor([
+                [-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643],
+                [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705],
+                [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455],
+            ])
+            .to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(
             y,
@@ -190,13 +195,14 @@ mod tests {
 
         let model = UnbiasedLinear {
             weight: dev.tensor(W),
-        };
+        }
+        .to_dtype::<TestDtype>();
 
         #[rustfmt::skip]
         let x = dev.tensor([
             [[-1.9468665, 1.4611785, -1.6698982, 1.408863, 1.3425643], [-1.3399831, 3.0510678, -0.17936817, -0.04943254, -0.8052705], [-0.8291412, 0.07691376, -0.26538327, 0.90017676, -1.8790455]],
             [[1.2879219, 0.70150787, -1.6746868, 1.7261779, -0.94021803], [-2.6883178, 2.9369607, 2.9256766, 0.27559614, -0.17530347], [0.17499207, -0.11440835, 0.16627812, -0.91773695, 1.1128315]],
-        ]);
+        ]).to_dtype::<TestDtype>();
         let y = model.forward(x.leaky_trace());
         assert_close_to_literal!(
             y,
diff --git a/src/nn/zero_grads.rs b/src/nn/zero_grads.rs
index f278b5aa1..891361562 100644
--- a/src/nn/zero_grads.rs
+++ b/src/nn/zero_grads.rs
@@ -98,10 +98,13 @@ mod tests {
         grads.get_or_alloc_mut(&tmp2).unwrap();
 
         model.zero_grads(&mut grads);
-        assert_eq!(grads.get(&model.0.weight).array(), [[0.0; 2]; 5]);
-        assert_eq!(grads.get(&model.0.bias).array(), [0.0; 5]);
-        assert_eq!(grads.get(&model.1.scale).array(), [0.0; 3]);
-        assert_eq!(grads.get(&model.1.bias).array(), [0.0; 3]);
+        assert_eq!(
+            grads.get(&model.0.weight).array(),
+            [[TestDtype::zero(); 2]; 5]
+        );
+        assert_eq!(grads.get(&model.0.bias).array(), [TestDtype::zero(); 5]);
+        assert_eq!(grads.get(&model.1.scale).array(), [TestDtype::zero(); 3]);
+        assert_eq!(grads.get(&model.1.bias).array(), [TestDtype::zero(); 3]);
         assert!(grads.get_ref_checked(&model.1.running_mean).is_none());
         assert!(grads.get_ref_checked(&model.1.running_var).is_none());
         assert!(grads.get_ref_checked(&tmp1).is_none());
diff --git a/src/optim/adam/adam.cu b/src/optim/adam/adam.cu
index 1a908d7a5..3b1dcf9e0 100644
--- a/src/optim/adam/adam.cu
+++ b/src/optim/adam/adam.cu
@@ -6,21 +6,20 @@ enum WeightDecayType {
     Decoupled
 };
 
-template<typename T>
 struct AdamConfig {
-    T lr;
-    T beta1;
-    T beta2;
-    T eps;
+    double lr;
+    double beta1;
+    double beta2;
+    double eps;
     WeightDecayType weight_decay_type;
-    T weight_decay;
+    double weight_decay;
 };
 
 template<typename T>
 __device__ void adam_update(
-    const AdamConfig<T> cfg,
+    const AdamConfig cfg,
     const size_t numel,
-    const T t,
+    const int t_int,
     T* param,
     T* moment1,
     T* moment2,
@@ -32,23 +31,31 @@ __device__ void adam_update(
         return;
     }
 
+    T beta1 = cfg.beta1;
+    T beta2 = cfg.beta2;
+    T lr = cfg.lr;
+    T weight_decay = cfg.weight_decay;
+    T eps = cfg.eps;
+
     T p = param[i];
     T g = grad[i];
     T m = moment1[i];
     T v = moment2[i];
+    T one = 1.0;
+    T t = t_int;
 
     if (cfg.weight_decay_type == L2) {
-        g += cfg.weight_decay * p;
+        g += weight_decay * p;
     }
 
-    m = m * cfg.beta1 + g * (1.0 - cfg.beta1);
-    v = v * cfg.beta2 + g * g * (1.0 - cfg.beta2);
-    T m_hat = m * 1.0 / (1.0 - powg(cfg.beta1, t));
-    T v_hat = v * 1.0 / (1.0 - powg(cfg.beta2, t));
-    g = cfg.lr * m_hat / (sqrtg(v_hat) + cfg.eps);
+    m = m * beta1 + g * (one - beta1);
+    v = v * beta2 + g * g * (one - beta2);
+    T m_hat = m * one / (one - powg(beta1, t));
+    T v_hat = v * one / (one - powg(beta2, t));
+    g = lr * m_hat / (sqrtg(v_hat) + eps);
 
     if (cfg.weight_decay_type == Decoupled) {
-        g += cfg.weight_decay * cfg.lr * p;
+        g += (weight_decay * lr) * p;
     }
 
     moment1[i] = m;
@@ -58,9 +65,9 @@ __device__ void adam_update(
 
 #define ADAM(TYPENAME, FN) \
 extern "C" __global__ void FN( \
-    const AdamConfig<TYPENAME> cfg, \
+    const AdamConfig cfg, \
     const size_t numel, \
-    const TYPENAME t, \
+    const int t, \
     TYPENAME* param, \
     TYPENAME* moment1, \
     TYPENAME* moment2, \
@@ -69,5 +76,6 @@ extern "C" __global__ void FN( \
     adam_update(cfg, numel, t, param, moment1, moment2, grad); \
 }
 
+ADAM(__half, adam_update_f16);
 ADAM(float, adam_update_f32);
 ADAM(double, adam_update_f64);
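One subtlety in the kernel above: the `AdamConfig` fields stay `double` on the host side, and each thread narrows them once into `T`-typed locals before the update math. A minimal illustration of that conversion step for `T = __half` (illustrative only, not part of the PR):

```cuda
#include "cuda_fp16.h"

template <typename T>
__device__ void narrow_once(double lr_f64, int t_int) {
    // With T = __half these assignments go through __half's converting
    // constructors from double/int, so the narrowing happens once per
    // thread instead of inside every arithmetic expression.
    T lr = lr_f64;
    T t = t_int; // bias-correction exponent, as in `T t = t_int;` above
    (void)lr;
    (void)t;
}
```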
diff --git a/src/optim/adam/cpu_kernel.rs b/src/optim/adam/cpu_kernel.rs
index 32b59bd98..42632b479 100644
--- a/src/optim/adam/cpu_kernel.rs
+++ b/src/optim/adam/cpu_kernel.rs
@@ -5,29 +5,33 @@ impl<E: num_traits::Float + Unit> AdamKernel<E> for Cpu {
     fn update(
         &self,
         t: i32,
-        cfg: &AdamConfig<E>,
+        cfg: &AdamConfig,
         param: &mut Self::Vec<E>,
         moment1: &mut Self::Vec<E>,
         moment2: &mut Self::Vec<E>,
         grad: &Self::Vec<E>,
     ) -> Result<(), Self::Err> {
+        let betas = cfg.betas.map(E::from_f64).map(Option::unwrap);
+        let eps = E::from_f64(cfg.eps).unwrap();
+        let lr = E::from_f64(cfg.lr).unwrap();
+
         for ((p, mut g), (m, v)) in param
             .iter_mut()
             .zip(grad.iter().cloned())
             .zip(moment1.iter_mut().zip(moment2.iter_mut()))
         {
             if let Some(WeightDecay::L2(wd)) = cfg.weight_decay {
-                g += wd * *p;
+                g += E::from_f64(wd).unwrap() * *p;
             }
 
-            *m = *m * cfg.betas[0] + g * (E::one() - cfg.betas[0]);
-            *v = *v * cfg.betas[1] + g.powi(2) * (E::one() - cfg.betas[1]);
-            let m_hat = *m * (E::one() - cfg.betas[0].powi(t)).recip();
-            let v_hat = *v * (E::one() - cfg.betas[1].powi(t)).recip();
-            g = cfg.lr * m_hat / (v_hat.sqrt() + cfg.eps);
+            *m = *m * betas[0] + g * (E::one() - betas[0]);
+            *v = *v * betas[1] + g.powi(2) * (E::one() - betas[1]);
+            let m_hat = *m * (E::one() - betas[0].powi(t)).recip();
+            let v_hat = *v * (E::one() - betas[1].powi(t)).recip();
+            g = lr * m_hat / (v_hat.sqrt() + eps);
 
             if let Some(WeightDecay::Decoupled(wd)) = cfg.weight_decay {
-                g += wd * cfg.lr * *p;
+                g += E::from_f64(wd * cfg.lr).unwrap() * *p;
             }
 
             *p -= g;
diff --git a/src/optim/adam/cuda_kernel.rs b/src/optim/adam/cuda_kernel.rs
index ef1c9b132..ac06c2e13 100644
--- a/src/optim/adam/cuda_kernel.rs
+++ b/src/optim/adam/cuda_kernel.rs
@@ -7,18 +7,18 @@ use crate::{
 use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync};
 
 #[repr(C)]
-struct CudaAdamConfig<E> {
-    lr: E,
-    beta1: E,
-    beta2: E,
-    eps: E,
+struct CudaAdamConfig {
+    lr: f64,
+    beta1: f64,
+    beta2: f64,
+    eps: f64,
     weight_decay_type: WeightDecayType,
-    weight_decay: E,
+    weight_decay: f64,
 }
 
-unsafe impl<E: DeviceRepr> DeviceRepr for CudaAdamConfig<E> {}
+unsafe impl DeviceRepr for CudaAdamConfig {}
 
-fn adam_config_to_cuda<E: Copy>(config: &super::AdamConfig<E>) -> CudaAdamConfig<E> {
+fn adam_config_to_cuda(config: &super::AdamConfig) -> CudaAdamConfig {
     let (weight_decay_type, weight_decay) = weight_decay_to_cuda(config.weight_decay);
 
     CudaAdamConfig {
@@ -38,6 +38,12 @@ trait HasCudaKernel<E> {
     const FWD: &'static str;
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const MOD: &'static str = "adam_f16";
+    const FWD: &'static str = "adam_update_f16";
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const MOD: &'static str = "adam_f32";
     const FWD: &'static str = "adam_update_f32";
@@ -55,7 +61,7 @@ where
     fn update(
         &self,
         t: i32,
-        cfg: &super::AdamConfig<E>,
+        cfg: &super::AdamConfig,
         param: &mut Self::Vec<E>,
         moment1: &mut Self::Vec<E>,
         moment2: &mut Self::Vec<E>,
@@ -69,7 +75,6 @@ where
         let numel = param.len();
         let func = self.dev.get_func(Self::MOD, Self::FWD).unwrap();
         let cfg = launch_cfg::<128>(numel as u32);
-        let t = <E as num_traits::FromPrimitive>::from_i32(t).unwrap();
         let params = (opt_cfg, numel, t, param, moment1, moment2, grad);
         unsafe { func.launch(cfg, params) }?;
         Ok(())
diff --git a/src/optim/adam/mod.rs b/src/optim/adam/mod.rs
index b8f80f3a7..2f0ed5d32 100644
--- a/src/optim/adam/mod.rs
+++ b/src/optim/adam/mod.rs
@@ -27,26 +27,26 @@ use super::{Optimizer, OptimizerUpdateError, UnusedTensors, WeightDecay};
 /// };
 /// ```
 #[derive(Debug, Clone, Copy)]
-pub struct AdamConfig<E> {
+pub struct AdamConfig {
     /// Learning rate. Defaults to `1e-3`.
-    pub lr: E,
+    pub lr: f64,
 
     /// Betas from Adam paper. Defaults to `[0.9, 0.999]`.
-    pub betas: [E; 2],
+    pub betas: [f64; 2],
 
     /// Epsilon for numerical stability. Defaults to `1e-8`.
-    pub eps: E,
+    pub eps: f64,
 
     /// Optional weight decay. Defaults to `None`.
-    pub weight_decay: Option<WeightDecay<E>>,
+    pub weight_decay: Option<WeightDecay>,
 }
 
-impl<E: Dtype> Default for AdamConfig<E> {
+impl Default for AdamConfig {
     fn default() -> Self {
         Self {
-            lr: E::from_f32(1e-3).unwrap(),
-            betas: [E::from_f32(0.9).unwrap(), E::from_f32(0.999).unwrap()],
-            eps: E::from_f32(1e-8).unwrap(),
+            lr: 1e-3,
+            betas: [0.9, 0.999],
+            eps: 1e-8,
             weight_decay: None,
         }
     }
@@ -73,7 +73,7 @@ impl Default for AdamConfig {
 #[derive(Debug)]
 pub struct Adam<M, E: Dtype, D: DeviceStorage> {
     /// Hyperparameter configuration
-    pub cfg: AdamConfig<E>,
+    pub cfg: AdamConfig,
 
     t: i32,
     moment1: Gradients<E, D>,
@@ -84,7 +84,7 @@ pub struct Adam<M, E: Dtype, D: DeviceStorage> {
 
 impl<M, E: Dtype, D: DeviceStorage> Adam<M, E, D> {
     /// Constructs using hyperparameters from `cfg`.
-    pub fn new(_model: &M, cfg: AdamConfig<E>) -> Self {
+    pub fn new(_model: &M, cfg: AdamConfig) -> Self {
         Self {
             cfg,
             t: 0,
@@ -99,7 +99,7 @@ pub trait AdamKernel<E: Dtype>: DeviceStorage {
     fn update(
         &self,
         t: i32,
-        cfg: &AdamConfig<E>,
+        cfg: &AdamConfig,
         param: &mut Self::Vec<E>,
         moment1: &mut Self::Vec<E>,
         moment2: &mut Self::Vec<E>,
@@ -173,7 +173,9 @@ mod tests {
         let dev: TestDevice = Default::default();
         let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.ones();
         let mut opt = Adam::new(&t, Default::default());
-        let rate = dev.tensor([1e-6, 1e-5, 1e-4, 1e-3, 1e-2]);
+        let rate = dev
+            .tensor([1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.99999994, 0.999996, 0.9997143, 0.9990244, 0.99900025],
             [0.9999999, 0.999992, 0.99942863, 0.99804884, 0.9980005],
@@ -207,7 +209,9 @@ mod tests {
                 weight_decay: None,
             },
         );
-        let rate = dev.tensor([1e-4, 1e-3, 1e-2, 1e-1, 1e-0]);
+        let rate = dev
+            .tensor([1e-4, 1e-3, 1e-2, 1e-1, 1e-0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9997143, 0.9990244, 0.99900025, 0.999, 0.999],
             [0.99942863, 0.99804866, 0.9980004, 0.9979999, 0.9979999],
@@ -231,7 +235,9 @@ mod tests {
     #[test]
     fn test_adam_l2_decay() {
         let dev: TestDevice = Default::default();
-        let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.tensor([-0.5, -0.25, 0.1, 0.6, 1.0]);
+        let mut t = dev
+            .tensor([-0.5, -0.25, 0.1, 0.6, 1.0])
+            .to_dtype::<TestDtype>();
         let mut opt = Adam::new(
             &t,
             AdamConfig {
@@ -264,7 +270,9 @@ mod tests {
     #[test]
     fn test_adam_decoupled_decay() {
         let dev: TestDevice = Default::default();
-        let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.tensor([-0.5, -0.25, 0.1, 0.6, 1.0]);
+        let mut t = dev
+            .tensor([-0.5, -0.25, 0.1, 0.6, 1.0])
+            .to_dtype::<TestDtype>();
         let mut opt = Adam::new(
             &t,
             AdamConfig {
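For reference, the de-generified config is constructed just like the doc example referenced at the top of this file, except the hyperparameters are now plain `f64` regardless of the model dtype. A usage sketch (model construction elided):

```rust
let mut opt = Adam::new(
    &model,
    AdamConfig {
        lr: 1e-2,
        betas: [0.5, 0.25],
        eps: 1e-6,
        weight_decay: Some(WeightDecay::Decoupled(1e-2)),
    },
);
```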
diff --git a/src/optim/optimizer.rs b/src/optim/optimizer.rs
index 5e41a7336..e37b630b1 100644
--- a/src/optim/optimizer.rs
+++ b/src/optim/optimizer.rs
@@ -4,14 +4,14 @@ use crate::{
 };
 
 /// L2 and decoupled regularization methods
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum WeightDecay<E> {
+#[derive(Debug, Clone, Copy)]
+pub enum WeightDecay {
     /// Weight decay applied to the gradients before any momentum updates. Equivalent to L2 regularization.
-    L2(E),
+    L2(f64),
 
     /// Weight decay applied after any momentum updates, without modifying the gradients.
    /// See [Decoupled Weight Decay Regularization](https://arxiv.org/abs/1711.05101)
-    Decoupled(E),
+    Decoupled(f64),
 }
 
 /// Used to communicate the "WeightDecay" enum to cuda kernels
@@ -25,7 +25,7 @@ pub(super) enum WeightDecayType {
 }
 
 #[cfg(feature = "cuda")]
-pub(super) fn weight_decay_to_cuda<E: Default>(wd: Option<WeightDecay<E>>) -> (WeightDecayType, E) {
+pub(super) fn weight_decay_to_cuda(wd: Option<WeightDecay>) -> (WeightDecayType, f64) {
     match wd {
         None => (WeightDecayType::None, Default::default()),
         Some(WeightDecay::L2(x)) => (WeightDecayType::L2, x),
@@ -34,13 +34,13 @@ pub(super) fn weight_decay_to_cuda(wd: Option<WeightDecay>) -> (WeightDecayType
 }
 
 /// Momentum used for [super::Sgd] and others
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum Momentum<E> {
+#[derive(Debug, Clone, Copy)]
+pub enum Momentum {
     /// Momentum that is applied to the velocity of a parameter directly.
-    Classic(E),
+    Classic(f64),
 
     /// Momentum that is applied to both velocity and gradients. See [super::Sgd] nesterov paper for more.
-    Nesterov(E),
+    Nesterov(f64),
 }
 
 /// Used to communicate the "Momentum" enum to cuda kernels
@@ -54,7 +54,7 @@ pub(super) enum MomentumType {
 }
 
 #[cfg(feature = "cuda")]
-pub(super) fn momentum_to_cuda<E: Default>(wd: Option<Momentum<E>>) -> (MomentumType, E) {
+pub(super) fn momentum_to_cuda(wd: Option<Momentum>) -> (MomentumType, f64) {
     match wd {
         None => (MomentumType::None, Default::default()),
         Some(Momentum::Classic(x)) => (MomentumType::Classic, x),
diff --git a/src/optim/rmsprop/cpu_kernel.rs b/src/optim/rmsprop/cpu_kernel.rs
index 82bff2ded..4ebf61c21 100644
--- a/src/optim/rmsprop/cpu_kernel.rs
+++ b/src/optim/rmsprop/cpu_kernel.rs
@@ -5,47 +5,50 @@ use super::{RMSpropConfig, RMSpropKernel};
 impl<E: num_traits::Float + Unit> RMSpropKernel<E> for Cpu {
     fn update(
         &self,
-        cfg: &RMSpropConfig<E>,
+        cfg: &RMSpropConfig,
         param: &mut Self::Vec<E>,
         momentum: &mut Self::Vec<E>,
         square_avg: &mut Self::Vec<E>,
         grad_avg: &mut Self::Vec<E>,
         grad: &Self::Vec<E>,
     ) -> Result<(), Self::Err> {
+        let alpha = E::from_f64(cfg.alpha).unwrap();
+        let eps = E::from_f64(cfg.eps).unwrap();
+        let lr = E::from_f64(cfg.lr).unwrap();
         for ((p, mut g), (s_avg, (g_avg, m))) in param.iter_mut().zip(grad.iter().cloned()).zip(
             square_avg
                 .iter_mut()
                 .zip(grad_avg.iter_mut().zip(momentum.iter_mut())),
         ) {
             if let Some(WeightDecay::L2(wd)) = cfg.weight_decay {
-                g += wd * *p;
+                g += E::from_f64(wd).unwrap() * *p;
             }
 
             // sa = a * sa + (1 - a) * g^2
-            *s_avg += (E::one() - cfg.alpha) * (g * g - *s_avg);
+            *s_avg += (E::one() - alpha) * (g * g - *s_avg);
 
             let avg = if cfg.centered {
                 // ga = a * ga + (1 - a) * g
-                *g_avg += (E::one() - cfg.alpha) * (g - *g_avg);
-                // NOTE: cfg.eps in sqrt
-                (*s_avg - g_avg.powi(2) + cfg.eps).sqrt()
+                *g_avg += (E::one() - alpha) * (g - *g_avg);
+                // NOTE: eps in sqrt
+                (*s_avg - g_avg.powi(2) + eps).sqrt()
             } else {
-                // NOTE: cfg.eps in sqrt
-                (*s_avg + cfg.eps).sqrt()
+                // NOTE: eps in sqrt
+                (*s_avg + eps).sqrt()
             };
 
             g /= avg;
 
             match cfg.momentum {
                 Some(u) => {
-                    *m = *m * u + g;
-                    g = *m * cfg.lr;
+                    *m = *m * E::from_f64(u).unwrap() + g;
+                    g = *m * lr;
                 }
-                None => g *= cfg.lr,
+                None => g *= lr,
             }
 
             if let Some(WeightDecay::Decoupled(wd)) = cfg.weight_decay {
-                g += wd * cfg.lr * *p;
+                g += E::from_f64(wd * cfg.lr).unwrap() * *p;
             }
 
             *p -= g;
diff --git a/src/optim/rmsprop/cuda_kernel.rs b/src/optim/rmsprop/cuda_kernel.rs
index a87430d24..c5a577dae 100644
--- a/src/optim/rmsprop/cuda_kernel.rs
+++ b/src/optim/rmsprop/cuda_kernel.rs
@@ -8,20 +8,20 @@ use crate::{
 use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync};
 
 #[repr(C)]
-struct CudaRMSpropConfig<E> {
-    lr: E,
-    alpha: E,
-    eps: E,
+struct CudaRMSpropConfig {
+    lr: f64,
+    alpha: f64,
+    eps: f64,
     centered: bool,
     has_momentum: bool,
-    momentum: E,
+    momentum: f64,
     weight_decay_type: WeightDecayType,
-    weight_decay: E,
+    weight_decay: f64,
 }
 
-unsafe impl<E: DeviceRepr> DeviceRepr for CudaRMSpropConfig<E> {}
+unsafe impl DeviceRepr for CudaRMSpropConfig {}
 
-fn rmsprop_config_to_cuda<E: Default + Copy>(config: &RMSpropConfig<E>) -> CudaRMSpropConfig<E> {
+fn rmsprop_config_to_cuda(config: &RMSpropConfig) -> CudaRMSpropConfig {
     let (weight_decay_type, weight_decay) = weight_decay_to_cuda(config.weight_decay);
     let (has_momentum, momentum) = if let Some(m) = config.momentum {
         (true, m)
@@ -48,6 +48,12 @@ trait HasCudaKernel<E> {
     const FWD: &'static str;
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const MOD: &'static str = "rmsprop_f16";
+    const FWD: &'static str = "rmsprop_update_f16";
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const MOD: &'static str = "rmsprop_f32";
     const FWD: &'static str = "rmsprop_update_f32";
@@ -64,7 +70,7 @@ where
 {
     fn update(
         &self,
-        cfg: &RMSpropConfig<E>,
+        cfg: &RMSpropConfig,
         param: &mut Self::Vec<E>,
         momentum: &mut Self::Vec<E>,
         square_avg: &mut Self::Vec<E>,
diff --git a/src/optim/rmsprop/mod.rs b/src/optim/rmsprop/mod.rs
index df435c7d9..3c170cd3f 100644
--- a/src/optim/rmsprop/mod.rs
+++ b/src/optim/rmsprop/mod.rs
@@ -16,33 +16,33 @@ use super::{Optimizer, OptimizerUpdateError, UnusedTensors, WeightDecay};
 
 /// Configuration of hyperparameters for [RMSprop].
 #[derive(Debug, Clone, Copy)]
-pub struct RMSpropConfig<E> {
+pub struct RMSpropConfig {
     /// Learning rate. Defaults to `1e-2`.
-    pub lr: E,
+    pub lr: f64,
 
     /// Value for exponential moving average. Defaults to `0.9`.
-    pub alpha: E,
+    pub alpha: f64,
 
     /// Epsilon for stability. Defaults to `1e-8`.
-    pub eps: E,
+    pub eps: f64,
 
     /// Optional momentum. Defaults to `None`.
-    pub momentum: Option<E>,
+    pub momentum: Option<f64>,
 
     /// Whether the avg should be centered by the grad's avg value.
     /// Defaults to `false`.
     pub centered: bool,
 
     /// Optional weight decay. Defaults to `None`.
-    pub weight_decay: Option<WeightDecay<E>>,
+    pub weight_decay: Option<WeightDecay>,
 }
 
-impl<E: Dtype> Default for RMSpropConfig<E> {
+impl Default for RMSpropConfig {
     fn default() -> Self {
         Self {
-            lr: E::from_f32(1e-2).unwrap(),
-            alpha: E::from_f32(0.9).unwrap(),
-            eps: E::from_f32(1e-8).unwrap(),
+            lr: 1e-2,
+            alpha: 0.9,
+            eps: 1e-8,
             momentum: None,
             centered: false,
             weight_decay: None,
@@ -80,7 +80,7 @@ impl Default for RMSpropConfig {
 #[derive(Debug)]
 pub struct RMSprop<M, E: Dtype, D: DeviceStorage> {
     /// Hyperparameter configuration
-    pub cfg: RMSpropConfig<E>,
+    pub cfg: RMSpropConfig,
 
     step: usize,
     momentums: Gradients<E, D>,
@@ -92,7 +92,7 @@ pub struct RMSprop<M, E: Dtype, D: DeviceStorage> {
 
 impl<M, E: Dtype, D: DeviceStorage> RMSprop<M, E, D> {
     /// Constructs using hyperparameters from `cfg`.
-    pub fn new(_model: &M, cfg: RMSpropConfig<E>) -> Self {
+    pub fn new(_model: &M, cfg: RMSpropConfig) -> Self {
         Self {
             cfg,
             step: 0,
@@ -107,7 +107,7 @@ impl<M, E: Dtype, D: DeviceStorage> RMSprop<M, E, D> {
 pub trait RMSpropKernel<E: Dtype>: DeviceStorage {
     fn update(
         &self,
-        cfg: &RMSpropConfig<E>,
+        cfg: &RMSpropConfig,
         param: &mut Self::Vec<E>,
         momentum: &mut Self::Vec<E>,
         square_avg: &mut Self::Vec<E>,
@@ -186,9 +186,11 @@ mod tests {
     use super::*;
     use crate::{shapes::*, tensor_ops::*, tests::*};
 
-    fn test_matches_expected(cfg: RMSpropConfig<TestDtype>, expected: [[f64; 5]; 5]) {
+    fn test_matches_expected(cfg: RMSpropConfig, expected: [[f64; 5]; 5]) {
         let dev: TestDevice = Default::default();
-        let rate: Tensor<_, TestDtype, _> = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.ones();
         let mut opt = RMSprop::new(&t, cfg);
         for e in expected.iter() {
diff --git a/src/optim/rmsprop/rmsprop.cu b/src/optim/rmsprop/rmsprop.cu
index 836121874..0beb5b4bb 100644
--- a/src/optim/rmsprop/rmsprop.cu
+++ b/src/optim/rmsprop/rmsprop.cu
@@ -6,21 +6,20 @@ enum WeightDecayType {
     Decoupled
 };
 
-template<typename T>
 struct RMSpropConfig {
-    T lr;
-    T alpha;
-    T eps;
+    double lr;
+    double alpha;
+    double eps;
     bool centered;
     bool has_momentum;
-    T momentum;
+    double momentum;
     WeightDecayType weight_decay_type;
-    T weight_decay;
+    double weight_decay;
 };
 
 template<typename T>
 __device__ void rmsprop_update(
-    const RMSpropConfig<T> cfg,
+    const RMSpropConfig cfg,
     const size_t numel,
     T* param,
     T* momentum,
@@ -34,39 +33,46 @@ __device__ void rmsprop_update(
         return;
     }
 
+    T lr = cfg.lr;
+    T alpha = cfg.alpha;
+    T eps = cfg.eps;
+    T momentum_ = cfg.momentum;
+    T weight_decay = cfg.weight_decay;
+
     T p = param[i];
     T g = grad[i];
     T s_avg = square_avg[i];
    T g_avg = grad_avg[i];
     T m = momentum[i];
+    T one = 1.0;
 
     if (cfg.weight_decay_type == L2) {
-        g += cfg.weight_decay * p;
+        g += weight_decay * p;
     }
 
-    s_avg += (1.0 - cfg.alpha) * (g * g - s_avg);
+    s_avg += (one - alpha) * (g * g - s_avg);
 
     T avg;
 
     if (cfg.centered) {
         // ga = a * ga + (1 - a) * g
-        g_avg += (1.0 - cfg.alpha) * (g - g_avg);
-        avg = sqrtg(s_avg - g_avg * g_avg + cfg.eps);
+        g_avg += (one - alpha) * (g - g_avg);
+        avg = sqrtg(s_avg - g_avg * g_avg + eps);
     } else {
-        avg = sqrtg(s_avg + cfg.eps);
+        avg = sqrtg(s_avg + eps);
     };
 
     g /= avg;
 
     if (cfg.has_momentum) {
-        m = m * cfg.momentum + g;
-        g = m * cfg.lr;
+        m = m * momentum_ + g;
+        g = m * lr;
     } else {
-        g *= cfg.lr;
+        g *= lr;
     }
 
     if (cfg.weight_decay_type == Decoupled) {
-        g += cfg.weight_decay * cfg.lr * p;
+        g += weight_decay * lr * p;
     }
 
     square_avg[i] = s_avg;
@@ -77,7 +83,7 @@ __device__ void rmsprop_update(
 
 #define RMSPROP(TYPENAME, FN) \
 extern "C" __global__ void FN( \
-    const RMSpropConfig<TYPENAME> cfg, \
+    const RMSpropConfig cfg, \
     const size_t numel, \
     TYPENAME* param, \
     TYPENAME* momentum, \
@@ -88,5 +94,6 @@ extern "C" __global__ void FN( \
     rmsprop_update(cfg, numel, param, momentum, square_avg, grad_avg, grad); \
 }
 
+RMSPROP(__half, rmsprop_update_f16);
 RMSPROP(float, rmsprop_update_f32);
 RMSPROP(double, rmsprop_update_f64);
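Same shape as the Adam change: `RMSpropConfig` is plain `f64` data now. A hedged construction sketch (field values arbitrary, model construction elided):

```rust
let mut opt = RMSprop::new(
    &model,
    RMSpropConfig {
        lr: 1e-2,
        alpha: 0.9,
        eps: 1e-8,
        momentum: Some(0.9),
        centered: false,
        weight_decay: Some(WeightDecay::L2(1e-4)),
    },
);
```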
diff --git a/src/optim/sgd/cpu_kernel.rs b/src/optim/sgd/cpu_kernel.rs
index cb1584f0c..a633211a1 100644
--- a/src/optim/sgd/cpu_kernel.rs
+++ b/src/optim/sgd/cpu_kernel.rs
@@ -9,34 +9,39 @@ use super::{SgdConfig, SgdKernel};
 impl<E: num_traits::Float + Unit> SgdKernel<E> for Cpu {
     fn update(
         &self,
-        cfg: &SgdConfig<E>,
+        cfg: &SgdConfig,
         param: &mut Self::Vec<E>,
         velocity: &mut Self::Vec<E>,
         grad: &Self::Vec<E>,
     ) -> Result<(), Self::Err> {
+        let lr = E::from_f64(cfg.lr).unwrap();
+
         for ((p, mut g), v) in param
             .iter_mut()
             .zip(grad.iter().cloned())
             .zip(velocity.iter_mut())
         {
             if let Some(WeightDecay::L2(wd)) = cfg.weight_decay {
+                let wd = E::from_f64(wd).unwrap();
                 g += wd * *p;
             }
 
             match cfg.momentum {
                 Some(Momentum::Classic(u)) => {
+                    let u = E::from_f64(u).unwrap();
                     *v = g + u * *v;
-                    g = *v * cfg.lr;
+                    g = *v * lr;
                 }
                 Some(Momentum::Nesterov(u)) => {
+                    let u = E::from_f64(u).unwrap();
                     *v = g + u * *v;
-                    g = (g + u * *v) * cfg.lr;
+                    g = (g + u * *v) * lr;
                 }
-                None => g *= cfg.lr,
+                None => g *= lr,
             }
 
             if let Some(WeightDecay::Decoupled(wd)) = cfg.weight_decay {
-                g += wd * cfg.lr * *p;
+                g += E::from_f64(wd * cfg.lr).unwrap() * *p;
             }
 
             *p -= g;
diff --git a/src/optim/sgd/cuda_kernel.rs b/src/optim/sgd/cuda_kernel.rs
index 0e3d37097..0c0b3e12a 100644
--- a/src/optim/sgd/cuda_kernel.rs
+++ b/src/optim/sgd/cuda_kernel.rs
@@ -8,17 +8,17 @@ use crate::{
 use cudarc::driver::{DeviceRepr, DeviceSlice, LaunchAsync};
 
 #[repr(C)]
-struct CudaSgdConfig<E> {
-    lr: E,
+struct CudaSgdConfig {
+    lr: f64,
     momentum_type: MomentumType,
-    momentum: E,
+    momentum: f64,
     weight_decay_type: WeightDecayType,
-    weight_decay: E,
+    weight_decay: f64,
 }
 
-unsafe impl<E: DeviceRepr> DeviceRepr for CudaSgdConfig<E> {}
+unsafe impl DeviceRepr for CudaSgdConfig {}
 
-fn sgd_config_to_cuda<E: Default + Copy>(config: &SgdConfig<E>) -> CudaSgdConfig<E> {
+fn sgd_config_to_cuda(config: &SgdConfig) -> CudaSgdConfig {
     let (momentum_type, momentum) = momentum_to_cuda(config.momentum);
     let (weight_decay_type, weight_decay) = weight_decay_to_cuda(config.weight_decay);
 
@@ -38,6 +38,12 @@ trait HasCudaKernel<E> {
     const FWD: &'static str;
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const MOD: &'static str = "sgd_f16";
+    const FWD: &'static str = "sgd_update_f16";
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const MOD: &'static str = "sgd_f32";
     const FWD: &'static str = "sgd_update_f32";
@@ -54,7 +60,7 @@ where
 {
     fn update(
         &self,
-        cfg: &SgdConfig<E>,
+        cfg: &SgdConfig,
         param: &mut Self::Vec<E>,
         velocity: &mut Self::Vec<E>,
         grad: &Self::Vec<E>,
diff --git a/src/optim/sgd/mod.rs b/src/optim/sgd/mod.rs
index be945d4c5..248548d5a 100644
--- a/src/optim/sgd/mod.rs
+++ b/src/optim/sgd/mod.rs
@@ -66,21 +66,21 @@ use super::optimizer::*;
 /// };
 /// ```
 #[derive(Debug, Clone, Copy)]
-pub struct SgdConfig<E> {
+pub struct SgdConfig {
     /// Learning rate. Defaults to `1e-2`
-    pub lr: E,
+    pub lr: f64,
 
     /// Optional momentum. Defaults to `None`.
-    pub momentum: Option<Momentum<E>>,
+    pub momentum: Option<Momentum>,
 
     /// Optional weight decay. Defaults to `None`.
-    pub weight_decay: Option<WeightDecay<E>>,
+    pub weight_decay: Option<WeightDecay>,
 }
 
-impl<E: Dtype> Default for SgdConfig<E> {
+impl Default for SgdConfig {
     fn default() -> Self {
         Self {
-            lr: E::from_f32(1e-2).unwrap(),
+            lr: 1e-2,
             momentum: None,
             weight_decay: None,
         }
@@ -114,7 +114,7 @@ impl Default for SgdConfig {
 #[derive(Debug)]
 pub struct Sgd<M, E: Dtype, D: DeviceStorage> {
     /// Hyperparameter configuration
-    pub cfg: SgdConfig<E>,
+    pub cfg: SgdConfig,
 
     velocity: Gradients<E, D>,
 
@@ -123,7 +123,7 @@ pub struct Sgd<M, E: Dtype, D: DeviceStorage> {
 
 impl<M, E: Dtype, D: DeviceStorage> Sgd<M, E, D> {
     /// Constructs using hyperparameters from `cfg`
-    pub fn new(_model: &M, cfg: SgdConfig<E>) -> Self {
+    pub fn new(_model: &M, cfg: SgdConfig) -> Self {
         Self {
             cfg,
             velocity: Gradients::leaky(),
@@ -135,7 +135,7 @@ impl<M, E: Dtype, D: DeviceStorage> Sgd<M, E, D> {
 pub trait SgdKernel<E: Dtype>: DeviceStorage {
     fn update(
         &self,
-        cfg: &SgdConfig<E>,
+        cfg: &SgdConfig,
         param: &mut Self::Vec<E>,
         velocity: &mut Self::Vec<E>,
         grad: &Self::Vec<E>,
@@ -228,7 +228,9 @@ mod tests {
         let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.ones();
         let mut sgd = Sgd::new(&t, Default::default());
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9998, 0.998, 0.996, 0.98, 0.8],
             [0.99960005, 0.99600005, 0.992, 0.96000004, 0.6],
@@ -258,7 +260,9 @@ mod tests {
             },
         );
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9998, 0.998, 0.996, 0.98, 0.8],
             [0.99950004, 0.995, 0.99, 0.95000005, 0.5],
@@ -288,7 +292,9 @@ mod tests {
             },
         );
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9997, 0.997, 0.994, 0.97, 0.70000005],
             [0.99935, 0.9935, 0.987, 0.935, 0.35000005],
@@ -327,7 +333,9 @@ mod tests {
             },
         );
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9988, 0.997, 0.995, 0.979, 0.799],
             [0.99760115, 0.994003, 0.990005, 0.958021, 0.59820104],
@@ -362,7 +370,9 @@ mod tests {
             },
         );
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9988, 0.997, 0.995, 0.979, 0.799],
             [0.9975012, 0.993003, 0.988005, 0.948021, 0.498201],
@@ -382,14 +392,14 @@ mod tests {
         let dev: TestDevice = Default::default();
 
         // adding l2_weight_decay should be equivalent to adding an L2 term to the loss
-        let weight_decay = 1e-1;
+
         let mut t: Tensor<Rank1<5>, TestDtype, _> = dev.ones();
         let mut sgd_l2 = Sgd::new(
             &t,
             SgdConfig {
                 lr: 1e-2,
                 momentum: Some(Momentum::Classic(0.5)),
-                weight_decay: Some(WeightDecay::L2(weight_decay)),
+                weight_decay: Some(WeightDecay::L2(1e-1)),
             },
         );
         let mut sgd = Sgd::new(
@@ -401,7 +411,9 @@ mod tests {
             },
         );
 
-        let rate = dev.tensor([0.1, 1.0, 2.0, 10.0, 100.0]);
+        let rate = dev
+            .tensor([0.1, 1.0, 2.0, 10.0, 100.0])
+            .to_dtype::<TestDtype>();
         let expected = [
             [0.9988, 0.997, 0.995, 0.979, 0.799],
             [0.9970012, 0.992503, 0.987505, 0.947521, 0.49770102],
@@ -419,7 +431,8 @@ mod tests {
         t = dev.ones();
         for e in expected.iter() {
             let normal_loss = (t.leaky_trace() * rate.clone()).mean();
-            let l2_loss = t.leaky_trace().powi(2).sum() * (weight_decay / (2.0));
+            let scale: TestDtype = NumCast::from(1e-1 / 2.0).unwrap();
+            let l2_loss = t.leaky_trace().powi(2).sum() * scale;
             let loss = l2_loss + normal_loss;
 
             let gradients = loss.backward();
diff --git a/src/optim/sgd/sgd.cu b/src/optim/sgd/sgd.cu
index ce33c00b9..226930011 100644
--- a/src/optim/sgd/sgd.cu
+++ b/src/optim/sgd/sgd.cu
@@ -1,3 +1,5 @@
+#include "cuda_fp16.h"
+
 enum MomentumType {
     None,
     Classic,
@@ -10,18 +12,17 @@ enum WeightDecayType {
     Decoupled
 };
 
-template<typename T>
 struct SgdConfig {
-    T lr;
+    double lr;
     MomentumType momentum_type;
-    T momentum;
+    double momentum;
     WeightDecayType weight_decay_type;
-    T weight_decay;
+    double weight_decay;
 };
 
 template<typename T>
 __device__ void sgd_update(
-    const SgdConfig<T> cfg,
+    const SgdConfig cfg,
     const size_t numel,
     T* param,
     T* velocity,
@@ -33,26 +34,30 @@ __device__ void sgd_update(
         return;
     }
 
+    T weight_decay = cfg.weight_decay;
+    T lr = cfg.lr;
+    T momentum = cfg.momentum;
+
     T p = param[i];
     T g = grad[i];
     T v = velocity[i];
 
     if (cfg.weight_decay_type == L2) {
-        g += cfg.weight_decay * p;
+        g += weight_decay * p;
     }
 
     if (cfg.momentum_type == Classic) {
-        v = g + cfg.momentum * v;
-        g = v * cfg.lr;
+        v = g + momentum * v;
+        g = v * lr;
     } else if (cfg.momentum_type == Nesterov) {
-        v = g + cfg.momentum * v;
-        g = (g + cfg.momentum * v) * cfg.lr;
+        v = g + momentum * v;
+        g = (g + momentum * v) * lr;
     } else {
-        g *= cfg.lr;
+        g *= lr;
     }
 
     if (cfg.weight_decay_type == Decoupled) {
-        g += cfg.weight_decay * cfg.lr * p;
+        g += weight_decay * lr * p;
     }
 
     velocity[i] = v;
@@ -61,7 +66,7 @@ __device__ void sgd_update(
 
 #define SGD(TYPENAME, FN) \
 extern "C" __global__ void FN( \
-    const SgdConfig<TYPENAME> cfg, \
+    const SgdConfig cfg, \
     const size_t numel, \
     TYPENAME* param, \
     TYPENAME* velocity, \
@@ -70,5 +75,6 @@ extern "C" __global__ void FN( \
     sgd_update(cfg, numel, param, velocity, grad); \
 }
 
+SGD(__half, sgd_update_f16);
SGD(float, sgd_update_f32);
SGD(double, sgd_update_f64);
diff --git a/src/shapes/shape.rs b/src/shapes/shape.rs
index df77aeeca..2adaf92b2 100644
--- a/src/shapes/shape.rs
+++ b/src/shapes/shape.rs
@@ -47,6 +47,8 @@ unit!(i64, 1);
 unit!(u128, 1);
 unit!(i128, 1);
 unit!(bool, true);
+#[cfg(feature = "f16")]
+unit!(half::f16, half::f16::ONE);
 
 /// Represents something that has a [Unit].
 pub trait HasUnitType {
@@ -85,6 +87,8 @@ impl Dtype for u32 {}
 impl Dtype for u64 {}
 impl Dtype for u128 {}
 impl Dtype for usize {}
+#[cfg(feature = "f16")]
+impl Dtype for half::f16 {}
 
 /// Represents something that has a [Dtype].
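With the `unit!` and `Dtype` impls above, `half::f16` flows through the normal tensor constructors once the `f16` feature is enabled. A small sketch (assumes the dfdx prelude; `dev.tensor` with an `f16` array is the same `TensorFrom` API used for `f32`):

```rust
use half::f16;

let dev: Cpu = Default::default();
// f16 values come from the half crate's consts/conversions.
let t = dev.tensor([f16::ONE, f16::ZERO, f16::from_f32(0.5)]);
assert_eq!(t.array(), [f16::ONE, f16::ZERO, f16::from_f32(0.5)]);
```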
pub trait HasDtype { diff --git a/src/tensor/mod.rs b/src/tensor/mod.rs index e9eb2f9f3..3ff661729 100644 --- a/src/tensor/mod.rs +++ b/src/tensor/mod.rs @@ -184,7 +184,7 @@ pub use gradients::{Gradients, Merge, NoneTape, OwnedTape, Tape}; mod tests { use super::*; use crate::shapes::*; - use crate::tests::{TestDevice, TestDtype}; + use crate::tests::*; use std::collections::HashSet; #[test] @@ -283,80 +283,64 @@ mod tests { #[test] fn test_upper_tri() { let dev: TestDevice = Default::default(); - let vl: TestDtype = 42.0; + let a: TestDtype = NumCast::from(42.0).unwrap(); + let z = TestDtype::zero(); - assert_eq!(dev.upper_tri::(vl, None).array(), vl); - assert_eq!(dev.upper_tri::(vl, 1).array(), 0.); + assert_eq!(dev.upper_tri::(a, None).array(), a); + assert_eq!(dev.upper_tri::(a, 1).array(), z); + assert_eq!(dev.upper_tri::>(a, None).array(), [a, a, a]); + assert_eq!(dev.upper_tri::>(a, 1).array(), [z, a, a]); - assert_eq!(dev.upper_tri::>(vl, None).array(), [vl, vl, vl]); - assert_eq!(dev.upper_tri::>(vl, 1).array(), [0., vl, vl]); - - assert_eq!( - dev.upper_tri::>(vl, None).array(), - [[vl, vl, vl, vl], [0., vl, vl, vl], [0., 0., vl, vl]] - ); - assert_eq!( - dev.upper_tri::>(vl, None).array(), - [[vl], [0.], [0.]] - ); assert_eq!( - dev.upper_tri::>(vl, 1).array(), - [[0.], [0.], [0.]] + dev.upper_tri::>(a, None).array(), + [[a, a, a, a], [z, a, a, a], [z, z, a, a]] ); assert_eq!( - dev.upper_tri::>(vl, -1).array(), - [[vl], [vl], [0.]] + dev.upper_tri::>(a, None).array(), + [[a], [z], [z]] ); + assert_eq!(dev.upper_tri::>(a, 1).array(), [[z], [z], [z]]); + assert_eq!(dev.upper_tri::>(a, -1).array(), [[a], [a], [z]]); assert_eq!( - dev.upper_tri::>(vl, -1).array(), - [ - [vl, vl, vl, vl], - [vl, vl, vl, vl], - [0., vl, vl, vl], - [0., 0., vl, vl] - ] + dev.upper_tri::>(a, -1).array(), + [[a, a, a, a], [a, a, a, a], [z, a, a, a], [z, z, a, a]] ); assert_eq!( - dev.upper_tri::>(vl, -2).array(), - [ - [vl, vl, vl, vl], - [vl, vl, vl, vl], - [vl, vl, vl, vl], - [0., vl, vl, vl] - ] + dev.upper_tri::>(a, -2).array(), + [[a, a, a, a], [a, a, a, a], [a, a, a, a], [z, a, a, a]] ); assert_eq!( - dev.upper_tri::>(vl, 1).array(), - [[0., vl, vl], [0., 0., vl], [0., 0., 0.], [0., 0., 0.]] + dev.upper_tri::>(a, 1).array(), + [[z, a, a], [z, z, a], [z, z, z], [z, z, z]] ); assert_eq!( - dev.upper_tri::>(vl, None).array(), + dev.upper_tri::>(a, None).array(), [[ - [vl, vl, vl, vl, vl], - [0., vl, vl, vl, vl], - [0., 0., vl, vl, vl], - [0., 0., 0., vl, vl], - [0., 0., 0., 0., vl] + [a, a, a, a, a], + [z, a, a, a, a], + [z, z, a, a, a], + [z, z, z, a, a], + [z, z, z, z, a] ]; 2] ); assert_eq!( - dev.upper_tri::>(vl, 2).array(), + dev.upper_tri::>(a, 2).array(), [[ - [0., 0., vl, vl, vl], - [0., 0., 0., vl, vl], - [0., 0., 0., 0., vl], - [0., 0., 0., 0., 0.], - [0., 0., 0., 0., 0.] 
+ [z, z, a, a, a], + [z, z, z, a, a], + [z, z, z, z, a], + [z, z, z, z, z], + [z, z, z, z, z] ]; 4] ); assert_eq!( - dev.upper_tri::>(vl, None).array(), + dev.upper_tri::>(a, None).array(), [[[ - [vl, vl, vl, vl, vl, vl], - [0., vl, vl, vl, vl, vl], - [0., 0., vl, vl, vl, vl], - [0., 0., 0., vl, vl, vl], - [0., 0., 0., 0., vl, vl] + [a, a, a, a, a, a], + [z, a, a, a, a, a], + [z, z, a, a, a, a], + [z, z, z, a, a, a], + [z, z, z, z, a, a] ]; 4]; 3] ); } @@ -364,80 +348,64 @@ mod tests { #[test] fn test_lower_tri() { let dev: TestDevice = Default::default(); - let vl: TestDtype = 42.0; + let a: TestDtype = NumCast::from(42.0).unwrap(); + let z = TestDtype::zero(); - assert_eq!(dev.lower_tri::(vl, None).array(), vl); - assert_eq!(dev.lower_tri::(vl, -1).array(), 0.); + assert_eq!(dev.lower_tri::(a, None).array(), a); + assert_eq!(dev.lower_tri::(a, -1).array(), z); + assert_eq!(dev.lower_tri::>(a, None).array(), [a, z, z]); + assert_eq!(dev.lower_tri::>(a, 1).array(), [a, a, z]); - assert_eq!(dev.lower_tri::>(vl, None).array(), [vl, 0., 0.]); - assert_eq!(dev.lower_tri::>(vl, 1).array(), [vl, vl, 0.]); - - assert_eq!( - dev.lower_tri::>(vl, None).array(), - [[vl, 0., 0., 0.], [vl, vl, 0., 0.], [vl, vl, vl, 0.]] - ); - assert_eq!( - dev.lower_tri::>(vl, None).array(), - [[vl], [vl], [vl]] - ); assert_eq!( - dev.lower_tri::>(vl, 1).array(), - [[vl], [vl], [vl]] + dev.lower_tri::>(a, None).array(), + [[a, z, z, z], [a, a, z, z], [a, a, a, z]] ); assert_eq!( - dev.lower_tri::>(vl, -1).array(), - [[0.], [vl], [vl]] + dev.lower_tri::>(a, None).array(), + [[a], [a], [a]] ); + assert_eq!(dev.lower_tri::>(a, 1).array(), [[a], [a], [a]]); + assert_eq!(dev.lower_tri::>(a, -1).array(), [[z], [a], [a]]); assert_eq!( - dev.lower_tri::>(vl, -1).array(), - [ - [0., 0., 0., 0.], - [vl, 0., 0., 0.], - [vl, vl, 0., 0.], - [vl, vl, vl, 0.] - ] + dev.lower_tri::>(a, -1).array(), + [[z, z, z, z], [a, z, z, z], [a, a, z, z], [a, a, a, z]] ); assert_eq!( - dev.lower_tri::>(vl, -2).array(), - [ - [0., 0., 0., 0.], - [0., 0., 0., 0.], - [vl, 0., 0., 0.], - [vl, vl, 0., 0.] - ] + dev.lower_tri::>(a, -2).array(), + [[z, z, z, z], [z, z, z, z], [a, z, z, z], [a, a, z, z]] ); assert_eq!( - dev.lower_tri::>(vl, 1).array(), - [[vl, vl, 0.], [vl, vl, vl], [vl, vl, vl], [vl, vl, vl]] + dev.lower_tri::>(a, 1).array(), + [[a, a, z], [a, a, a], [a, a, a], [a, a, a]] ); assert_eq!( - dev.lower_tri::>(vl, None).array(), + dev.lower_tri::>(a, None).array(), [[ - [vl, 0., 0., 0., 0.], - [vl, vl, 0., 0., 0.], - [vl, vl, vl, 0., 0.], - [vl, vl, vl, vl, 0.], - [vl, vl, vl, vl, vl] + [a, z, z, z, z], + [a, a, z, z, z], + [a, a, a, z, z], + [a, a, a, a, z], + [a, a, a, a, a] ]; 2] ); assert_eq!( - dev.lower_tri::>(vl, 2).array(), + dev.lower_tri::>(a, 2).array(), [[ - [vl, vl, vl, 0., 0.], - [vl, vl, vl, vl, 0.], - [vl, vl, vl, vl, vl], - [vl, vl, vl, vl, vl], - [vl, vl, vl, vl, vl] + [a, a, a, z, z], + [a, a, a, a, z], + [a, a, a, a, a], + [a, a, a, a, a], + [a, a, a, a, a] ]; 4] ); assert_eq!( - dev.lower_tri::>(vl, None).array(), + dev.lower_tri::>(a, None).array(), [[[ - [vl, 0., 0., 0., 0., 0.], - [vl, vl, 0., 0., 0., 0.], - [vl, vl, vl, 0., 0., 0.], - [vl, vl, vl, vl, 0., 0.], - [vl, vl, vl, vl, vl, 0.] 
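// (The `a`/`z` rewrite running through these triangle tests is what makes
// them dtype-generic: float literals like `42.0` and `0.` do not exist for
// half::f16, so the two values are produced once via num-traits and reused:
//
//     use num_traits::{NumCast, Zero};
//     let a: TestDtype = NumCast::from(42.0).unwrap(); // 42.0 in the test dtype
//     let z = TestDtype::zero();                       // additive identity
// )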
+ [a, z, z, z, z, z], + [a, a, z, z, z, z], + [a, a, a, z, z, z], + [a, a, a, a, z, z], + [a, a, a, a, a, z] ]; 4]; 3] ); } diff --git a/src/tensor_ops/abs/abs.cu b/src/tensor_ops/abs/abs.cu index 045e8fa15..cc6773a6d 100644 --- a/src/tensor_ops/abs/abs.cu +++ b/src/tensor_ops/abs/abs.cu @@ -2,10 +2,14 @@ struct AbsKernelOp {}; +UNARY_OP(__half, abs_fwd_f16, abs_bwd_f16, AbsKernelOp, + absg(x), + x == __float2half(0.0) ? __float2half(0.0) : copysigng(__float2half(1.0), x)); + UNARY_OP(float, abs_fwd_f32, abs_bwd_f32, AbsKernelOp, - fabsf(x), - x == 0.0 ? 0.0 : copysignf(1.0, x)); + absg(x), + x == 0.0 ? 0.0f : copysigng(1.0f, x)); UNARY_OP(double, abs_fwd_f64, abs_bwd_f64, AbsKernelOp, - fabs(x), - x == 0.0 ? 0.0 : copysign(1.0, x)); + absg(x), + x == 0.0 ? 0.0 : copysigng(1.0, x)); diff --git a/src/tensor_ops/abs/cuda_kernel.rs b/src/tensor_ops/abs/cuda_kernel.rs index 275738ced..773da7c54 100644 --- a/src/tensor_ops/abs/cuda_kernel.rs +++ b/src/tensor_ops/abs/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for AbsKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/abs.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(AbsKernelOp, half::f16, PTX, "abs_fwd_f16", "abs_bwd_f16"); cuda_unary!(AbsKernelOp, f32, PTX, "abs_fwd_f32", "abs_bwd_f32"); cuda_unary!(AbsKernelOp, f64, PTX, "abs_fwd_f64", "abs_bwd_f64"); diff --git a/src/tensor_ops/abs/mod.rs b/src/tensor_ops/abs/mod.rs index 61ca7a3a7..6d4d663dd 100644 --- a/src/tensor_ops/abs/mod.rs +++ b/src/tensor_ops/abs/mod.rs @@ -46,7 +46,9 @@ mod tests { #[test] fn test_abs() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = x.leaky_trace().abs(); assert_close_to_literal!(r, [2.0, 1.0, 0.0, 1.0, 2.0]); let g = r.mean().backward(); diff --git a/src/tensor_ops/add/binary_add.cu b/src/tensor_ops/add/binary_add.cu index de6ca68db..d749bc13b 100644 --- a/src/tensor_ops/add/binary_add.cu +++ b/src/tensor_ops/add/binary_add.cu @@ -2,6 +2,11 @@ struct BinaryAddOp {}; +BINARY_OP(__half, badd_fwd_f16, badd_bwd_lhs_f16, badd_bwd_rhs_f16, BinaryAddOp, + x + y, + 1.0, + 1.0) + BINARY_OP(float, badd_fwd_f32, badd_bwd_lhs_f32, badd_bwd_rhs_f32, BinaryAddOp, x + y, 1.0, diff --git a/src/tensor_ops/add/cuda_kernel.rs b/src/tensor_ops/add/cuda_kernel.rs index ce87ac1bd..a2b399643 100644 --- a/src/tensor_ops/add/cuda_kernel.rs +++ b/src/tensor_ops/add/cuda_kernel.rs @@ -1,6 +1,8 @@ use super::{BinaryAddKernelOp as Binary, ScalarAddKernelOp as Scalar}; use crate::tensor_ops::cuda_kernels::{cuda_binary, cuda_unary}; +#[cfg(feature = "f16")] +unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Binary {} @@ -8,8 +10,19 @@ unsafe impl cudarc::driver::DeviceRepr for Binary {} const SCALAR_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/scalar_add.ptx")); const BINARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/binary_add.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(const_df() Scalar, half::f16, SCALAR_PTX, "sadd_fwd_f16", "sadd_bwd_f16"); cuda_unary!(const_df() Scalar, f32, SCALAR_PTX, "sadd_fwd_f32", "sadd_bwd_f32"); cuda_unary!(const_df() Scalar, f64, SCALAR_PTX, "sadd_fwd_f64", "sadd_bwd_f64"); +#[cfg(feature = "f16")] +cuda_binary!( + const_df() Binary, + half::f16, + BINARY_PTX, + "badd_fwd_f16", + "badd_bwd_lhs_f16", + 
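// (This fwd/bwd_lhs/bwd_rhs triple is the registration recipe repeated for
// every binary op in this PR: the .cu file stamps out the __half kernels with
// BINARY_OP(__half, ...), and the Rust side wires them up behind the feature
// gate, plus one DeviceRepr impl so the zero-sized op struct can be passed to
// the kernel. For a hypothetical new op the pattern would be:
//
//     #[cfg(feature = "f16")]
//     cuda_binary!(MyOp, half::f16, PTX,
//         "myop_fwd_f16", "myop_bwd_lhs_f16", "myop_bwd_rhs_f16");
// )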
"badd_bwd_rhs_f16" +); cuda_binary!( const_df() Binary, f32, diff --git a/src/tensor_ops/add/mod.rs b/src/tensor_ops/add/mod.rs index 402cf8952..0ab9dd5e6 100644 --- a/src/tensor_ops/add/mod.rs +++ b/src/tensor_ops/add/mod.rs @@ -74,6 +74,17 @@ impl, E>, T: Tape> } } +#[cfg(feature = "f16")] +impl, half::f16>, T: Tape> + TryAdd for Tensor +{ + /// See [add] + fn try_add(self, rhs: f32) -> Result { + let scalar = half::f16::from_f32(rhs); + try_unary_op(ScalarAddKernelOp { scalar }, self) + } +} + impl, Rhs> std::ops::Add for Tensor where @@ -93,8 +104,8 @@ mod tests { #[test] fn test_add_0d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor(1.0); - let b: Tensor<_, TestDtype, _> = dev.tensor(1.0); + let a = dev.tensor(1.0f64).to_dtype::(); + let b = dev.tensor(1.0f64).to_dtype::(); let r = a.leaky_trace() + b.clone(); assert_close_to_literal!(r, 2.0); @@ -106,8 +117,8 @@ mod tests { #[test] fn test_add_1d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); - let b: Tensor<_, TestDtype, _> = dev.tensor([1.0, -1.0, 0.0]); + let a = dev.tensor([1.0f64, 2.0, 3.0]).to_dtype::(); + let b = dev.tensor([1.0f64, -1.0, 0.0]).to_dtype::(); let r = a.leaky_trace() + b.clone(); assert_close_to_literal!(r, [2.0, 1.0, 3.0]); @@ -119,10 +130,12 @@ mod tests { #[test] fn test_add_2d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = - dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]); - let b: Tensor<_, TestDtype, _> = - dev.tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]); + let a = dev + .tensor([[0.6570f64, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]) + .to_dtype::(); + let b = dev + .tensor([[0.5199f64, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]) + .to_dtype::(); let r = a.leaky_trace() + b.clone(); assert_close_to_literal!(r, [[1.1769, 0.5552, 0.5259], [1.3917, 1.0692, 0.873]]); @@ -134,10 +147,12 @@ mod tests { #[test] fn test_add_broadcast_bottom() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = - dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]); - let b: Tensor<_, TestDtype, _> = - dev.tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]); + let a = dev + .tensor([[0.6570f64, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]) + .to_dtype::(); + let b = dev + .tensor([[0.5199f64, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]) + .to_dtype::(); let a2 = a.broadcast::, _>(); let b2 = b.broadcast::, _>(); @@ -158,10 +173,12 @@ mod tests { #[test] fn test_add_broadcast_top() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = - dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]); - let b: Tensor<_, TestDtype, _> = - dev.tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]); + let a = dev + .tensor([[0.6570f64, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]) + .to_dtype::(); + let b = dev + .tensor([[0.5199f64, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]) + .to_dtype::(); let a2 = a.broadcast::, _>(); let b2 = b.broadcast::, _>(); @@ -176,7 +193,7 @@ mod tests { #[test] fn test_scalar_add_0d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor(0.0); + let x: Tensor<(), TestDtype, _> = dev.zeros(); let r = x.leaky_trace() + 1.0; assert_close_to_literal!(r, 1.0); let g = r.exp().backward(); @@ -186,7 +203,7 @@ mod tests { #[test] fn test_scalar_add_1d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = 
dev.tensor([0.0, 1.0, 2.0]); + let x = dev.tensor([0.0, 1.0, 2.0]).to_dtype::(); let r = x.leaky_trace() + 0.5; assert_close_to_literal!(r, [0.5, 1.5, 2.5]); let g = r.exp().sum().backward(); @@ -196,7 +213,7 @@ mod tests { #[test] fn test_scalar_add_2d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([[0.0; 2]; 3]); + let x = dev.tensor([[0.0; 2]; 3]).to_dtype::(); let r = x.leaky_trace() + 0.5; assert_close_to_literal!(r, [[0.5; 2]; 3]); let g = r.exp().sum().backward(); diff --git a/src/tensor_ops/add/scalar_add.cu b/src/tensor_ops/add/scalar_add.cu index d82a3c7a7..3a3c7d63d 100644 --- a/src/tensor_ops/add/scalar_add.cu +++ b/src/tensor_ops/add/scalar_add.cu @@ -5,6 +5,10 @@ struct ScalarAddKernelOp { F scalar; }; +UNARY_OP(__half, sadd_fwd_f16, sadd_bwd_f16, ScalarAddKernelOp<__half>, + x + op.scalar, + 1.0); + UNARY_OP(float, sadd_fwd_f32, sadd_bwd_f32, ScalarAddKernelOp, x + op.scalar, 1.0); diff --git a/src/tensor_ops/attention_reshape/attention_reshape.cu b/src/tensor_ops/attention_reshape/attention_reshape.cu index 763d3e142..27c51b5cd 100644 --- a/src/tensor_ops/attention_reshape/attention_reshape.cu +++ b/src/tensor_ops/attention_reshape/attention_reshape.cu @@ -64,6 +64,17 @@ __device__ void attention_reshape( } } +extern "C" __global__ void attention_reshape_f16( + const AttentionReshapeOp op, + const __half *qkv, + const __half *past_key, + const __half *past_value, + __half *query, + __half *key, + __half *value +) { + attention_reshape(op, qkv, past_key, past_value, query, key, value); +} extern "C" __global__ void attention_reshape_f32( const AttentionReshapeOp op, diff --git a/src/tensor_ops/attention_reshape/cuda_kernel.rs b/src/tensor_ops/attention_reshape/cuda_kernel.rs index a051cbd95..b4a7b37b1 100644 --- a/src/tensor_ops/attention_reshape/cuda_kernel.rs +++ b/src/tensor_ops/attention_reshape/cuda_kernel.rs @@ -19,6 +19,11 @@ trait HasCudaKernel { const FN: &'static str; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const FN: &'static str = "attention_reshape_f16"; +} + impl HasCudaKernel for Cuda { const FN: &'static str = "attention_reshape_f32"; } diff --git a/src/tensor_ops/attention_reshape/mod.rs b/src/tensor_ops/attention_reshape/mod.rs index ad600a1aa..7e26fcb70 100644 --- a/src/tensor_ops/attention_reshape/mod.rs +++ b/src/tensor_ops/attention_reshape/mod.rs @@ -89,7 +89,7 @@ impl> TryAttentionReshape for D { #[cfg(test)] mod tests { use super::*; - use crate::tests::*; + use crate::{tensor_ops::*, tests::*}; #[test] fn test_attention_reshape() { @@ -100,37 +100,57 @@ mod tests { let sequence_length = 1; let past_length = 3; - { - let qkv: Tensor<(usize, Const<{ NUM_HEADS * HEAD_DIM * 3 }>), TestDtype, _> = - dev.zeros_like(&(sequence_length, Const)) + 1.0; - let past_key: Tensor<(Const, Const, usize), TestDtype, _> = - dev.zeros_like(&(Const, Const, past_length)) + 2.0; - let past_value: Tensor<(Const, usize, Const), TestDtype, _> = - dev.zeros_like(&(Const, past_length, Const)) + 3.0; + let qkv: Tensor<(usize, Const<{ NUM_HEADS * HEAD_DIM * 3 }>), TestDtype, _> = + dev.zeros_like(&(sequence_length, Const)) + 1.0; + let past_key: Tensor<(Const, Const, usize), TestDtype, _> = + dev.zeros_like(&(Const, Const, past_length)) + 2.0; + let past_value: Tensor<(Const, usize, Const), TestDtype, _> = + dev.zeros_like(&(Const, past_length, Const)) + 3.0; - let (q, k, v) = dev.attention_reshape(&qkv, &past_key, &past_value); + let (q, k, v) = dev.attention_reshape(&qkv, &past_key, &past_value); - 
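// (The assertions below are being replaced rather than just re-typed: exact
// `as_vec()` equality cannot survive an f16 round-trip, so the rewrite uses
// `assert_close_to_literal!`, which compares within the dtype's tolerance,
// after `realize`-ing the runtime `usize` dimensions into `Const` shapes so
// the literal arrays type-check. The realize step on its own, with the
// concrete sizes from this test (2 heads, 3-dim heads, 3 past + 1 new):
//
//     let k = k.realize::<(Const<2>, Const<3>, Const<4>)>().unwrap();
//     assert_close_to_literal!(k, [[[2.0, 2.0, 2.0, 1.0]; 3]; 2]);
// )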
assert_eq!(q.as_vec(), std::vec![1.0; 6]); - #[rustfmt::skip] - assert_eq!( - k.as_vec(), - std::vec![ - 2.0, 2.0, 2.0, 1.0, - 2.0, 2.0, 2.0, 1.0, - 2.0, 2.0, 2.0, 1.0, - 2.0, 2.0, 2.0, 1.0, - 2.0, 2.0, 2.0, 1.0, - 2.0, 2.0, 2.0, 1.0 + let q = q + .realize::<(Const, Const<1>, Const)>() + .unwrap(); + let k = k + .realize::<(Const, Const, Const<4>)>() + .unwrap(); + let v = v + .realize::<(Const, Const<4>, Const)>() + .unwrap(); + + assert_close_to_literal!(q, [[[1.0; HEAD_DIM]; 1]; NUM_HEADS]); + assert_close_to_literal!( + k, + [ + [ + [2.0, 2.0, 2.0, 1.0], + [2.0, 2.0, 2.0, 1.0], + [2.0, 2.0, 2.0, 1.0] + ], + [ + [2.0, 2.0, 2.0, 1.0], + [2.0, 2.0, 2.0, 1.0], + [2.0, 2.0, 2.0, 1.0] + ] ] ); - #[rustfmt::skip] - assert_eq!( - v.as_vec(), - std::vec![ - 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 1.0, 1.0, - 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 1.0, 1.0 + assert_close_to_literal!( + v, + [ + [ + [3.0, 3.0, 3.0], + [3.0, 3.0, 3.0], + [3.0, 3.0, 3.0], + [1.0, 1.0, 1.0] + ], + [ + [3.0, 3.0, 3.0], + [3.0, 3.0, 3.0], + [3.0, 3.0, 3.0], + [1.0, 1.0, 1.0] + ] ] ); - } } } diff --git a/src/tensor_ops/axpy/axpy.cu b/src/tensor_ops/axpy/axpy.cu index 487541a97..9e6907757 100644 --- a/src/tensor_ops/axpy/axpy.cu +++ b/src/tensor_ops/axpy/axpy.cu @@ -1,3 +1,5 @@ +#include "cuda_fp16.h" + template __device__ void axpy(const size_t n, T* a, const T alpha, const T* b, const T beta) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -7,6 +9,10 @@ __device__ void axpy(const size_t n, T* a, const T alpha, const T* b, const T be a[i] = a[i] * alpha + b[i] * beta; } +extern "C" __global__ void axpy_f16(const size_t n, __half* a, const __half alpha, const __half* b, const __half beta) { + axpy(n, a, alpha, b, beta); +} + extern "C" __global__ void axpy_f32(const size_t n, float* a, const float alpha, const float* b, const float beta) { axpy(n, a, alpha, b, beta); } diff --git a/src/tensor_ops/axpy/cuda_kernel.rs b/src/tensor_ops/axpy/cuda_kernel.rs index 91b371ca8..57841a32d 100644 --- a/src/tensor_ops/axpy/cuda_kernel.rs +++ b/src/tensor_ops/axpy/cuda_kernel.rs @@ -10,6 +10,10 @@ const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/axpy.ptx")); trait HasCudaKernel { const FN: &'static str; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const FN: &'static str = "axpy_f16"; +} impl HasCudaKernel for Cuda { const FN: &'static str = "axpy_f32"; } diff --git a/src/tensor_ops/axpy/mod.rs b/src/tensor_ops/axpy/mod.rs index 8299eb2ae..b78c3ea45 100644 --- a/src/tensor_ops/axpy/mod.rs +++ b/src/tensor_ops/axpy/mod.rs @@ -1,5 +1,5 @@ use crate::{ - shapes::{Shape, Unit}, + shapes::{Dtype, Shape}, tensor::{DeviceStorage, Tensor}, }; @@ -10,11 +10,11 @@ mod cuda_kernel; /// Elementwise `a * alpha + b * beta`. /// /// See [Tensor::axpy] for in place version. -pub fn axpy( +pub fn axpy( a: &Tensor, - alpha: impl Into, + alpha: impl Into, b: &Tensor, - beta: impl Into, + beta: impl Into, ) -> Tensor where D: AxpyKernel, @@ -24,31 +24,31 @@ where dst } -impl> Tensor { +impl> Tensor { /// Updates self with elementwise function `self = self * alpha + b * beta`. - pub fn axpy(&mut self, alpha: impl Into, b: &Tensor, beta: impl Into) { + pub fn axpy(&mut self, alpha: impl Into, b: &Tensor, beta: impl Into) { self.try_axpy(alpha, b, beta).unwrap() } /// Updates self with elementwise function `self = self * alpha + b * beta`. 
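// (axpy follows the same dtype-erasure rule as the optimizer configs: the
// public signatures take `impl Into<f64>`, and the f64 is narrowed to the
// element type exactly once, inside try_axpy:
//
//     E::from_f64(alpha.into()).unwrap()
//
// so `a.axpy(0.01, &b, 0.99)` keeps working whether the tensor holds f16,
// f32, or f64 elements.)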
pub fn try_axpy( &mut self, - alpha: impl Into, + alpha: impl Into, b: &Tensor, - beta: impl Into, + beta: impl Into, ) -> Result<(), D::Err> { assert_eq!(self.shape, b.shape); assert_eq!(self.strides, b.strides, "Strides must be equal for axpy"); self.device.clone().forward( std::sync::Arc::make_mut(&mut self.data), - alpha.into(), + E::from_f64(alpha.into()).unwrap(), b.data.as_ref(), - beta.into(), + E::from_f64(beta.into()).unwrap(), ) } } -pub trait AxpyKernel: DeviceStorage { +pub trait AxpyKernel: DeviceStorage { fn forward( &self, a: &mut Self::Vec, @@ -85,8 +85,10 @@ mod tests { fn test_axpy() { let dev: TestDevice = Default::default(); - let mut a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, -1.0, 0.0, 1.0, 2.0]; 2]); - let b: Tensor<_, TestDtype, _> = dev.tensor([[-1.5; 5], [1.5; 5]]); + let mut a = dev + .tensor([[-2.0, -1.0, 0.0, 1.0, 2.0]; 2]) + .to_dtype::(); + let b = dev.tensor([[-1.5; 5], [1.5; 5]]).to_dtype::(); a.axpy(0.01, &b, 0.99); diff --git a/src/tensor_ops/bce/bce.cu b/src/tensor_ops/bce/bce.cu index 081f13249..dc1d7de3d 100644 --- a/src/tensor_ops/bce/bce.cu +++ b/src/tensor_ops/bce/bce.cu @@ -5,12 +5,14 @@ struct BCEKernelOp {}; template __device__ T op_f(T logit, T prob) { T zero = 0.0; - return maxg(logit, zero) - logit * prob + logg(1.0 + expg(-absg(logit))); + T one = 1.0; + return maxg(logit, zero) - logit * prob + logg(one + expg(-absg(logit))); } template __device__ T op_dfdx(T logit, T prob) { - return 1.0 - prob - 1 / (1.0 + expg(logit)); + T one = 1.0; + return one - prob - one / (one + expg(logit)); } template @@ -18,6 +20,12 @@ __device__ T op_dfdy(T logit, T prob) { return -logit; } +BINARY_OP(__half, bce_fwd_f16, bce_bwd_lhs_f16, bce_bwd_rhs_f16, BCEKernelOp, + __float2half(op_f(__half2float(x), __half2float(y))), + op_dfdx(x, y), + op_dfdy(x, y) +) + BINARY_OP(float, bce_fwd_f32, bce_bwd_lhs_f32, bce_bwd_rhs_f32, BCEKernelOp, op_f(x, y), op_dfdx(x, y), diff --git a/src/tensor_ops/bce/cuda_kernel.rs b/src/tensor_ops/bce/cuda_kernel.rs index 35c091de9..f55d2ac43 100644 --- a/src/tensor_ops/bce/cuda_kernel.rs +++ b/src/tensor_ops/bce/cuda_kernel.rs @@ -5,6 +5,15 @@ unsafe impl cudarc::driver::DeviceRepr for BCEKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/bce.ptx")); +#[cfg(feature = "f16")] +cuda_binary!( + BCEKernelOp, + half::f16, + PTX, + "bce_fwd_f16", + "bce_bwd_lhs_f16", + "bce_bwd_rhs_f16" +); cuda_binary!( BCEKernelOp, f32, diff --git a/src/tensor_ops/bce/mod.rs b/src/tensor_ops/bce/mod.rs index 545cd2798..3053f045f 100644 --- a/src/tensor_ops/bce/mod.rs +++ b/src/tensor_ops/bce/mod.rs @@ -61,14 +61,18 @@ mod tests { #[test] fn test_bce() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([ - [-0.8424031, 0.6309481, 1.0416432], - [1.325225, 0.5840275, 1.9167633], - ]); - let b: Tensor<_, TestDtype, _> = dev.tensor([ - [0.52022195, 0.578804, 0.17535722], - [0.75429636, 0.66566986, 0.6182751], - ]); + let a = dev + .tensor([ + [-0.8424031, 0.6309481, 1.0416432], + [1.325225, 0.5840275, 1.9167633], + ]) + .to_dtype::(); + let b = dev + .tensor([ + [0.52022195, 0.578804, 0.17535722], + [0.75429636, 0.66566986, 0.6182751], + ]) + .to_dtype::(); let r = a.leaky_trace().bce_with_logits(b); assert_close_to_literal!( r, diff --git a/src/tensor_ops/choose/choose.cu b/src/tensor_ops/choose/choose.cu index d2e5a6187..799af246f 100644 --- a/src/tensor_ops/choose/choose.cu +++ b/src/tensor_ops/choose/choose.cu @@ -83,5 +83,6 @@ extern "C" __global__ void BWD( \ choose_bwd(numel, num_dims, 
dims, cond, cond_strides, grad_lhs, lhs_strides, grad_rhs, rhs_strides, grad_out); \ } +CHOOSE(__half, choose_fwd_f16, choose_bwd_f16); CHOOSE(float, choose_fwd_f32, choose_bwd_f32); CHOOSE(double, choose_fwd_f64, choose_bwd_f64); diff --git a/src/tensor_ops/choose/cuda_kernel.rs b/src/tensor_ops/choose/cuda_kernel.rs index d9f8f8f74..6a3c12794 100644 --- a/src/tensor_ops/choose/cuda_kernel.rs +++ b/src/tensor_ops/choose/cuda_kernel.rs @@ -11,6 +11,12 @@ pub(crate) trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "choose_f16"; + const FNS: &'static [&'static str] = &["choose_fwd_f16", "choose_bwd_f16"]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "choose_f32"; const FNS: &'static [&'static str] = &["choose_fwd_f32", "choose_bwd_f32"]; diff --git a/src/tensor_ops/clamp/clamp.cu b/src/tensor_ops/clamp/clamp.cu index d61e09aa5..f5f90b8e3 100644 --- a/src/tensor_ops/clamp/clamp.cu +++ b/src/tensor_ops/clamp/clamp.cu @@ -6,11 +6,15 @@ struct ClampKernelOp { F max; }; +UNARY_OP(__half, clamp_fwd_f16, clamp_bwd_f16, ClampKernelOp<__half>, + maxg(ming(x, op.max), op.min), + x <= op.max && x >= op.min ? 1.0 : 0.0) + UNARY_OP(float, clamp_fwd_f32, clamp_bwd_f32, ClampKernelOp, - fmaxf(fminf(x, op.max), op.min), + maxg(ming(x, op.max), op.min), x <= op.max && x >= op.min ? 1.0 : 0.0) UNARY_OP(double, clamp_fwd_f64, clamp_bwd_f64, ClampKernelOp, - fmax(fmin(x, op.max), op.min), + maxg(ming(x, op.max), op.min), x <= op.max && x >= op.min ? 1.0 : 0.0) \ No newline at end of file diff --git a/src/tensor_ops/clamp/cuda_kernel.rs b/src/tensor_ops/clamp/cuda_kernel.rs index 890014fef..c8abf5248 100644 --- a/src/tensor_ops/clamp/cuda_kernel.rs +++ b/src/tensor_ops/clamp/cuda_kernel.rs @@ -1,10 +1,20 @@ use super::ClampKernelOp; use crate::tensor_ops::cuda_kernels::cuda_unary; +#[cfg(feature = "f16")] +unsafe impl cudarc::driver::DeviceRepr for ClampKernelOp {} unsafe impl cudarc::driver::DeviceRepr for ClampKernelOp {} unsafe impl cudarc::driver::DeviceRepr for ClampKernelOp {} const P: &str = include_str!(concat!(env!("OUT_DIR"), "/clamp.ptx")); +#[cfg(feature = "f16")] +cuda_unary!( + ClampKernelOp, + half::f16, + P, + "clamp_fwd_f16", + "clamp_bwd_f16" +); cuda_unary!(ClampKernelOp, f32, P, "clamp_fwd_f32", "clamp_bwd_f32"); cuda_unary!(ClampKernelOp, f64, P, "clamp_fwd_f64", "clamp_bwd_f64"); diff --git a/src/tensor_ops/clamp/mod.rs b/src/tensor_ops/clamp/mod.rs index ae7901fc4..b58b624aa 100644 --- a/src/tensor_ops/clamp/mod.rs +++ b/src/tensor_ops/clamp/mod.rs @@ -25,23 +25,23 @@ pub struct ClampKernelOp { /// ``` pub fn clamp, E>, T: Tape>( t: Tensor, - min: impl Into, - max: impl Into, + min: impl Into, + max: impl Into, ) -> Tensor { t.clamp(min, max) } impl, E>, T: Tape> Tensor { /// See [clamp] - pub fn clamp(self, min: impl Into, max: impl Into) -> Self { + pub fn clamp(self, min: impl Into, max: impl Into) -> Self { self.try_clamp(min, max).unwrap() } /// See [clamp] - pub fn try_clamp(self, min: impl Into, max: impl Into) -> Result { + pub fn try_clamp(self, min: impl Into, max: impl Into) -> Result { try_unary_op( ClampKernelOp { - min: min.into(), - max: max.into(), + min: E::from_f64(min.into()).unwrap(), + max: E::from_f64(max.into()).unwrap(), }, self, ) @@ -55,7 +55,9 @@ mod tests { #[test] fn test_clamp() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[-1.0, 0.0, 1.0], [-2.0, 2.0, 1.1]]); + let t = dev + .tensor([[-1.0, 0.0, 1.0], 
[-2.0, 2.0, 1.1]]) + .to_dtype::(); let r = t.leaky_trace().clamp(-1.0, 1.0); assert_close_to_literal!(r, [[-1.0, 0.0, 1.0], [-1.0, 1.0, 1.0]]); let g = r.exp().mean().backward(); diff --git a/src/tensor_ops/cmp/cmp.cu b/src/tensor_ops/cmp/cmp.cu index feff7e18f..bceeff44a 100644 --- a/src/tensor_ops/cmp/cmp.cu +++ b/src/tensor_ops/cmp/cmp.cu @@ -41,12 +41,20 @@ extern "C" __global__ void SCALAR_FWD( \ out[out_i] = lhs[lhs_i] SYMBOL scalar; \ } +CMP_OP(__half, eq_fwd_f16, scalar_eq_fwd_f16, ==) +CMP_OP(__half, ne_fwd_f16, scalar_ne_fwd_f16, !=) +CMP_OP(__half, gt_fwd_f16, scalar_gt_fwd_f16, >) +CMP_OP(__half, ge_fwd_f16, scalar_ge_fwd_f16, >=) +CMP_OP(__half, lt_fwd_f16, scalar_lt_fwd_f16, <) +CMP_OP(__half, le_fwd_f16, scalar_le_fwd_f16, <=) + CMP_OP(float, eq_fwd_f32, scalar_eq_fwd_f32, ==) CMP_OP(float, ne_fwd_f32, scalar_ne_fwd_f32, !=) CMP_OP(float, gt_fwd_f32, scalar_gt_fwd_f32, >) CMP_OP(float, ge_fwd_f32, scalar_ge_fwd_f32, >=) CMP_OP(float, lt_fwd_f32, scalar_lt_fwd_f32, <) CMP_OP(float, le_fwd_f32, scalar_le_fwd_f32, <=) + CMP_OP(double, eq_fwd_f64, scalar_eq_fwd_f64, ==) CMP_OP(double, ne_fwd_f64, scalar_ne_fwd_f64, !=) CMP_OP(double, gt_fwd_f64, scalar_gt_fwd_f64, >) diff --git a/src/tensor_ops/cmp/cuda_kernels.rs b/src/tensor_ops/cmp/cuda_kernels.rs index a9577be81..8b31b21c8 100644 --- a/src/tensor_ops/cmp/cuda_kernels.rs +++ b/src/tensor_ops/cmp/cuda_kernels.rs @@ -128,6 +128,19 @@ macro_rules! cmps { }; } +#[cfg(feature = "f16")] +cmps!(EqKernelOp, half::f16, "eq_fwd_f16", "scalar_eq_fwd_f16"); +#[cfg(feature = "f16")] +cmps!(NeKernelOp, half::f16, "ne_fwd_f16", "scalar_ne_fwd_f16"); +#[cfg(feature = "f16")] +cmps!(GtKernelOp, half::f16, "gt_fwd_f16", "scalar_gt_fwd_f16"); +#[cfg(feature = "f16")] +cmps!(GeKernelOp, half::f16, "ge_fwd_f16", "scalar_ge_fwd_f16"); +#[cfg(feature = "f16")] +cmps!(LtKernelOp, half::f16, "lt_fwd_f16", "scalar_lt_fwd_f16"); +#[cfg(feature = "f16")] +cmps!(LeKernelOp, half::f16, "le_fwd_f16", "scalar_le_fwd_f16"); + cmps!(EqKernelOp, f32, "eq_fwd_f32", "scalar_eq_fwd_f32"); cmps!(NeKernelOp, f32, "ne_fwd_f32", "scalar_ne_fwd_f32"); cmps!(GtKernelOp, f32, "gt_fwd_f32", "scalar_gt_fwd_f32"); diff --git a/src/tensor_ops/cmp/mod.rs b/src/tensor_ops/cmp/mod.rs index 1ebbec88b..e9f1bbc2f 100644 --- a/src/tensor_ops/cmp/mod.rs +++ b/src/tensor_ops/cmp/mod.rs @@ -228,6 +228,17 @@ macro_rules! 
impl_cmp_kernel_op { } } + #[cfg(feature = "f16")] + impl, T: Tape> + $TraitName for Tensor + { + type Output = Tensor; + #[doc = $doc] + fn $TryFnName(&self, other: f32) -> Result { + try_scalar_cmp_op(self, half::f16::from_f32(other)) + } + } + impl, T: Tape> Tensor { @@ -307,212 +318,192 @@ impl_cmp_kernel_op!( #[cfg(test)] mod tests { use super::*; - use crate::{shapes::*, tensor::*, tests::*}; - - type TestTensor = - Tensor<(Const, Const), E, TestDevice>; - - fn test_cmp( - a: [[E; C]; R], - b: [[E; C]; R], - cmp: F, - expected: [[bool; C]; R], - ) where - F: Fn(&TestTensor, &TestTensor) -> [[bool; C]; R], - { - let dev: TestDevice = Default::default(); - let a = dev.tensor(a); - let b = dev.tensor(b); - let r = cmp(&a, &b); - assert_eq!(r, expected); - } - - fn test_scalar_cmp( - a: [[E; C]; R], - cmp: F, - expected: [[bool; C]; R], - ) where - F: Fn(&TestTensor) -> [[bool; C]; R], - { - let dev: TestDevice = Default::default(); - let a = dev.tensor(a); - assert_eq!(cmp(&a), expected); - } + use crate::{tensor::*, tests::*}; #[test] fn test_eq() { let dev: TestDevice = Default::default(); let a = dev - .tensor([[1.0, 2.0, 0.0], [4.0, 5.0, 0.0]]) + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, -3.0], [4.0, 0.5, -0.0]]) .to_dtype::(); + let r = a.eq(&b); + assert_eq!(r.array(), [[false, true, false], [true, false, true]]); + #[cfg(not(feature = "cuda"))] { - let b = dev - .tensor([[0.0, 2.0, -3.0], [4.0, 0.5, -0.0]]) - .to_dtype::(); + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); let r = a.eq(&b); - assert_eq!(r.array(), [[false, true, false], [true, false, true]]); - } - - { - let r = a.eq(0.0); - assert_eq!(r.array(), [[false, false, true], [false, false, true]]); + assert_eq!(r.array(), [[false, true, false], [false, true, false]]); } } #[test] - fn test_ne() { - test_cmp::( - [[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]], - [[0.0, 2.0, -3.0], [4.0, 0.5, -0.0]], - |a, b| a.ne(b).array(), - [[true, false, true], [false, true, false]], - ); + fn test_scalar_eq() { + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.eq(1.2); + assert_eq!(r.array(), [[false, true], [false, false]]); } - // TODO Remove this attribute once Cuda supports integers - #[cfg(not(feature = "cuda"))] #[test] - fn test_ne_not_dtype() { - test_cmp( - [[1, 2, 3], [0, 123, 5]], - [[0, 2, -3], [-4, 123, 6]], - |a, b| a.ne(b).array(), - [[true, false, true], [true, false, true]], - ); + fn test_ne() { + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, -3.0], [4.0, 0.5, -0.0]]) + .to_dtype::(); + let r = a.ne(&b); + assert_eq!(r.array(), [[true, false, true], [false, true, false]]); + + #[cfg(not(feature = "cuda"))] + { + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); + let r = a.ne(&b); + assert_eq!(r.array(), [[true, false, true], [true, false, true]]); + } } #[test] fn test_scalar_ne() { - test_scalar_cmp::( - [[0.0, 1.2], [3.4, -5.6]], - |a| a.ne(1.2).array(), - [[true, false], [true, true]], - ); + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.ne(1.2); + assert_eq!(r.array(), [[true, false], [true, true]]); } #[test] fn test_gt() { - test_cmp::( - [[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]], - [[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]], - 
|a, b| a.gt(b).array(), - [[true, false, false], [true, true, false]], - ); - } + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]]) + .to_dtype::(); + let r = a.gt(&b); + assert_eq!(r.array(), [[true, false, false], [true, true, false]]); - // TODO Remove this attribute once Cuda supports integers - #[cfg(not(feature = "cuda"))] - #[test] - fn test_gt_not_dtype() { - test_cmp( - [[1, 2, 3], [0, 123, 5]], - [[0, 2, -3], [-4, 123, 6]], - |a, b| a.gt(b).array(), - [[true, false, true], [true, false, false]], - ); + #[cfg(not(feature = "cuda"))] + { + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); + let r = a.gt(&b); + assert_eq!(r.array(), [[true, false, true], [true, false, false]]); + } } #[test] fn test_scalar_gt() { - test_scalar_cmp::( - [[0.0, 1.2], [3.4, -5.6]], - |a| a.gt(1.2).array(), - [[false, false], [true, false]], - ); + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.gt(1.2); + assert_eq!(r.array(), [[false, false], [true, false]]); } #[test] fn test_ge() { - test_cmp::( - [[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]], - [[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]], - |a, b| a.ge(b).array(), - [[true, true, false], [true, true, true]], - ); - } + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]]) + .to_dtype::(); + let r = a.ge(&b); + assert_eq!(r.array(), [[true, true, false], [true, true, true]]); - // TODO Remove this attribute once Cuda supports integers - #[cfg(not(feature = "cuda"))] - #[test] - fn test_ge_not_dtype() { - test_cmp( - [[1, 2, 3], [0, 123, 5]], - [[0, 2, -3], [-4, 123, 6]], - |a, b| a.ge(b).array(), - [[true, true, true], [true, true, false]], - ); + #[cfg(not(feature = "cuda"))] + { + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); + let r = a.ge(&b); + assert_eq!(r.array(), [[true, true, true], [true, true, false]]); + } } #[test] fn test_scalar_ge() { - test_scalar_cmp::( - [[0.0, 1.2], [3.4, -5.6]], - |a| a.ge(1.2).array(), - [[false, true], [true, false]], - ); + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.ge(1.2); + assert_eq!(r.array(), [[false, true], [true, false]]); } #[test] fn test_lt() { - test_cmp::( - [[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]], - [[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]], - |a, b| a.lt(b).array(), - [[false, false, true], [false, false, false]], - ); - } + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]]) + .to_dtype::(); + let r = a.lt(&b); + assert_eq!(r.array(), [[false, false, true], [false, false, false]]); - // TODO Remove this attribute once Cuda supports integers - #[cfg(not(feature = "cuda"))] - #[test] - fn test_lt_not_dtype() { - test_cmp( - [[1, 2, 3], [0, 123, 5]], - [[0, 2, -3], [-4, 123, 6]], - |a, b| a.lt(b).array(), - [[false, false, false], [false, false, true]], - ); + #[cfg(not(feature = "cuda"))] + { + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); + let r = a.lt(&b); + assert_eq!(r.array(), [[false, false, false], [false, false, true]]); + } } #[test] fn 
test_scalar_lt() { - test_scalar_cmp::( - [[0.0, 1.2], [3.4, -5.6]], - |a| a.lt(1.2).array(), - [[true, false], [false, true]], - ); + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.lt(1.2); + assert_eq!(r.array(), [[true, false], [false, true]]); } #[test] fn test_le() { - test_cmp::( - [[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]], - [[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]], - |a, b| a.le(b).array(), - [[false, true, true], [false, false, true]], - ); - } + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 0.0]]) + .to_dtype::(); + let b = dev + .tensor([[0.0, 2.0, 3.1], [-4.0, -5.5, -0.0]]) + .to_dtype::(); + let r = a.le(&b); + assert_eq!(r.array(), [[false, true, true], [false, false, true]]); - // TODO Remove this attribute once Cuda supports integers - #[cfg(not(feature = "cuda"))] - #[test] - fn test_le_not_dtype() { - test_cmp( - [[1, 2, 3], [0, 123, 5]], - [[0, 2, -3], [-4, 123, 6]], - |a, b| a.le(b).array(), - [[false, true, false], [false, true, true]], - ); + #[cfg(not(feature = "cuda"))] + { + let a = dev.tensor([[1, 2, 3], [0, 123, 5]]); + let b = dev.tensor([[0, 2, -3], [-4, 123, 6]]); + let r = a.le(&b); + assert_eq!(r.array(), [[false, true, false], [false, true, true]]); + } } #[test] fn test_scalar_le() { - test_scalar_cmp::( - [[0.0, 1.2], [3.4, -5.6]], - |a| a.le(1.2).array(), - [[true, true], [false, true]], - ); + let dev: TestDevice = Default::default(); + let a = dev + .tensor([[0.0, 1.2], [3.4, -5.6]]) + .to_dtype::(); + let r = a.le(1.2); + assert_eq!(r.array(), [[true, true], [false, true]]); } #[test] diff --git a/src/tensor_ops/concat/cuda_kernel.rs b/src/tensor_ops/concat/cuda_kernel.rs index 2dea40dcc..3a278df6c 100644 --- a/src/tensor_ops/concat/cuda_kernel.rs +++ b/src/tensor_ops/concat/cuda_kernel.rs @@ -39,6 +39,7 @@ impl super::ConcatKernel for Cuda { let src = BWD_KERNEL.replace("$Ty", E::NAME); let opts = CompileOptions { arch: Some(env!("CUDA_COMPUTE_CAP")), + include_paths: vec![env!("CUDA_INCLUDE_DIR").to_string()], ..Default::default() }; let ptx = compile_ptx_with_opts(src, opts).unwrap(); @@ -64,6 +65,7 @@ impl super::ConcatKernel for Cuda { } const BWD_KERNEL: &str = " +#include \"cuda_fp16.h\" extern \"C\" __global__ void concat_bwd(const size_t numel, const $Ty *inp, $Ty *out) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < numel) { out[i] += inp[i]; } diff --git a/src/tensor_ops/conv2d/conv2d.cu b/src/tensor_ops/conv2d/conv2d.cu index ed48dbc46..16e2ce9dc 100644 --- a/src/tensor_ops/conv2d/conv2d.cu +++ b/src/tensor_ops/conv2d/conv2d.cu @@ -1,3 +1,5 @@ +#include "cuda_fp16.h" + struct Conv2DOp { size_t stride; size_t padding; @@ -37,11 +39,13 @@ __device__ void unfold_input_into_patches( patches += c * (op.kernel * op.kernel * op.h_out * op.w_out); patches += b * (op.chan_in * op.kernel * op.kernel * op.h_out * op.w_out); + T zero = 0.0; + for (int k1 = 0;k1 < op.kernel;k1++) { const size_t y = oh * op.stride + k1 - op.padding; for (int k2 = 0;k2 < op.kernel;k2++) { const size_t x = ow * op.stride + k2 - op.padding; - *patches = (y >= op.h_in || x >= op.w_in) ? 0.0 : image[y * strides[2] + x * strides[3]]; + *patches = (y >= op.h_in || x >= op.w_in) ? 
zero : image[y * strides[2] + x * strides[3]]; patches += op.h_out * op.w_out; } } @@ -72,6 +76,8 @@ __device__ void unfold_output_into_patches( patches += o * (op.kernel * op.kernel * op.h_in * op.w_in); patches += b * (op.chan_out * op.kernel * op.kernel * op.h_in * op.w_in); + T zero = 0.0; + for (int k1 = 0;k1 < op.kernel;k1++) { const size_t oh_ks = y + op.padding; const size_t oh_s = oh_ks - k1; @@ -83,7 +89,7 @@ __device__ void unfold_output_into_patches( const size_t ow = ow_s / op.stride; const bool invalid = k1_invalid || (ow_ks < k2 || ow_s % op.stride != 0 || ow >= op.w_out); - *patches = invalid ? 0.0 : image_out[oh * op.w_out + ow]; + *patches = invalid ? zero : image_out[oh * op.w_out + ow]; patches += op.h_in * op.w_in; } } @@ -185,6 +191,13 @@ extern "C" __global__ void SUM_TR_FILTERS( \ sum_transposed_filters(op, filters_tr, filters, strides); \ } +CONV_OP( + __half, + unfold_input_into_patches_f16, + unfold_output_into_patches_f16, + transpose_filters_f16, + sum_transposed_filters_f16 +); CONV_OP( float, unfold_input_into_patches_f32, diff --git a/src/tensor_ops/conv2d/cuda_kernel.rs b/src/tensor_ops/conv2d/cuda_kernel.rs index d26ff0fef..d773b3e5e 100644 --- a/src/tensor_ops/conv2d/cuda_kernel.rs +++ b/src/tensor_ops/conv2d/cuda_kernel.rs @@ -17,6 +17,17 @@ trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "conv2d_f16"; + const FNS: &'static [&'static str] = &[ + "unfold_input_into_patches_f16", + "unfold_output_into_patches_f16", + "transpose_filters_f16", + "sum_transposed_filters_f16", + ]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "conv2d_f32"; const FNS: &'static [&'static str] = &[ diff --git a/src/tensor_ops/conv2d/cudnn_kernel.rs b/src/tensor_ops/conv2d/cudnn_kernel.rs index 4af2e8309..1f2792eca 100644 --- a/src/tensor_ops/conv2d/cudnn_kernel.rs +++ b/src/tensor_ops/conv2d/cudnn_kernel.rs @@ -9,6 +9,8 @@ use crate::{ use std::sync::Arc; trait HasCudnnKernel {} +#[cfg(feature = "f16")] +impl HasCudnnKernel for Cuda {} impl HasCudnnKernel for Cuda {} impl HasCudnnKernel for Cuda {} diff --git a/src/tensor_ops/conv2d/mod.rs b/src/tensor_ops/conv2d/mod.rs index dfcb0b434..593eccb52 100644 --- a/src/tensor_ops/conv2d/mod.rs +++ b/src/tensor_ops/conv2d/mod.rs @@ -239,15 +239,21 @@ mod tests { /// ``` fn test_conv2d_default_stride_and_padding() { let dev: TestDevice = Default::default(); - let weight: Tensor<_, TestDtype, _> = dev.tensor([ - [[[-0.04958433, -0.43007267], [0.01935136, 0.09778714]]], - [[[0.44083858, -0.20507240], [-0.30017477, -0.10937047]]], - ]); - let bias: Tensor<_, TestDtype, _> = dev.tensor([0.36406237, -0.30981010]); - let x: Tensor<_, TestDtype, _> = dev.tensor([[ - [-0.86713916, 0.52773184, -0.95238322], - [-0.64531374, 0.77809018, -0.49099201], - ]]); + let weight = dev + .tensor([ + [[[-0.04958433, -0.43007267], [0.01935136, 0.09778714]]], + [[[0.44083858, -0.20507240], [-0.30017477, -0.10937047]]], + ]) + .to_dtype::(); + let bias = dev + .tensor([0.36406237, -0.30981010]) + .to_dtype::(); + let x = dev + .tensor([[ + [-0.86713916, 0.52773184, -0.95238322], + [-0.64531374, 0.77809018, -0.49099201], + ]]) + .to_dtype::(); let result = x.leaky_trace().conv2d::<1, 0>(weight.clone()) + bias.leaky_trace().broadcast::<_, Axes2<1, 2>>(); assert_close_to_literal!( @@ -281,15 +287,21 @@ mod tests { /// ``` fn test_conv2d_stride_2() { let dev: TestDevice = Default::default(); - let weight: Tensor<_, TestDtype, _> = dev.tensor([ - 
[[[0.44704646, -0.29563826], [0.29228759, -0.16575140]]], - [[[-0.30488998, 0.25222939], [0.13279295, 0.38153177]]], - ]); - let bias: Tensor<_, TestDtype, _> = dev.tensor([-0.44699109, 0.38371694]); - let x: Tensor<_, TestDtype, _> = dev.tensor([[ - [0.37100124, -0.59504986, -1.19781005], - [-0.31547278, 0.58071911, 0.86612970], - ]]); + let weight = dev + .tensor([ + [[[0.44704646, -0.29563826], [0.29228759, -0.16575140]]], + [[[-0.30488998, 0.25222939], [0.13279295, 0.38153177]]], + ]) + .to_dtype::(); + let bias = dev + .tensor([-0.44699109, 0.38371694]) + .to_dtype::(); + let x = dev + .tensor([[ + [0.37100124, -0.59504986, -1.19781005], + [-0.31547278, 0.58071911, 0.86612970], + ]]) + .to_dtype::(); let result = x.leaky_trace().conv2d::<2, 0>(weight.clone()) + bias.leaky_trace().broadcast::<_, Axes2<1, 2>>(); @@ -317,14 +329,17 @@ mod tests { fn test_conv2d_padding_1() { let dev: TestDevice = Default::default(); #[rustfmt::skip] - let weight: Tensor<_, TestDtype, _> = dev.tensor([ + let weight = dev.tensor([ [[[0.10215953, 0.06263646], [-0.04124039, -0.09729567]], [[-0.32656857, 0.24254093], [-0.27209827, 0.15361503]]], [[[0.03449896, 0.22931078], [-0.17652659, 0.08222872]],[[-0.06016779, 0.29082409], [-0.19154115, 0.13483226]]], [[[-0.14262493, 0.19654515], [0.15921101, 0.01759464]],[[0.16749159, 0.33096817], [0.28376505, -0.05524009]]], - ]); - let bias: Tensor<_, TestDtype, _> = dev.tensor([-0.22854491, 0.28763595, 0.20709404]); - let x: Tensor<_, TestDtype, _> = - dev.tensor([[[-0.32224107, -0.32800716]], [[-1.13570976, 0.93713200]]]); + ]).to_dtype::(); + let bias = dev + .tensor([-0.22854491, 0.28763595, 0.20709404]) + .to_dtype::(); + let x = dev + .tensor([[[-0.32224107, -0.32800716]], [[-1.13570976, 0.93713200]]]) + .to_dtype::(); let result = x.leaky_trace().conv2d::<1, 1>(weight.clone()) + bias.leaky_trace().broadcast::<_, Axes2<1, 2>>(); @@ -363,13 +378,15 @@ mod tests { fn test_conv2d_stride_3_padding_4() { let dev: TestDevice = Default::default(); #[rustfmt::skip] - let weight: Tensor<_, TestDtype, _> = dev.tensor([ + let weight = dev.tensor([ [[[-0.10252278, -0.14387409, -0.14627469],[0.28396228, -0.14590892, 0.29269591],[0.01090384, 0.14785287, 0.29242596]]], [[[-0.31163597, 0.13224581, -0.20954299],[0.27902845, -0.14735751, 0.14001134],[-0.05224654, 0.16499066, -0.13981307]]], - ]); - let bias: Tensor<_, TestDtype, _> = dev.tensor([-0.07123789, -0.17244765]); + ]).to_dtype::(); + let bias = dev + .tensor([-0.07123789, -0.17244765]) + .to_dtype::(); #[rustfmt::skip] - let x: Tensor<_, TestDtype, _> = dev.tensor([[[0.69103152, 0.25624934],[-0.38448590, 0.03110456],[0.83753252, 0.53786588],[1.15540242, -0.54148245]]]); + let x = dev.tensor([[[0.69103152, 0.25624934],[-0.38448590, 0.03110456],[0.83753252, 0.53786588],[1.15540242, -0.54148245]]]).to_dtype::(); let result = x.leaky_trace().conv2d::<3, 4>(weight.clone()) + bias.leaky_trace().broadcast::<_, Axes2<1, 2>>(); diff --git a/src/tensor_ops/convtrans2d/convtrans2d.cu b/src/tensor_ops/convtrans2d/convtrans2d.cu index 0004a90b8..d6e842704 100644 --- a/src/tensor_ops/convtrans2d/convtrans2d.cu +++ b/src/tensor_ops/convtrans2d/convtrans2d.cu @@ -1,3 +1,5 @@ +#include "cuda_fp16.h" + struct Conv2DOp { size_t stride; size_t padding; @@ -38,6 +40,8 @@ __device__ void unfold_input_into_patches( patches += c * (op.kernel * op.kernel * op.h_out * op.w_out); patches += b * (op.chan_in * op.kernel * op.kernel * op.h_out * op.w_out); + T zero = 0.0; + for (int k1 = 0;k1 < op.kernel;k1++) { const size_t y_ks = oh + op.padding; 
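// (Same fix as in conv2d.cu above: the literal 0.0 is a double, so a ternary
// like `invalid ? 0.0 : image[...]` either fails to find a common type for
// the __half instantiation or silently promotes through double. Hoisting the
// zero into a typed local converts it once per thread and keeps the ternary
// dtype-clean:
//
//     T zero = 0.0;
//     *patches = invalid ? zero : image[y * strides[2] + x * strides[3]];
// )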
const size_t y_s = y_ks - k1; @@ -49,7 +53,7 @@ __device__ void unfold_input_into_patches( const size_t x = x_s / op.stride; const bool invalid = k1_invalid || (x_ks < k2 || x_s % op.stride != 0 || x >= op.w_in); - *patches = invalid ? 0.0 : image[y * strides[2] + x * strides[3]]; + *patches = invalid ? zero : image[y * strides[2] + x * strides[3]]; patches += op.h_out * op.w_out; } } @@ -80,11 +84,13 @@ __device__ void unfold_output_into_patches( patches += o * (op.kernel * op.kernel * op.h_in * op.w_in); patches += b * (op.chan_out * op.kernel * op.kernel * op.h_in * op.w_in); + T zero = 0.0; + for (int k1 = 0;k1 < op.kernel;k1++) { const size_t oh = y * op.stride + k1 - op.padding; for (int k2 = 0;k2 < op.kernel;k2++) { const size_t ow = x * op.stride + k2 - op.padding; - *patches = (oh >= op.h_out || ow >= op.w_out) ? 0.0 : image_out[oh * op.w_out + ow]; + *patches = (oh >= op.h_out || ow >= op.w_out) ? zero : image_out[oh * op.w_out + ow]; patches += op.h_in * op.w_in; } } @@ -186,6 +192,13 @@ extern "C" __global__ void SUM_TR_FILTERS( \ sum_transposed_filters(op, filters_tr, filters, strides); \ } +CONV_OP( + __half, + unfold_input_into_patches_f16, + unfold_output_into_patches_f16, + transpose_filters_f16, + sum_transposed_filters_f16 +); CONV_OP( float, unfold_input_into_patches_f32, diff --git a/src/tensor_ops/convtrans2d/cuda_kernel.rs b/src/tensor_ops/convtrans2d/cuda_kernel.rs index 21d88e9d8..9e30d4afc 100644 --- a/src/tensor_ops/convtrans2d/cuda_kernel.rs +++ b/src/tensor_ops/convtrans2d/cuda_kernel.rs @@ -17,6 +17,17 @@ trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "convtrans2d_f16"; + const FNS: &'static [&'static str] = &[ + "unfold_input_into_patches_f16", + "unfold_output_into_patches_f16", + "transpose_filters_f16", + "sum_transposed_filters_f16", + ]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "convtrans2d_f32"; const FNS: &'static [&'static str] = &[ diff --git a/src/tensor_ops/cos/cos.cu b/src/tensor_ops/cos/cos.cu index 53d891539..25f427391 100644 --- a/src/tensor_ops/cos/cos.cu +++ b/src/tensor_ops/cos/cos.cu @@ -2,11 +2,14 @@ struct CosKernelOp {}; +UNARY_OP(__half, cos_fwd_f16, cos_bwd_f16, CosKernelOp, + cosg(x), + -sing(x)) + UNARY_OP(float, cos_fwd_f32, cos_bwd_f32, CosKernelOp, - cosf(x), - -sinf(x)) + cosg(x), + -sing(x)) UNARY_OP(double, cos_fwd_f64, cos_bwd_f64, CosKernelOp, - cos(x), - -sin(x)) - \ No newline at end of file + cosg(x), + -sing(x)) diff --git a/src/tensor_ops/cos/cuda_kernel.rs b/src/tensor_ops/cos/cuda_kernel.rs index 2c6bd6874..904ba10ef 100644 --- a/src/tensor_ops/cos/cuda_kernel.rs +++ b/src/tensor_ops/cos/cuda_kernel.rs @@ -4,5 +4,13 @@ unsafe impl cudarc::driver::DeviceRepr for super::CosKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/cos.ptx")); +#[cfg(feature = "f16")] +cuda_unary!( + super::CosKernelOp, + half::f16, + PTX, + "cos_fwd_f16", + "cos_bwd_f16" +); cuda_unary!(super::CosKernelOp, f32, PTX, "cos_fwd_f32", "cos_bwd_f32"); cuda_unary!(super::CosKernelOp, f64, PTX, "cos_fwd_f64", "cos_bwd_f64"); diff --git a/src/tensor_ops/cos/mod.rs b/src/tensor_ops/cos/mod.rs index b9d13d90d..1a6ea4a53 100644 --- a/src/tensor_ops/cos/mod.rs +++ b/src/tensor_ops/cos/mod.rs @@ -45,7 +45,9 @@ mod tests { #[test] fn test_cos() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + 
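// (cos.cu above shows the other half of the intrinsic cleanup in this PR:
// the f32/f64 kernels drop `cosf`/`cos` in favor of the generic `cosg`/`sing`
// wrappers shared with the new __half instantiation. Presumably the shared
// CUDA header overloads these per type, along the lines of this sketch
// (names taken from the diff, bodies illustrative):
//
//     __device__ __forceinline__ float  cosg(float x)  { return cosf(x); }
//     __device__ __forceinline__ double cosg(double x) { return cos(x); }
//     __device__ __forceinline__ __half cosg(__half x) { return hcos(x); }
// )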
.to_dtype::(); let r = x.leaky_trace().cos(); assert_close_to_literal!(r, [-0.41614684, 0.5403023, 1.0, 0.5403023, -0.41614684]); let g = r.mean().backward(); diff --git a/src/tensor_ops/div/binary_div.cu b/src/tensor_ops/div/binary_div.cu index c287ddc60..3945dba51 100644 --- a/src/tensor_ops/div/binary_div.cu +++ b/src/tensor_ops/div/binary_div.cu @@ -2,13 +2,18 @@ struct BinaryDivOp {}; +BINARY_OP(__half, bdiv_fwd_f16, bdiv_bwd_lhs_f16, bdiv_bwd_rhs_f16, BinaryDivOp, + x / y, + recipg(y), + -x / (y * y)) + BINARY_OP(float, bdiv_fwd_f32, bdiv_bwd_lhs_f32, bdiv_bwd_rhs_f32, BinaryDivOp, x / y, - 1.0 / y, + recipg(y), -x / (y * y)) BINARY_OP(double, bdiv_fwd_f64, bdiv_bwd_lhs_f64, bdiv_bwd_rhs_f64, BinaryDivOp, x / y, - 1.0 / y, + recipg(y), -x / (y * y)) diff --git a/src/tensor_ops/div/cuda_kernel.rs b/src/tensor_ops/div/cuda_kernel.rs index 5653b6942..48407719c 100644 --- a/src/tensor_ops/div/cuda_kernel.rs +++ b/src/tensor_ops/div/cuda_kernel.rs @@ -1,6 +1,8 @@ use super::{BinaryDivKernelOp as Binary, ScalarDivKernelOp as Scalar}; use crate::tensor_ops::cuda_kernels::{cuda_binary, cuda_unary}; +#[cfg(feature = "f16")] +unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Binary {} @@ -8,8 +10,19 @@ unsafe impl cudarc::driver::DeviceRepr for Binary {} const SCALAR_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/scalar_div.ptx")); const BINARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/binary_div.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(const_df() Scalar, half::f16, SCALAR_PTX, "sdiv_fwd_f16", "sdiv_bwd_f16"); cuda_unary!(const_df() Scalar, f32, SCALAR_PTX, "sdiv_fwd_f32", "sdiv_bwd_f32"); cuda_unary!(const_df() Scalar, f64, SCALAR_PTX, "sdiv_fwd_f64", "sdiv_bwd_f64"); +#[cfg(feature = "f16")] +cuda_binary!( + Binary, + half::f16, + BINARY_PTX, + "bdiv_fwd_f16", + "bdiv_bwd_lhs_f16", + "bdiv_bwd_rhs_f16" +); cuda_binary!( Binary, f32, diff --git a/src/tensor_ops/div/mod.rs b/src/tensor_ops/div/mod.rs index f7abb2ead..53baa75dd 100644 --- a/src/tensor_ops/div/mod.rs +++ b/src/tensor_ops/div/mod.rs @@ -72,6 +72,17 @@ impl, E>, T: Tape> } } +#[cfg(feature = "f16")] +impl, half::f16>, T: Tape> + TryDiv for Tensor +{ + /// See [div] + fn try_div(self, rhs: f32) -> Result { + let scalar = half::f16::from_f32(rhs); + try_unary_op(ScalarDivKernelOp { scalar }, self) + } +} + impl, Rhs> std::ops::Div for Tensor where @@ -94,8 +105,8 @@ mod tests { fn test_div_0d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor(2.0); - let b: Tensor<_, TestDtype, _> = dev.tensor(4.0); + let a = dev.tensor(2.0).to_dtype::(); + let b = dev.tensor(4.0).to_dtype::(); let r = b.leaky_trace() / a.clone(); assert_close_to_literal!(r, 2.0); @@ -107,8 +118,8 @@ mod tests { #[test] fn test_div_1d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); - let b: Tensor<_, TestDtype, _> = dev.tensor([1.0, -1.0, 0.0]); + let a = dev.tensor([1.0, 2.0, 3.0]).to_dtype::(); + let b = dev.tensor([1.0, -1.0, 0.0]).to_dtype::(); let r = b.leaky_trace() / a.clone(); assert_close_to_literal!(r, [1.0, -0.5, 0.0]); @@ -120,10 +131,12 @@ mod tests { #[test] fn test_div_2d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = - dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]); - let b: Tensor<_, TestDtype, _> = - dev.tensor([[0.5199, 
0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]); + let a = dev + .tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]) + .to_dtype::(); + let b = dev + .tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]) + .to_dtype::(); let r = b.leaky_trace() / a.clone(); assert_close_to_literal!( @@ -137,15 +150,15 @@ mod tests { assert_close_to_literal!( g.get(&a), [ - [-0.20074181, -2.1961217, -2.7844446], - [-0.42998204, -0.12488105, -0.009292662], + [-0.20074183, -2.19612169, -2.78444433], + [-0.42998207, -0.12488105, -0.00929266] ] ); assert_close_to_literal!( g.get(&b), &[ - [0.25367835, 0.97580016, 1.1111112], - [0.29456818, 0.2377556, 0.1997922], + [0.25367835, 0.97580016, 1.11111104], + [0.29456815, 0.23775560, 0.19979222] ] ); } @@ -153,7 +166,7 @@ mod tests { #[test] fn test_scalar_div_0d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor(1.0); + let x = dev.tensor(1.0).to_dtype::(); let r = x.leaky_trace() / 2.0; assert_close_to_literal!(r, 0.5); let g = r.exp().backward(); @@ -163,7 +176,7 @@ mod tests { #[test] fn test_scalar_div_1d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([0.0, 1.0, 2.0]); + let x = dev.tensor([0.0, 1.0, 2.0]).to_dtype::(); let r = x.leaky_trace() / 2.0; assert_close_to_literal!(r, [0.0, 0.5, 1.0]); let g = r.exp().sum().backward(); @@ -173,7 +186,7 @@ mod tests { #[test] fn test_scalar_div_2d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([[1.0; 2]; 3]); + let x = dev.tensor([[1.0; 2]; 3]).to_dtype::(); let r = x.leaky_trace() / 2.0; assert_close_to_literal!(r, [[0.5; 2]; 3]); let g = r.exp().sum().backward(); diff --git a/src/tensor_ops/div/scalar_div.cu b/src/tensor_ops/div/scalar_div.cu index 0c4a6dca3..ff8eebfd2 100644 --- a/src/tensor_ops/div/scalar_div.cu +++ b/src/tensor_ops/div/scalar_div.cu @@ -5,10 +5,14 @@ struct ScalarDivKernelOp { T scalar; }; +UNARY_OP(__half, sdiv_fwd_f16, sdiv_bwd_f16, ScalarDivKernelOp<__half>, + x / op.scalar, + recipg(op.scalar)); + UNARY_OP(float, sdiv_fwd_f32, sdiv_bwd_f32, ScalarDivKernelOp, x / op.scalar, - 1.0 / op.scalar); + recipg(op.scalar)); UNARY_OP(double, sdiv_fwd_f64, sdiv_bwd_f64, ScalarDivKernelOp, x / op.scalar, - 1.0 / op.scalar); + recipg(op.scalar)); diff --git a/src/tensor_ops/dropout/cuda_kernel.rs b/src/tensor_ops/dropout/cuda_kernel.rs index 8e0fd0d9b..0b10bf12b 100644 --- a/src/tensor_ops/dropout/cuda_kernel.rs +++ b/src/tensor_ops/dropout/cuda_kernel.rs @@ -17,6 +17,12 @@ trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "dropout_f16"; + const FNS: &'static [&'static str] = &["dropout_fwd_f16", "dropout_bwd_f16"]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "dropout_f32"; const FNS: &'static [&'static str] = &["dropout_fwd_f32", "dropout_bwd_f32"]; diff --git a/src/tensor_ops/dropout/dropout.cu b/src/tensor_ops/dropout/dropout.cu index 638c28b97..a52617f44 100644 --- a/src/tensor_ops/dropout/dropout.cu +++ b/src/tensor_ops/dropout/dropout.cu @@ -1,3 +1,5 @@ +#include "cuda_fp16.h" + #define DROPOUT(TYPENAME, FWD, BWD) \ extern "C" __global__ void FWD( \ const TYPENAME prob, \ @@ -10,7 +12,9 @@ extern "C" __global__ void FWD( \ if (i >= numel) { \ return; \ } \ - auto scalar = (noise[i] < prob) ? 0.0 : (1.0 / (1.0 - prob)); \ + TYPENAME zero = 0.0; \ + TYPENAME one = 1.0; \ + TYPENAME scalar = (noise[i] < prob) ? 
diff --git a/src/tensor_ops/dropout/cuda_kernel.rs b/src/tensor_ops/dropout/cuda_kernel.rs
index 8e0fd0d9b..0b10bf12b 100644
--- a/src/tensor_ops/dropout/cuda_kernel.rs
+++ b/src/tensor_ops/dropout/cuda_kernel.rs
@@ -17,6 +17,12 @@ trait HasCudaKernel<E> {
     const FNS: &'static [&'static str];
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const MOD: &'static str = "dropout_f16";
+    const FNS: &'static [&'static str] = &["dropout_fwd_f16", "dropout_bwd_f16"];
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const MOD: &'static str = "dropout_f32";
     const FNS: &'static [&'static str] = &["dropout_fwd_f32", "dropout_bwd_f32"];
 }
diff --git a/src/tensor_ops/dropout/dropout.cu b/src/tensor_ops/dropout/dropout.cu
index 638c28b97..a52617f44 100644
--- a/src/tensor_ops/dropout/dropout.cu
+++ b/src/tensor_ops/dropout/dropout.cu
@@ -1,3 +1,5 @@
+#include "cuda_fp16.h"
+
 #define DROPOUT(TYPENAME, FWD, BWD) \
 extern "C" __global__ void FWD( \
     const TYPENAME prob, \
@@ -10,7 +12,9 @@ extern "C" __global__ void FWD( \
     if (i >= numel) { \
         return; \
     } \
-    auto scalar = (noise[i] < prob) ? 0.0 : (1.0 / (1.0 - prob)); \
+    TYPENAME zero = 0.0; \
+    TYPENAME one = 1.0; \
+    TYPENAME scalar = (noise[i] < prob) ? zero : (one / (one - prob)); \
     out[i] = inp[i] * scalar; \
 } \
 extern "C" __global__ void BWD( \
@@ -24,8 +28,11 @@ extern "C" __global__ void BWD( \
     if (i >= numel) { \
         return; \
     } \
-    grad_inp[i] += (noise[i] < prob) ? 0.0 : (grad_out[i] / (1.0 - prob)); \
+    TYPENAME zero = 0.0; \
+    TYPENAME one = 1.0; \
+    grad_inp[i] += (noise[i] < prob) ? zero : (grad_out[i] / (one - prob)); \
 }
 
+DROPOUT(__half, dropout_fwd_f16, dropout_bwd_f16);
 DROPOUT(float, dropout_fwd_f32, dropout_bwd_f32);
 DROPOUT(double, dropout_fwd_f64, dropout_bwd_f64);
diff --git a/src/tensor_ops/dropout/mod.rs b/src/tensor_ops/dropout/mod.rs
index 890ac3282..3a00406a2 100644
--- a/src/tensor_ops/dropout/mod.rs
+++ b/src/tensor_ops/dropout/mod.rs
@@ -51,20 +51,20 @@ pub trait DropoutKernel<E: Dtype>: DeviceStorage {
 /// random numbers, so the masking is the same for both.
 pub fn dropout<S: Shape, E: Dtype, D: DropoutKernel<E>, T: Tape<E, D>>(
     t: Tensor<S, E, D, T>,
-    prob: impl Into<E>,
+    prob: impl Into<f64>,
 ) -> Tensor<S, E, D, T> {
     t.dropout(prob)
 }
 
 impl<S: Shape, E: Dtype, D: DropoutKernel<E>, T: Tape<E, D>> Tensor<S, E, D, T> {
     /// See [dropout]
-    pub fn dropout(self, prob: impl Into<E>) -> Self {
+    pub fn dropout(self, prob: impl Into<f64>) -> Self {
         self.try_dropout(prob).unwrap()
     }
     /// See [dropout]
-    pub fn try_dropout(self, prob: impl Into<E>) -> Result<Self, D::Err> {
+    pub fn try_dropout(self, prob: impl Into<f64>) -> Result<Self, D::Err> {
         let seed = self.device.random_u64();
-        let prob = prob.into();
+        let prob = E::from_f64(prob.into()).unwrap();
         let op = DropoutKernelOp { seed, prob };
         let (inp, mut tape) = self.split_tape();
         let out = inp.device.forward(op, &inp)?;
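NOTE: the dropout change above is the recipe this PR applies to scalar hyperparameters generally: the public API takes impl Into<f64>, and the value is converted once into the tensor dtype E via num_traits. A rough sketch of just that conversion (names here are illustrative):

    use num_traits::FromPrimitive;

    /// Narrow an f64 hyperparameter (e.g. a dropout probability) into the
    /// tensor dtype E; the unwrap matches the PR's expectation that every
    /// supported dtype can represent ordinary probabilities.
    fn prob_to_dtype<E: FromPrimitive>(prob: impl Into<f64>) -> E {
        E::from_f64(prob.into()).unwrap()
    }

    fn main() {
        let p: half::f16 = prob_to_dtype(0.1);
        assert!((p.to_f64() - 0.1).abs() < 1e-3); // f16 keeps ~3 significant digits
    }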
"gelu_fwd_f16", "gelu_bwd_f16"); cuda_unary!(GeLUKernelOp, f32, PTX, "gelu_fwd_f32", "gelu_bwd_f32"); cuda_unary!(GeLUKernelOp, f64, PTX, "gelu_fwd_f64", "gelu_bwd_f64"); diff --git a/src/tensor_ops/gelu/gelu.cu b/src/tensor_ops/gelu/gelu.cu index 03843238b..bc3e220e9 100644 --- a/src/tensor_ops/gelu/gelu.cu +++ b/src/tensor_ops/gelu/gelu.cu @@ -1,5 +1,4 @@ #include "unary_op_macros.cuh" -#include "cuda_utils.cuh" #define _USE_MATH_DEFINES #include @@ -7,33 +6,44 @@ struct GeLUKernelOp {}; template __device__ T gelu_fwd(T x) { - constexpr T fastCoeff = 0.044715; + T fastCoeff = 0.044715; + T one = 1.0; + T half = 0.5; + T beta = M_2_SQRTPI * M_SQRT1_2; T x_sq = x * x; T x_cube = x_sq * x; T alpha = x + fastCoeff * x_cube; - return 0.5 * x * (1.0 + tanhg(M_2_SQRTPI * M_SQRT1_2 * alpha)); + return half * x * (one + tanhg(beta * alpha)); } template __device__ T gelu_bwd(T x) { - constexpr T kBeta = M_2_SQRTPI * M_SQRT2 * 0.5; - constexpr T fastCoeff = 0.044715; + T one = 1.0; + T three = 3.0; + T half = 0.5; + T fastCoeff = 0.044715; + T kBeta = M_2_SQRTPI * M_SQRT2 * 0.5; T x_sq = x * x; T x_cube = x_sq * x; T inner = kBeta * (x + fastCoeff * x_cube); T tanh_inner = tanhg(inner); - T left = 0.5 * x; - T right = 1.0 + tanh_inner; + T left = half * x; + T right = one + tanh_inner; - T left_derivative = 0.5 * right; + T left_derivative = half * right; - T tanh_derivative = 1.0 - tanh_inner * tanh_inner; - T inner_derivative = kBeta * (1.0 + 3.0 * fastCoeff * x_sq); + T tanh_derivative = one - tanh_inner * tanh_inner; + T inner_derivative = kBeta * (one + three * fastCoeff * x_sq); T right_derivative = left * tanh_derivative * inner_derivative; return left_derivative + right_derivative; } +UNARY_OP(__half, gelu_fwd_f16, gelu_bwd_f16, GeLUKernelOp, + gelu_fwd(x), + gelu_bwd(x) +) + UNARY_OP(float, gelu_fwd_f32, gelu_bwd_f32, GeLUKernelOp, gelu_fwd(x), gelu_bwd(x) diff --git a/src/tensor_ops/gelu/mod.rs b/src/tensor_ops/gelu/mod.rs index 98e208a44..0abd95e8b 100644 --- a/src/tensor_ops/gelu/mod.rs +++ b/src/tensor_ops/gelu/mod.rs @@ -43,7 +43,9 @@ mod tests { #[test] fn test_gelu() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = x.leaky_trace().gelu(); assert_close_to_literal!(r, [-0.04540229, -0.158808, 0.0, 0.841192, 1.9545977]); // NOTE: call .exp() to make sure we cover cases where .gelu() uses the result's gradient diff --git a/src/tensor_ops/huber_error/cuda_kernel.rs b/src/tensor_ops/huber_error/cuda_kernel.rs index 1b9dce952..6c936c2be 100644 --- a/src/tensor_ops/huber_error/cuda_kernel.rs +++ b/src/tensor_ops/huber_error/cuda_kernel.rs @@ -1,11 +1,22 @@ use super::HuberErrorKernelOp as HuberError; use crate::tensor_ops::cuda_kernels::cuda_binary; +#[cfg(feature = "f16")] +unsafe impl cudarc::driver::DeviceRepr for HuberError {} unsafe impl cudarc::driver::DeviceRepr for HuberError {} unsafe impl cudarc::driver::DeviceRepr for HuberError {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/huber_error.ptx")); +#[cfg(feature = "f16")] +cuda_binary!( + HuberError, + half::f16, + PTX, + "huber_fwd_f16", + "huber_bwd_lhs_f16", + "huber_bwd_rhs_f16" +); cuda_binary!( HuberError, f32, diff --git a/src/tensor_ops/huber_error/huber_error.cu b/src/tensor_ops/huber_error/huber_error.cu index 39c90b55a..124b56ae0 100644 --- a/src/tensor_ops/huber_error/huber_error.cu +++ b/src/tensor_ops/huber_error/huber_error.cu @@ -7,19 +7,21 @@ struct 
diff --git a/src/tensor_ops/gelu/mod.rs b/src/tensor_ops/gelu/mod.rs
index 98e208a44..0abd95e8b 100644
--- a/src/tensor_ops/gelu/mod.rs
+++ b/src/tensor_ops/gelu/mod.rs
@@ -43,7 +43,9 @@ mod tests {
     #[test]
     fn test_gelu() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let x = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = x.leaky_trace().gelu();
         assert_close_to_literal!(r, [-0.04540229, -0.158808, 0.0, 0.841192, 1.9545977]);
         // NOTE: call .exp() to make sure we cover cases where .gelu() uses the result's gradient
diff --git a/src/tensor_ops/huber_error/cuda_kernel.rs b/src/tensor_ops/huber_error/cuda_kernel.rs
index 1b9dce952..6c936c2be 100644
--- a/src/tensor_ops/huber_error/cuda_kernel.rs
+++ b/src/tensor_ops/huber_error/cuda_kernel.rs
@@ -1,11 +1,22 @@
 use super::HuberErrorKernelOp as HuberError;
 use crate::tensor_ops::cuda_kernels::cuda_binary;
 
+#[cfg(feature = "f16")]
+unsafe impl cudarc::driver::DeviceRepr for HuberError<half::f16> {}
 unsafe impl cudarc::driver::DeviceRepr for HuberError<f32> {}
 unsafe impl cudarc::driver::DeviceRepr for HuberError<f64> {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/huber_error.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_binary!(
+    HuberError<half::f16>,
+    half::f16,
+    PTX,
+    "huber_fwd_f16",
+    "huber_bwd_lhs_f16",
+    "huber_bwd_rhs_f16"
+);
 cuda_binary!(
     HuberError<f32>,
     f32,
diff --git a/src/tensor_ops/huber_error/huber_error.cu b/src/tensor_ops/huber_error/huber_error.cu
index 39c90b55a..124b56ae0 100644
--- a/src/tensor_ops/huber_error/huber_error.cu
+++ b/src/tensor_ops/huber_error/huber_error.cu
@@ -7,19 +7,21 @@ struct HuberErrorOp {
 
 template<typename T>
 __device__ T op_f(HuberErrorOp<T> op, T x, T y) {
-    auto a = x - y;
+    T a = x - y;
+    T half = 0.5;
     if (absg(a) < op.delta) {
-        return a * a * 0.5;
+        return a * a * half;
     } else {
-        return op.delta * (absg(a) - 0.5 * op.delta);
+        return op.delta * (absg(a) - half * op.delta);
     }
 }
 
 template<typename T>
 __device__ T op_dfdx(HuberErrorOp<T> op, T x, T y) {
-    auto a = x - y;
-    if (a == 0.0) {
-        return 0.0;
+    T a = x - y;
+    T zero = 0.0;
+    if (a == zero) {
+        return zero;
     } else if (absg(a) < op.delta) {
         return a;
     } else {
@@ -32,6 +34,12 @@ __device__ T op_dfdy(HuberErrorOp<T> op, T x, T y) {
     return -op_dfdx(op, x, y);
 }
 
+BINARY_OP(__half, huber_fwd_f16, huber_bwd_lhs_f16, huber_bwd_rhs_f16, HuberErrorOp<__half>,
+        op_f(op, x, y),
+        op_dfdx(op, x, y),
+        op_dfdy(op, x, y)
+)
+
 BINARY_OP(float, huber_fwd_f32, huber_bwd_lhs_f32, huber_bwd_rhs_f32, HuberErrorOp<float>,
         op_f(op, x, y),
         op_dfdx(op, x, y),
diff --git a/src/tensor_ops/huber_error/mod.rs b/src/tensor_ops/huber_error/mod.rs
index c45a60f69..335c28e29 100644
--- a/src/tensor_ops/huber_error/mod.rs
+++ b/src/tensor_ops/huber_error/mod.rs
@@ -31,14 +31,14 @@ pub struct HuberErrorKernelOp<E> {
 pub fn huber_error<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D> + Merge<R>, R: Tape<E, D>>(
     lhs: Tensor<S, E, D, T>,
     rhs: Tensor<S, E, D, R>,
-    delta: impl Into<E>,
+    delta: impl Into<f64>,
 ) -> Tensor<S, E, D, T> {
     lhs.huber_error(rhs, delta)
 }
 
 impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Tensor<S, E, D, T> {
     /// See [huber_error]
-    pub fn huber_error<R: Tape<E, D>>(self, rhs: Tensor<S, E, D, R>, delta: impl Into<E>) -> Self
+    pub fn huber_error<R: Tape<E, D>>(self, rhs: Tensor<S, E, D, R>, delta: impl Into<f64>) -> Self
     where
         T: Merge<R>,
     {
@@ -49,12 +49,12 @@
     pub fn try_huber_error<R: Tape<E, D>>(
         self,
         rhs: Tensor<S, E, D, R>,
-        delta: impl Into<E>,
+        delta: impl Into<f64>,
     ) -> Result<Self, D::Err>
     where
         T: Merge<R>,
     {
-        let delta = delta.into();
+        let delta = E::from_f64(delta.into()).unwrap();
         try_binary_op(HuberErrorKernelOp { delta }, self, rhs)
     }
 }
@@ -66,14 +66,18 @@ mod tests {
     #[test]
     fn test_huber_error() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([
-            [-0.8424031, 0.6309481, 1.0416432],
-            [1.325225, 0.5840275, 1.9167633],
-        ]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.52022195, 0.578804, 0.17535722],
-            [0.75429636, 0.66566986, 0.6182751],
-        ]);
+        let a = dev
+            .tensor([
+                [-0.8424031, 0.6309481, 1.0416432],
+                [1.325225, 0.5840275, 1.9167633],
+            ])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([
+                [0.52022195, 0.578804, 0.17535722],
+                [0.75429636, 0.66566986, 0.6182751],
+            ])
+            .to_dtype::<TestDtype>();
         let r1 = a.leaky_trace().huber_error(b.leaky_trace(), 1.0);
         let r2 = a.leaky_trace().huber_error(b.leaky_trace(), 100.0);
         assert_close_to_literal!(
diff --git a/src/tensor_ops/ln/cuda_kernel.rs b/src/tensor_ops/ln/cuda_kernel.rs
index 535bbef3f..33a15186b 100644
--- a/src/tensor_ops/ln/cuda_kernel.rs
+++ b/src/tensor_ops/ln/cuda_kernel.rs
@@ -4,5 +4,13 @@ unsafe impl cudarc::driver::DeviceRepr for super::LnKernelOp {}
 
 const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/ln.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(
+    super::LnKernelOp,
+    half::f16,
+    PTX_SRC,
+    "ln_fwd_f16",
+    "ln_bwd_f16"
+);
 cuda_unary!(super::LnKernelOp, f32, PTX_SRC, "ln_fwd_f32", "ln_bwd_f32");
 cuda_unary!(super::LnKernelOp, f64, PTX_SRC, "ln_fwd_f64", "ln_bwd_f64");
diff --git a/src/tensor_ops/ln/ln.cu b/src/tensor_ops/ln/ln.cu
index 47e1ae910..8f6dcf278 100644
--- a/src/tensor_ops/ln/ln.cu
+++ b/src/tensor_ops/ln/ln.cu
@@ -2,11 +2,15 @@
 struct LnKernelOp {};
 
+UNARY_OP(__half, ln_fwd_f16, ln_bwd_f16, LnKernelOp,
+        logg(x),
+        recipg(x))
+
 UNARY_OP(float, ln_fwd_f32, ln_bwd_f32, LnKernelOp,
-        logf(x),
-        1.0 / x)
+        logg(x),
+        recipg(x))
 
 UNARY_OP(double, ln_fwd_f64, ln_bwd_f64, LnKernelOp,
-        log(x),
-        1.0 / x)
+        logg(x),
+        recipg(x))
\ No newline at end of file
diff --git a/src/tensor_ops/ln/mod.rs b/src/tensor_ops/ln/mod.rs
index 6dcaedfe6..93f8d9bd4 100644
--- a/src/tensor_ops/ln/mod.rs
+++ b/src/tensor_ops/ln/mod.rs
@@ -45,12 +45,17 @@ mod tests {
     #[test]
     fn test_ln() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let x = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = x.leaky_trace().ln();
         let r_array = r.array();
         assert!(r_array[0].is_nan());
         assert!(r_array[1].is_nan());
-        assert!(r_array[2..] == [TestDtype::NEG_INFINITY, 0.0, TestDtype::ln(2.0)]);
+        assert!(r_array[2].is_infinite() && r_array[2].is_sign_negative());
+        assert_eq!(r_array[3], TestDtype::default());
+        let t: TestDtype = NumCast::from(2.0f64.ln()).unwrap();
+        assert_eq!(r_array[4], t);
         let g = r.mean().backward();
         assert_close_to_literal!(g.get(&x), [-0.1, -0.2, f64::INFINITY, 0.2, 0.1]);
     }
diff --git a/src/tensor_ops/log_softmax.rs b/src/tensor_ops/log_softmax.rs
index 59557c654..fda0c9d73 100644
--- a/src/tensor_ops/log_softmax.rs
+++ b/src/tensor_ops/log_softmax.rs
@@ -103,7 +103,9 @@ mod tests {
     #[test]
     fn test_log_softmax_1d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let a = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().log_softmax();
         assert_close_to_literal!(
             r,
@@ -125,7 +127,9 @@
     #[test]
     fn test_log_softmax_2d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]);
+        let a = dev
+            .tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().log_softmax::<Axis<1>>();
         assert_close_to_literal!(
             r,
diff --git a/src/tensor_ops/logsumexp_to.rs b/src/tensor_ops/logsumexp_to.rs
index a5e54ab3c..a188070c8 100644
--- a/src/tensor_ops/logsumexp_to.rs
+++ b/src/tensor_ops/logsumexp_to.rs
@@ -73,7 +73,9 @@ mod tests {
     #[test]
     fn test_logsumexp_1d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let a = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().logsumexp();
         assert_close_to_literal!(r, 2.4519143);
         let g = r.backward();
@@ -86,7 +88,9 @@
     #[test]
     fn test_logsumexp_2d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]);
+        let a = dev
+            .tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().logsumexp::<Rank1<2>, _>();
         assert_close_to_literal!(r, [0.40760595, 7.0509458]);
         let g = r.mean().backward();
diff --git a/src/tensor_ops/matmul/cpu_kernel.rs b/src/tensor_ops/matmul/cpu_kernel.rs
index 4a08face8..d2faa19de 100644
--- a/src/tensor_ops/matmul/cpu_kernel.rs
+++ b/src/tensor_ops/matmul/cpu_kernel.rs
@@ -17,12 +17,8 @@ use cblas_sys::{
 ))]
 use matrixmultiply::{dgemm, sgemm};
 
-#[cfg(not(any(
-    feature = "cpu-seq-matmul",
-    feature = "cpu-par-matmul",
-    feature = "cpu-mkl-matmul"
-)))]
-fn gemm<F: num_traits::Float, M: Dim, K: Dim, N: Dim>(
+#[allow(unused)]
+fn naive_gemm<F: num_traits::Float, M: Dim, K: Dim, N: Dim>(
     (m, k, n): (M, K, N),
     ap: *const F,
     a_strides: [usize; 2],
@@ -57,6 +53,22 @@ pub(crate) trait MatMulImpl<E> {
     );
 }
 
+#[cfg(feature = "f16")]
+impl MatMulImpl<half::f16> for Cpu {
+    #[inline]
+    fn matmul<M: Dim, K: Dim, N: Dim>(
+        (m, k, n): (M, K, N),
+        ap: *const half::f16,
+        a_strides: [usize; 2],
+        bp: *const half::f16,
+        b_strides: [usize; 2],
+        cp: *mut half::f16,
+        c_strides: [usize; 2],
+    ) {
+        naive_gemm((m, k, n), ap, a_strides, bp, b_strides, cp, c_strides);
+    }
+}
+
 impl MatMulImpl<f32> for Cpu {
     #[inline]
     fn matmul<M: Dim, K: Dim, N: Dim>(
@@ -103,7 +115,7 @@
             feature = "cpu-par-matmul",
             feature = "cpu-mkl-matmul"
         )))]
-        gemm((m, k, n), ap, a_strides, bp, b_strides, cp, c_strides);
+        naive_gemm((m, k, n), ap, a_strides, bp, b_strides, cp, c_strides);
     }
 }
 
@@ -156,7 +168,7 @@ impl MatMulImpl<f64> for Cpu {
             feature = "cpu-par-matmul",
             feature = "cpu-mkl-matmul"
         )))]
-        gemm((m, k, n), ap, a_strides, bp, b_strides, cp, c_strides);
+        naive_gemm((m, k, n), ap, a_strides, bp, b_strides, cp, c_strides);
     }
 }
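NOTE: there is no matrixmultiply/cblas backend for f16, so the f16 MatMulImpl above always routes through naive_gemm. Its body is outside this hunk; presumably it is a strided triple loop along these lines (a safe-Rust sketch over slices rather than dfdx's raw pointers and Dim types):

    /// c[m][n] += sum over k of a[m][k] * b[k][n], with explicit strides.
    fn naive_gemm_sketch(
        (m, k, n): (usize, usize, usize),
        a: &[f32],
        a_strides: [usize; 2],
        b: &[f32],
        b_strides: [usize; 2],
        c: &mut [f32],
        c_strides: [usize; 2],
    ) {
        for i_m in 0..m {
            for i_k in 0..k {
                let a_v = a[i_m * a_strides[0] + i_k * a_strides[1]];
                for i_n in 0..n {
                    let b_v = b[i_k * b_strides[0] + i_n * b_strides[1]];
                    c[i_m * c_strides[0] + i_n * c_strides[1]] += a_v * b_v;
                }
            }
        }
    }

    fn main() {
        // (1x2) * (2x1), row-major strides.
        let (a, b, mut c) = ([1.0, 2.0], [3.0, 4.0], [0.0]);
        naive_gemm_sketch((1, 2, 1), &a, [2, 1], &b, [1, 1], &mut c, [1, 1]);
        assert_eq!(c, [11.0]);
    }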
diff --git a/src/tensor_ops/matmul/mod.rs b/src/tensor_ops/matmul/mod.rs
index a59f3a0d3..954857b4f 100644
--- a/src/tensor_ops/matmul/mod.rs
+++ b/src/tensor_ops/matmul/mod.rs
@@ -365,14 +365,17 @@
     fn test_matmul_normal() {
         let dev: TestDevice = Default::default();
 
-        let a: Tensor<_, TestDtype, _> = dev.tensor([
-            [0.5086, 0.5234, 0.2684],
-            [0.8075, 0.8437, 0.9951],
-            [0.0774, 0.7539, 0.8894],
-            [0.8119, 0.2693, 0.7249],
-        ]);
-        let b: Tensor<_, TestDtype, _> =
-            dev.tensor([[0.4651, 0.9106], [0.3360, 0.5534], [0.8092, 0.3827]]);
+        let a = dev
+            .tensor([
+                [0.5086, 0.5234, 0.2684],
+                [0.8075, 0.8437, 0.9951],
+                [0.0774, 0.7539, 0.8894],
+                [0.8119, 0.2693, 0.7249],
+            ])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.4651, 0.9106], [0.3360, 0.5534], [0.8092, 0.3827]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().matmul(b.clone());
         assert_close_to_literal!(
             r,
@@ -435,7 +438,7 @@
         }
         let gs = r.sum().backward();
         let a_grad = gs.get(&a).array();
-        let mut sub_bs_summed = [[0.0; 2]; 3];
+        let mut sub_bs_summed = [[Default::default(); 2]; 3];
         for i in 0..N {
             let sub_a = dev.tensor(a_array[i]);
             let sub_gs = sub_a.leaky_trace().matmul(b.clone()).sum().backward();
@@ -524,9 +527,10 @@
     fn test_matmul_vec_normal() {
         let dev: TestDevice = Default::default();
 
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.7296, 0.3974, 0.9487]);
-        let b: Tensor<_, TestDtype, _> =
-            dev.tensor([[0.7804, 0.5540], [0.5378, 0.8401], [0.5042, 0.8604]]);
+        let a = dev.tensor([0.7296, 0.3974, 0.9487]).to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.7804, 0.5540], [0.5378, 0.8401], [0.5042, 0.8604]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().matmul(b.clone());
         assert_close_to_literal!(r, [1.261436, 1.5543157]);
         let g = r.exp().mean().backward();
@@ -544,9 +548,10 @@
     #[test]
     fn test_matmul_vec_transpose() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.7296, 0.3974, 0.9487]);
-        let b: Tensor<_, TestDtype, _> =
-            dev.tensor([[0.7804, 0.5378, 0.5042], [0.5540, 0.8401, 0.8604]]);
+        let a = dev.tensor([0.7296, 0.3974, 0.9487]).to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.7804, 0.5378, 0.5042], [0.5540, 0.8401, 0.8604]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().matmul(b.leaky_trace().permute());
         assert_close_to_literal!(r, [1.261436, 1.5543157]);
         let g = r.exp().mean().backward();
@@ -563,9 +568,12 @@
     #[test]
     fn test_vecvec() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> =
-            dev.tensor([-1.5333828, 0.6136148, -0.77502704, -1.0014728, -2.0131118]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([0.43068963, -0.9757187, -0.50650096]);
+        let a = dev
+            .tensor([-1.5333828, 0.6136148, -0.77502704, -1.0014728, -2.0131118])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([0.43068963, -0.9757187, -0.50650096])
+            .to_dtype::<TestDtype>();
         let c = a.leaky_trace().matmul(b.clone());
         let c_t = b.leaky_trace().matmul(a.clone()).permute();
         assert_eq!(c.array(), c_t.array());
@@ -592,8 +600,8 @@
     #[test]
     fn test_small_matmul_vv() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.5]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([2.0]);
+        let a = dev.tensor([0.5]).to_dtype::<TestDtype>();
+        let b = dev.tensor([2.0]).to_dtype::<TestDtype>();
         let c = a.leaky_trace().matmul(b.clone());
         assert_close_to_literal!(c, [[1.0]]);
         let g = c.exp().sum().backward();
@@ -606,8 +614,8 @@
         let dev: TestDevice = Default::default();
 
         // 1 * 1x1
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.5]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0]]);
+        let a = dev.tensor([0.5]).to_dtype::<TestDtype>();
+        let b = dev.tensor([[2.0]]).to_dtype::<TestDtype>();
         let c = a.leaky_trace().matmul(b.clone());
         assert_close_to_literal!(c, [1.0]);
         let g = c.exp().sum().backward();
@@ -622,8 +630,8 @@
         assert_close_to_literal!(g.get(&b), [[1.3591409]]);
 
         // 1 * 1x2
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.5]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0, 4.0]]);
+        let a = dev.tensor([0.5]).to_dtype::<TestDtype>();
+        let b = dev.tensor([[2.0, 4.0]]).to_dtype::<TestDtype>();
         let c = a.leaky_trace().matmul(b.clone());
         let e: [f64; 2] = [1.0, 2.0];
         assert_close_to_literal!(c, e);
@@ -632,8 +640,8 @@
         assert_close_to_literal!(g.get(&b), [[1.3591409, 3.694528]]);
 
         // 1 * 1x2 (permuted)
-        let a: Tensor<_, TestDtype, _> = dev.tensor([0.5]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0], [4.0]]);
+        let a = dev.tensor([0.5]).to_dtype::<TestDtype>();
+        let b = dev.tensor([[2.0], [4.0]]).to_dtype::<TestDtype>();
         let c = a.leaky_trace().matmul(b.leaky_trace().permute());
         assert_close_to_literal!(c, e);
         let g = c.exp().sum().backward();
@@ -647,8 +655,8 @@
 
         {
             // 1x1 * 1x1
-            let a: Tensor<_, TestDtype, _> = dev.tensor([[0.5]]);
-            let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0]]);
+            let a = dev.tensor([[0.5]]).to_dtype::<TestDtype>();
+            let b = dev.tensor([[2.0]]).to_dtype::<TestDtype>();
             let c = a.leaky_trace().matmul(b.clone());
             assert_close_to_literal!(c, [[1.0]]);
             let g = c.exp().sum().backward();
@@ -658,8 +666,8 @@
 
         {
             // 1x2 * 2x1
-            let a: Tensor<_, TestDtype, _> = dev.tensor([[0.5, 0.1]]);
-            let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0], [4.0]]);
+            let a = dev.tensor([[0.5, 0.1]]).to_dtype::<TestDtype>();
+            let b = dev.tensor([[2.0], [4.0]]).to_dtype::<TestDtype>();
             let c = a.leaky_trace().matmul(b.clone());
             assert_close_to_literal!(c, [[1.4]]);
             let g = c.exp().sum().backward();
@@ -669,8 +677,8 @@
 
         {
             // 1x2 (permuted) * 2x1
-            let a: Tensor<_, TestDtype, _> = dev.tensor([[0.5], [0.1]]);
-            let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0], [4.0]]);
+            let a = dev.tensor([[0.5], [0.1]]).to_dtype::<TestDtype>();
+            let b = dev.tensor([[2.0], [4.0]]).to_dtype::<TestDtype>();
             let c = a.leaky_trace().permute().matmul(b.clone());
             assert_close_to_literal!(c, [[1.4]]);
             let g = c.exp().sum().backward();
@@ -680,8 +688,8 @@
 
         {
             // 1x2 * 2x1 (permuted)
-            let a: Tensor<_, TestDtype, _> = dev.tensor([[0.5, 0.1]]);
-            let b: Tensor<_, TestDtype, _> = dev.tensor([[2.0, 4.0]]);
+            let a = dev.tensor([[0.5, 0.1]]).to_dtype::<TestDtype>();
+            let b = dev.tensor([[2.0, 4.0]]).to_dtype::<TestDtype>();
             let c = a.leaky_trace().matmul(b.leaky_trace().permute());
             assert_close_to_literal!(c, [[1.4]]);
             let g = c.exp().sum().backward();
diff --git a/src/tensor_ops/max_to/cuda_kernel.rs b/src/tensor_ops/max_to/cuda_kernel.rs
index 266f62a44..f58aa432b 100644
--- a/src/tensor_ops/max_to/cuda_kernel.rs
+++ b/src/tensor_ops/max_to/cuda_kernel.rs
@@ -16,6 +16,13 @@ trait HasCudaKernel<E> {
     const FNS: &'static [&'static str];
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const INIT: half::f16 = half::f16::NEG_INFINITY;
+    const MOD: &'static str = "max_f16";
+    const FNS: &'static [&'static str] = &["max_to_fwd_f16", "max_to_bwd_f16", "fill_with_f16"];
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const INIT: f32 = f32::NEG_INFINITY;
     const MOD: &'static str = "max_f32";
diff --git a/src/tensor_ops/max_to/max_to.cu b/src/tensor_ops/max_to/max_to.cu
index 846245422..99c173356 100644
--- a/src/tensor_ops/max_to/max_to.cu
+++ b/src/tensor_ops/max_to/max_to.cu
@@ -1,23 +1,5 @@
 #include "cuda_utils.cuh"
 
-// atomicMax is not implemented for floats,
-// solution copied https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
-__device__ __forceinline__ float atomicMaxf(float * addr, float value) {
-    if (signbit(value)) {
-        return __uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value)));
-    } else {
-        return __int_as_float(atomicMax((int *)addr, __float_as_int(value)));
-    }
-}
-
-__device__ __forceinline__ double atomicMaxf(double * addr, double value) {
-    if (signbit(value)) {
-        return __longlong_as_double(atomicMin((unsigned long long int *)addr, __double_as_longlong(value)));
-    } else {
-        return __longlong_as_double(atomicMax((long long int *)addr, __double_as_longlong(value)));
-    }
-}
-
 // Efficiently computes the max of each chunk in "data" of size chunk_len, and
 // stores the maximums in out[i / chunk_len]
 template<typename T>
@@ -140,5 +122,6 @@ extern "C" __global__ void BWD( \
     max_to_bwd(numel, num_dims, elems_per_thread, info, inp, grad_inp, out, grad_out); \
 }
 
+MAX(__half, max_to_fwd_f16, max_to_bwd_f16);
 MAX(float, max_to_fwd_f32, max_to_bwd_f32);
 MAX(double, max_to_fwd_f64, max_to_bwd_f64);
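NOTE: max_to (and min_to below) select their kernels through a per-dtype HasCudaKernel impl whose INIT constant is the reduction identity: negative infinity for max, positive infinity for min, so every real value "wins" against the initial fill. A stripped-down sketch of that trait shape (illustrative, not dfdx's exact definition):

    use half::f16;

    /// Per-dtype metadata for a GPU reduction, sketched.
    trait HasCudaKernel<E> {
        /// Identity element the output is filled with before reducing.
        const INIT: E;
        /// Names of the compiled kernel functions to load.
        const FNS: &'static [&'static str];
    }

    struct Cuda;

    impl HasCudaKernel<f16> for Cuda {
        const INIT: f16 = f16::NEG_INFINITY;
        const FNS: &'static [&'static str] = &["max_to_fwd_f16", "max_to_bwd_f16"];
    }

    fn main() {
        // -inf loses to even the smallest representable f16.
        assert!(<Cuda as HasCudaKernel<f16>>::INIT < f16::MIN);
    }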
diff --git a/src/tensor_ops/max_to/mod.rs b/src/tensor_ops/max_to/mod.rs
index f9537384e..934575812 100644
--- a/src/tensor_ops/max_to/mod.rs
+++ b/src/tensor_ops/max_to/mod.rs
@@ -90,7 +90,9 @@ mod tests {
     #[test]
     fn test_max_axis_0_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 2.0], [3.0, -2.0, 2.0]]);
+        let t = dev
+            .tensor([[1.0, 2.0, 2.0], [3.0, -2.0, 2.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().max::<_, Axis<0>>();
         assert_close_to_literal!(r, [3.0, 2.0, 2.0]);
         let g = r.exp().mean().backward();
@@ -103,7 +105,9 @@
     #[test]
     fn test_max_axis_1_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 2.0], [3.0, -2.0, 2.0]]);
+        let t = dev
+            .tensor([[1.0, 2.0, 2.0], [3.0, -2.0, 2.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().max::<_, Axis<1>>();
         assert_close_to_literal!(r, [2.0, 3.0]);
         let g = r.sum().backward();
@@ -113,7 +117,7 @@
     #[test]
     fn test_max_axes_3d_to_1d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.sample_normal::<Rank3<2, 3, 4>>();
+        let t: Tensor<Rank3<2, 3, 4>, TestDtype, _> = dev.sample_normal();
         let r = t.leaky_trace().max::<Rank1<4>, _>();
         let r2 = t.leaky_trace().max::<_, Axis<0>>().max::<_, Axis<0>>();
         assert_close_to_tensor!(r, r2);
@@ -125,8 +129,9 @@
     #[test]
     fn test_max_negative_zero() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> =
-            dev.tensor([[-0.0, 0.0], [0.0, -0.0], [-1.0, -0.0], [-1.0, 0.0]]);
+        let t = dev
+            .tensor([[-0.0, 0.0], [0.0, -0.0], [-1.0, -0.0], [-1.0, 0.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().max::<_, Axis<1>>();
         assert_close_to_literal!(r, [0.0, 0.0, -0.0, 0.0]);
         let g = r.sum().backward();
diff --git a/src/tensor_ops/maximum/cuda_kernel.rs b/src/tensor_ops/maximum/cuda_kernel.rs
index 8afafe759..7e71e6b62 100644
--- a/src/tensor_ops/maximum/cuda_kernel.rs
+++ b/src/tensor_ops/maximum/cuda_kernel.rs
@@ -5,6 +5,15 @@ unsafe impl cudarc::driver::DeviceRepr for Max {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/maximum.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_binary!(
+    Max,
+    half::f16,
+    PTX,
+    "maximum_fwd_f16",
+    "maximum_bwd_lhs_f16",
+    "maximum_bwd_rhs_f16"
+);
 cuda_binary!(
     Max,
     f32,
diff --git a/src/tensor_ops/maximum/maximum.cu b/src/tensor_ops/maximum/maximum.cu
index 81735c71a..8066e78bf 100644
--- a/src/tensor_ops/maximum/maximum.cu
+++ b/src/tensor_ops/maximum/maximum.cu
@@ -17,6 +17,12 @@ __device__ T op_dfdy(T x, T y) {
     return (x > y) ? 0.0 : ((x < y) ? 1.0 : 0.5);
 }
 
+BINARY_OP(__half, maximum_fwd_f16, maximum_bwd_lhs_f16, maximum_bwd_rhs_f16, MaximumKernalOp,
+        op_f(x, y),
+        op_dfdx(x, y),
+        op_dfdy(x, y)
+)
+
 BINARY_OP(float, maximum_fwd_f32, maximum_bwd_lhs_f32, maximum_bwd_rhs_f32, MaximumKernalOp,
         op_f(x, y),
         op_dfdx(x, y),
diff --git a/src/tensor_ops/maximum/mod.rs b/src/tensor_ops/maximum/mod.rs
index fbe3bbf96..5b514ddc8 100644
--- a/src/tensor_ops/maximum/mod.rs
+++ b/src/tensor_ops/maximum/mod.rs
@@ -54,8 +54,12 @@ mod tests {
     #[test]
     fn test_maximum() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-1.0, 0.0, 1.0], [3.0, 4.0, -5.0]]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([[0.0, 0.0, -1.0], [3.0, -4.0, 5.0]]);
+        let a = dev
+            .tensor([[-1.0, 0.0, 1.0], [3.0, 4.0, -5.0]])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.0, 0.0, -1.0], [3.0, -4.0, 5.0]])
+            .to_dtype::<TestDtype>();
 
         let result = a.leaky_trace().maximum(b.clone());
         assert_close_to_literal!(result, [[0.0, 0.0, 1.0], [3.0, 4.0, 5.0]]);
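NOTE: in maximum.cu above (and minimum.cu below), op_dfdx/op_dfdy handle the x == y tie by splitting the subgradient evenly: each input gets 0.5 instead of the full 1.0. The same rule in plain Rust, for reference:

    /// d(max(x, y))/dx, with ties splitting the gradient evenly.
    fn dmax_dx(x: f32, y: f32) -> f32 {
        if x > y {
            1.0
        } else if x < y {
            0.0
        } else {
            0.5
        }
    }

    fn main() {
        assert_eq!(dmax_dx(2.0, 1.0), 1.0);
        assert_eq!(dmax_dx(1.0, 2.0), 0.0);
        assert_eq!(dmax_dx(1.0, 1.0), 0.5); // tie: each input gets half
    }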
diff --git a/src/tensor_ops/mean_to.rs b/src/tensor_ops/mean_to.rs
index 911ae4544..0012dd856 100644
--- a/src/tensor_ops/mean_to.rs
+++ b/src/tensor_ops/mean_to.rs
@@ -71,7 +71,7 @@ mod tests {
     #[test]
     fn test_mean_1d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]);
+        let t = dev.tensor([1.0, 2.0, 3.0]).to_dtype::<TestDtype>();
         let r = t.leaky_trace().mean();
         assert_close_to_literal!(r, 2.0);
         // NOTE: .exp() so we cover the case where .mean() has to use result grad.
@@ -82,7 +82,9 @@
     #[test]
     fn test_mean_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]);
+        let t = dev
+            .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().mean();
         assert_close_to_literal!(r, 3.5);
         let g = r.backward();
@@ -92,7 +94,7 @@
     #[test]
     fn test_mean_3d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.ones::<Rank3<2, 3, 4>>();
+        let t: Tensor<Rank3<2, 3, 4>, TestDtype, _> = dev.ones();
         let r = t.leaky_trace().mean();
         assert_close_to_literal!(r, 1.0);
         let g = r.backward();
@@ -102,7 +104,9 @@
     #[test]
     fn test_mean_axis_0_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]);
+        let t = dev
+            .tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().mean::<Rank1<3>, _>();
         assert_close_to_literal!(r, [-0.5, 3.0, -1.5]);
         let g = r.exp().mean().backward();
@@ -112,7 +116,9 @@
     #[test]
     fn test_mean_axis_1_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]);
+        let t = dev
+            .tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().mean::<Rank1<2>, _>();
         assert_close_to_literal!(r, [2.0, -4.0 / 3.0]);
         let g = r.exp().mean().backward();
diff --git a/src/tensor_ops/min_to/cuda_kernel.rs b/src/tensor_ops/min_to/cuda_kernel.rs
index 6efcb825e..fb7d1f07b 100644
--- a/src/tensor_ops/min_to/cuda_kernel.rs
+++ b/src/tensor_ops/min_to/cuda_kernel.rs
@@ -16,6 +16,13 @@ trait HasCudaKernel<E> {
     const FNS: &'static [&'static str];
 }
 
+#[cfg(feature = "f16")]
+impl HasCudaKernel<half::f16> for Cuda {
+    const INIT: half::f16 = half::f16::INFINITY;
+    const MOD: &'static str = "min_f16";
+    const FNS: &'static [&'static str] = &["min_to_fwd_f16", "min_to_bwd_f16", "fill_with_f16"];
+}
+
 impl HasCudaKernel<f32> for Cuda {
     const INIT: f32 = f32::INFINITY;
     const MOD: &'static str = "min_f32";
diff --git a/src/tensor_ops/min_to/min_to.cu b/src/tensor_ops/min_to/min_to.cu
index ffd860729..8ba24c158 100644
--- a/src/tensor_ops/min_to/min_to.cu
+++ b/src/tensor_ops/min_to/min_to.cu
@@ -1,23 +1,5 @@
 #include "cuda_utils.cuh"
 
-// atomicMax is not implemented for floats,
-// solution copied https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
-__device__ __forceinline__ float atomicMinf(float * addr, float value) {
-    if (signbit(value)) {
-        return __uint_as_float(atomicMax((unsigned int *)addr, __float_as_uint(value)));
-    } else {
-        return __int_as_float(atomicMin((int *)addr, __float_as_int(value)));
-    }
-}
-
-__device__ __forceinline__ double atomicMinf(double * addr, double value) {
-    if (signbit(value)) {
-        return __longlong_as_double(atomicMax((unsigned long long int *)addr, __double_as_longlong(value)));
-    } else {
-        return __longlong_as_double(atomicMin((long long int *)addr, __double_as_longlong(value)));
-    }
-}
-
 // Efficiently computes the min of each chunk in "data" of size chunk_len, and
 // stores the minimums in out[i / chunk_len]
 template<typename T>
@@ -140,5 +122,6 @@ extern "C" __global__ void BWD( \
     min_to_bwd(numel, num_dims, elems_per_thread, info, inp, grad_inp, out, grad_out); \
 }
 
+MIN(__half, min_to_fwd_f16, min_to_bwd_f16);
 MIN(float, min_to_fwd_f32, min_to_bwd_f32);
 MIN(double, min_to_fwd_f64, min_to_bwd_f64);
diff --git a/src/tensor_ops/min_to/mod.rs b/src/tensor_ops/min_to/mod.rs
index 690c78de3..f392c4c8a 100644
--- a/src/tensor_ops/min_to/mod.rs
+++ b/src/tensor_ops/min_to/mod.rs
@@ -90,7 +90,9 @@ mod tests {
     #[test]
     fn test_min_axis_0_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 1.0, 2.0], [3.0, -2.0, 2.0]]);
+        let t = dev
+            .tensor([[1.0, 1.0, 2.0], [3.0, -2.0, 2.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().min::<Rank1<3>, _>();
         assert_close_to_literal!(r, [1.0, -2.0, 2.0]);
         let g = r.exp().mean().backward();
@@ -103,7 +105,9 @@
     #[test]
     fn test_min_axis_1_2d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 1.0, 2.0], [3.0, -2.0, 2.0]]);
+        let t = dev
+            .tensor([[1.0, 1.0, 2.0], [3.0, -2.0, 2.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().min::<Rank1<2>, _>();
         assert_close_to_literal!(r, [1.0, -2.0]);
         let g = r.sum().backward();
@@ -113,7 +117,7 @@
     #[test]
     fn test_min_axes_3d_to_1d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.sample_normal::<Rank3<2, 3, 4>>();
+        let t: Tensor<Rank3<2, 3, 4>, TestDtype, _> = dev.sample_normal();
         let r = t.leaky_trace().min::<Rank1<4>, _>();
         let r2 = t.leaky_trace().min::<Rank2<3, 4>, _>().min::<Rank1<4>, _>();
         assert_close_to_tensor!(r, r2);
@@ -125,8 +129,9 @@
     #[test]
     fn test_min_negative_zero() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> =
-            dev.tensor([[-0.0, 0.0], [0.0, -0.0], [-1.0, -0.0], [-1.0, 0.0]]);
+        let t = dev
+            .tensor([[-0.0, 0.0], [0.0, -0.0], [-1.0, -0.0], [-1.0, 0.0]])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().min::<_, Axis<1>>();
         assert_close_to_literal!(r, [-0.0, -0.0, -1.0, -1.0]);
         let g = r.sum().backward();
diff --git a/src/tensor_ops/minimum/cuda_kernel.rs b/src/tensor_ops/minimum/cuda_kernel.rs
index 4368149f7..deb9a8f70 100644
--- a/src/tensor_ops/minimum/cuda_kernel.rs
+++ b/src/tensor_ops/minimum/cuda_kernel.rs
@@ -5,6 +5,15 @@ unsafe impl cudarc::driver::DeviceRepr for super::MinimumKernelOp {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/minimum.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_binary!(
+    Min,
+    half::f16,
+    PTX,
+    "minimum_fwd_f16",
+    "minimum_bwd_lhs_f16",
+    "minimum_bwd_rhs_f16"
+);
 cuda_binary!(
     Min,
     f32,
diff --git a/src/tensor_ops/minimum/minimum.cu b/src/tensor_ops/minimum/minimum.cu
index 058b999e6..8417bd4e0 100644
--- a/src/tensor_ops/minimum/minimum.cu
+++ b/src/tensor_ops/minimum/minimum.cu
@@ -17,6 +17,12 @@ __device__ T op_dfdy(T x, T y) {
     return (x < y) ? 0.0 : ((x > y) ? 1.0 : 0.5);
 }
 
+BINARY_OP(__half, minimum_fwd_f16, minimum_bwd_lhs_f16, minimum_bwd_rhs_f16, MinimumKernelOp,
+        op_f(x, y),
+        op_dfdx(x, y),
+        op_dfdy(x, y)
+)
+
 BINARY_OP(float, minimum_fwd_f32, minimum_bwd_lhs_f32, minimum_bwd_rhs_f32, MinimumKernelOp,
         op_f(x, y),
         op_dfdx(x, y),
diff --git a/src/tensor_ops/minimum/mod.rs b/src/tensor_ops/minimum/mod.rs
index 529d7b792..c0d766082 100644
--- a/src/tensor_ops/minimum/mod.rs
+++ b/src/tensor_ops/minimum/mod.rs
@@ -53,8 +53,12 @@ mod tests {
     #[test]
     fn test_minimum() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-1.0, 0.0, 1.0], [3.0, 4.0, -5.0]]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([[0.0, 0.0, -1.0], [3.0, -4.0, 5.0]]);
+        let a = dev
+            .tensor([[-1.0, 0.0, 1.0], [3.0, 4.0, -5.0]])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.0, 0.0, -1.0], [3.0, -4.0, 5.0]])
+            .to_dtype::<TestDtype>();
 
         let result = a.leaky_trace().minimum(b.clone());
         assert_close_to_literal!(result, [[-1., 0., -1.], [3., -4., -5.]]);
diff --git a/src/tensor_ops/mul/binary_mul.cu b/src/tensor_ops/mul/binary_mul.cu
index d14056e64..881722e20 100644
--- a/src/tensor_ops/mul/binary_mul.cu
+++ b/src/tensor_ops/mul/binary_mul.cu
@@ -2,6 +2,11 @@
 struct BinaryMulKernalOp {};
 
+BINARY_OP(__half, bmul_fwd_f16, bmul_bwd_lhs_f16, bmul_bwd_rhs_f16, BinaryMulKernalOp,
+        x * y,
+        y,
+        x)
+
 BINARY_OP(float, bmul_fwd_f32, bmul_bwd_lhs_f32, bmul_bwd_rhs_f32, BinaryMulKernalOp,
         x * y,
         y,
diff --git a/src/tensor_ops/mul/cuda_kernel.rs b/src/tensor_ops/mul/cuda_kernel.rs
index 9f008f828..9eca6a4fe 100644
--- a/src/tensor_ops/mul/cuda_kernel.rs
+++ b/src/tensor_ops/mul/cuda_kernel.rs
@@ -1,6 +1,8 @@
 use super::{BinaryMulKernelOp as Binary, ScalarMulKernelOp as Scalar};
 use crate::tensor_ops::cuda_kernels::{cuda_binary, cuda_unary};
 
+#[cfg(feature = "f16")]
+unsafe impl cudarc::driver::DeviceRepr for Scalar<half::f16> {}
 unsafe impl cudarc::driver::DeviceRepr for Scalar<f32> {}
 unsafe impl cudarc::driver::DeviceRepr for Scalar<f64> {}
 unsafe impl cudarc::driver::DeviceRepr for Binary {}
@@ -8,8 +10,19 @@ unsafe impl cudarc::driver::DeviceRepr for Binary {}
 const SCALAR_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/scalar_mul.ptx"));
 const BINARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/binary_mul.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(const_df() Scalar<half::f16>, half::f16, SCALAR_PTX, "smul_fwd_f16", "smul_bwd_f16");
 cuda_unary!(const_df() Scalar<f32>, f32, SCALAR_PTX, "smul_fwd_f32", "smul_bwd_f32");
 cuda_unary!(const_df() Scalar<f64>, f64, SCALAR_PTX, "smul_fwd_f64", "smul_bwd_f64");
+#[cfg(feature = "f16")]
+cuda_binary!(
+    Binary,
+    half::f16,
+    BINARY_PTX,
+    "bmul_fwd_f16",
+    "bmul_bwd_lhs_f16",
+    "bmul_bwd_rhs_f16"
+);
 cuda_binary!(
     Binary,
     f32,
diff --git a/src/tensor_ops/mul/mod.rs b/src/tensor_ops/mul/mod.rs
index ff45b2c47..909bbd460 100644
--- a/src/tensor_ops/mul/mod.rs
+++ b/src/tensor_ops/mul/mod.rs
@@ -68,6 +68,16 @@ impl<S: Shape, E: Dtype, D: UnaryKernel<ScalarMulKernelOp<E>, E>, T: Tape<E, D>>
     }
 }
 
+#[cfg(feature = "f16")]
+impl<S: Shape, D: UnaryKernel<ScalarMulKernelOp<half::f16>, half::f16>, T: Tape<half::f16, D>>
+    TryMul<f32> for Tensor<S, half::f16, D, T>
+{
+    fn try_mul(self, rhs: f32) -> Result<Self, Self::Err> {
+        let scalar = half::f16::from_f32(rhs);
+        try_unary_op(ScalarMulKernelOp { scalar }, self)
+    }
+}
+
 impl<S: Shape, E: Dtype, D, T: Tape<E, D>, Rhs> std::ops::Mul<Rhs> for Tensor<S, E, D, T>
 where
@@ -85,8 +95,8 @@ mod tests {
     #[test]
     fn test_mul_0d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor(2.0);
-        let b: Tensor<_, TestDtype, _> = dev.tensor(3.0);
+        let a = dev.tensor(2.0).to_dtype::<TestDtype>();
+        let b = dev.tensor(3.0).to_dtype::<TestDtype>();
 
         let r = a.leaky_trace() * b.clone();
         assert_close_to_literal!(r, 6.0);
@@ -98,8 +108,8 @@ mod tests {
     #[test]
     fn test_mul_1d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]);
-        let b: Tensor<_, TestDtype, _> = dev.tensor([1.0, -1.0, 0.0]);
+        let a = dev.tensor([1.0, 2.0, 3.0]).to_dtype::<TestDtype>();
+        let b = dev.tensor([1.0, -1.0, 0.0]).to_dtype::<TestDtype>();
 
         let r = a.leaky_trace() * b.clone();
         assert_close_to_literal!(r, [1.0, -2.0, 0.0]);
@@ -111,10 +121,12 @@
     #[test]
     fn test_mul_2d() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> =
-            dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]);
-        let b: Tensor<_, TestDtype, _> =
-            dev.tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]);
+        let a = dev
+            .tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]])
+            .to_dtype::<TestDtype>();
+        let b = dev
+            .tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]])
+            .to_dtype::<TestDtype>();
 
         let r = a.leaky_trace() * b.clone();
         assert_close_to_literal!(
@@ -144,7 +156,7 @@
     #[test]
     fn test_scalar_mul_0d() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor(1.0);
+        let x = dev.tensor(1.0).to_dtype::<TestDtype>();
         let r = x.leaky_trace() * 0.5;
         assert_close_to_literal!(r, 0.5);
         let g = r.exp().backward();
@@ -154,7 +166,7 @@
     #[test]
     fn test_scalar_mul_1d() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([0.0, 1.0, 2.0]);
+        let x = dev.tensor([0.0, 1.0, 2.0]).to_dtype::<TestDtype>();
         let r = x.leaky_trace() * 0.5;
         assert_close_to_literal!(r, [0.0, 0.5, 1.0]);
         let g = r.exp().sum().backward();
@@ -164,7 +176,7 @@
     #[test]
     fn test_scalar_mul_2d() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([[1.0; 2]; 3]);
+        let x = dev.tensor([[1.0; 2]; 3]).to_dtype::<TestDtype>();
         let r = x.leaky_trace() * 0.5;
         assert_close_to_literal!(r, [[0.5; 2]; 3]);
         let g = r.exp().sum().backward();
diff --git a/src/tensor_ops/mul/scalar_mul.cu b/src/tensor_ops/mul/scalar_mul.cu
index 498decf7c..0062eb1cf 100644
--- a/src/tensor_ops/mul/scalar_mul.cu
+++ b/src/tensor_ops/mul/scalar_mul.cu
@@ -5,6 +5,10 @@ struct ScalarMulKernelOp {
     F scalar;
 };
 
+UNARY_OP(__half, smul_fwd_f16, smul_bwd_f16, ScalarMulKernelOp<__half>,
+        x * op.scalar,
+        op.scalar);
+
 UNARY_OP(float, smul_fwd_f32, smul_bwd_f32, ScalarMulKernelOp<float>,
         x * op.scalar,
         op.scalar);
diff --git a/src/tensor_ops/nans_to/cuda_kernel.rs b/src/tensor_ops/nans_to/cuda_kernel.rs
index 1fd8574bd..dae060434 100644
--- a/src/tensor_ops/nans_to/cuda_kernel.rs
+++ b/src/tensor_ops/nans_to/cuda_kernel.rs
@@ -1,10 +1,20 @@
 use super::NansToKernelOp as NansTo;
 use crate::tensor_ops::cuda_kernels::cuda_unary;
 
+#[cfg(feature = "f16")]
+unsafe impl cudarc::driver::DeviceRepr for NansTo<half::f16> {}
 unsafe impl cudarc::driver::DeviceRepr for NansTo<f32> {}
 unsafe impl cudarc::driver::DeviceRepr for NansTo<f64> {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/nans_to.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(
+    NansTo<half::f16>,
+    half::f16,
+    PTX,
+    "nans_to_fwd_f16",
+    "nans_to_bwd_f16"
+);
 cuda_unary!(NansTo<f32>, f32, PTX, "nans_to_fwd_f32", "nans_to_bwd_f32");
 cuda_unary!(NansTo<f64>, f64, PTX, "nans_to_fwd_f64", "nans_to_bwd_f64");
diff --git a/src/tensor_ops/nans_to/mod.rs b/src/tensor_ops/nans_to/mod.rs
index fa98308da..2909238df 100644
--- a/src/tensor_ops/nans_to/mod.rs
+++ b/src/tensor_ops/nans_to/mod.rs
@@ -24,19 +24,19 @@ pub struct NansToKernelOp<E>(E);
 /// ```
 pub fn nans_to<S: Shape, E: Dtype, D: UnaryKernel<NansToKernelOp<E>, E>, T: Tape<E, D>>(
     t: Tensor<S, E, D, T>,
-    value: impl Into<E>,
+    value: impl Into<f64>,
 ) -> Tensor<S, E, D, T> {
     t.nans_to(value)
 }
 
 impl<S: Shape, E: Dtype, D: UnaryKernel<NansToKernelOp<E>, E>, T: Tape<E, D>> Tensor<S, E, D, T> {
     /// See [nans_to]
-    pub fn nans_to(self, value: impl Into<E>) -> Self {
+    pub fn nans_to(self, value: impl Into<f64>) -> Self {
         self.try_nans_to(value).unwrap()
     }
     /// See [nans_to]
-    pub fn try_nans_to(self, value: impl Into<E>) -> Result<Self, D::Err> {
-        let value = value.into();
+    pub fn try_nans_to(self, value: impl Into<f64>) -> Result<Self, D::Err> {
+        let value = E::from_f64(value.into()).unwrap();
         try_unary_op(NansToKernelOp(value), self)
     }
 }
@@ -48,7 +48,9 @@ mod tests {
     #[test]
     fn test_nans_1d() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([1.0, TestDtype::NAN, -TestDtype::NAN, 4.0]);
+        let t = dev
+            .tensor([1.0, f64::NAN, -f64::NAN, 4.0])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().nans_to(0.0);
         assert_close_to_literal!(r, [1.0, 0.0, 0.0, 4.0]);
         // NOTE: .exp() so we cover case where nans_to() needs to use result grad
diff --git a/src/tensor_ops/nans_to/nans_to.cu b/src/tensor_ops/nans_to/nans_to.cu
index 9d27d1f10..6842b6d13 100644
--- a/src/tensor_ops/nans_to/nans_to.cu
+++ b/src/tensor_ops/nans_to/nans_to.cu
@@ -5,11 +5,15 @@ struct NansToKernelOp {
     F x;
 };
 
+UNARY_OP(__half, nans_to_fwd_f16, nans_to_bwd_f16, NansToKernelOp<__half>,
+        isnang(x) ? op.x : x,
+        isnang(x) ? 0.0 : 1.0)
+
 UNARY_OP(float, nans_to_fwd_f32, nans_to_bwd_f32, NansToKernelOp<float>,
-        isnan(x) ? op.x : x,
-        isnan(x) ? 0.0 : 1.0)
+        isnang(x) ? op.x : x,
+        isnang(x) ? 0.0 : 1.0)
 
 UNARY_OP(double, nans_to_fwd_f64, nans_to_bwd_f64, NansToKernelOp<double>,
-        isnan(x) ? op.x : x,
-        isnan(x) ? 0.0 : 1.0)
+        isnang(x) ? op.x : x,
+        isnang(x) ? 0.0 : 1.0)
\ No newline at end of file
diff --git a/src/tensor_ops/negate/cuda_kernel.rs b/src/tensor_ops/negate/cuda_kernel.rs
index 752d841cc..a6065e555 100644
--- a/src/tensor_ops/negate/cuda_kernel.rs
+++ b/src/tensor_ops/negate/cuda_kernel.rs
@@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for NegateKernelOp {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/negate.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(const_df() NegateKernelOp, half::f16, PTX, "negate_fwd_f16", "negate_bwd_f16");
 cuda_unary!(const_df() NegateKernelOp, f32, PTX, "negate_fwd_f32", "negate_bwd_f32");
 cuda_unary!(const_df() NegateKernelOp, f64, PTX, "negate_fwd_f64", "negate_bwd_f64");
diff --git a/src/tensor_ops/negate/mod.rs b/src/tensor_ops/negate/mod.rs
index 39fe4529d..bc36bb95b 100644
--- a/src/tensor_ops/negate/mod.rs
+++ b/src/tensor_ops/negate/mod.rs
@@ -51,7 +51,7 @@ mod tests {
     #[test]
     fn test_1d_neg() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([-2.0, 0.0, 5.0]);
+        let a = dev.tensor([-2.0, 0.0, 5.0]).to_dtype::<TestDtype>();
         let r = -(a.leaky_trace());
         assert_close_to_literal!(r, [2.0, 0.0, -5.0]);
         // NOTE: .exp() so we can make sure neg is using result grad properly
diff --git a/src/tensor_ops/negate/negate.cu b/src/tensor_ops/negate/negate.cu
index 701e0403b..f522cf93b 100644
--- a/src/tensor_ops/negate/negate.cu
+++ b/src/tensor_ops/negate/negate.cu
@@ -2,6 +2,10 @@
 struct NegateKernelOp {};
 
+UNARY_OP(__half, negate_fwd_f16, negate_bwd_f16, NegateKernelOp,
+        -x,
+        -1.0)
+
 UNARY_OP(float, negate_fwd_f32, negate_bwd_f32, NegateKernelOp,
         -x,
         -1.0)
diff --git a/src/tensor_ops/normalize.rs b/src/tensor_ops/normalize.rs
index 2da3db0ff..3e929dcf5 100644
--- a/src/tensor_ops/normalize.rs
+++ b/src/tensor_ops/normalize.rs
@@ -17,14 +17,14 @@ use super::{BroadcastTo, Device, MeanTo, TryAdd, TryDiv, TrySub};
 /// ```
 pub fn normalize<Ax: Axes, S: Shape + ReduceShape<Ax>, E: Dtype, D: Device<E>, T: Tape<E, D>>(
     t: Tensor<S, E, D, T>,
-    epsilon: impl Into<E>,
+    epsilon: impl Into<f64>,
 ) -> Tensor<S, E, D, T> {
     t.normalize::<Ax>(epsilon)
 }
 
 impl<S: Shape, E: Dtype, D: Device<E>, T: Tape<E, D>> Tensor<S, E, D, T> {
     /// See [normalize]
-    pub fn normalize<Ax: Axes>(self, epsilon: impl Into<E>) -> Self
+    pub fn normalize<Ax: Axes>(self, epsilon: impl Into<f64>) -> Self
     where
         S: ReduceShape<Ax>,
     {
@@ -34,7 +34,7 @@
     /// See [normalize]
     pub fn try_normalize<Ax: Axes>(
         self,
-        epsilon: impl Into<E>,
+        epsilon: impl Into<f64>,
     ) -> Result<Self, <D as HasErr>::Err>
     where
         S: ReduceShape<Ax>,
@@ -46,7 +46,7 @@
             .retaped::<T>()
             .try_square()?
             .try_mean::<_, Ax>()?
-            .try_add(epsilon.into())?
+            .try_add(E::from_f64(epsilon.into()).unwrap())?
             .try_sqrt()?;
         centered.try_div(std.try_broadcast_like(&shape)?)
     }
@@ -60,7 +60,7 @@ mod tests {
     #[test]
    fn test_1d_normalize_axis_last() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([-2.0, 0.0, 5.0]);
+        let a = dev.tensor([-2.0, 0.0, 5.0]).to_dtype::<TestDtype>();
         let r = a.leaky_trace().normalize(1e-5);
         assert_close_to_literal!(&r, [-1.0190487, -0.3396829, 1.3587316]);
         // NOTE: .exp() so we can make sure normalize is using result grad properly
@@ -71,7 +71,9 @@
     #[test]
     fn test_2d_normalize_axis_last() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, 0.0, 5.0], [1.0, 2.0, 3.0]]);
+        let a = dev
+            .tensor([[-2.0, 0.0, 5.0], [1.0, 2.0, 3.0]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().normalize::<Axis<1>>(1e-5);
         assert_close_to_literal!(
             r,
@@ -93,7 +95,9 @@
     #[test]
     fn test_2d_normalize_axis_first() {
         let dev: TestDevice = Default::default();
-        let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, 0.0], [1.0, 2.0], [4.0, 5.0]]);
+        let a = dev
+            .tensor([[-2.0, 0.0], [1.0, 2.0], [4.0, 5.0]])
+            .to_dtype::<TestDtype>();
         let r = a.leaky_trace().normalize::<Axis<0>>(1e-5);
         assert_close_to_literal!(
             r,
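NOTE: with epsilon crossing the API as f64 and narrowed by E::from_f64, the same normalize call works unchanged for any dtype. A usage sketch under the f16 feature (assuming a Cpu device; this mirrors the tests above rather than quoting them):

    use dfdx::prelude::*;

    fn main() {
        let dev: Cpu = Default::default();
        let a = dev.tensor([-2.0, 0.0, 5.0]).to_dtype::<half::f16>();
        // The literal is converted into the tensor's dtype inside try_normalize.
        let r = a.normalize(1e-5);
        assert!(r.array().iter().all(|v| v.is_finite()));
    }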
diff --git a/src/tensor_ops/pool2d/cuda_kernel.rs b/src/tensor_ops/pool2d/cuda_kernel.rs
index b38d1f209..d750fdfb1 100644
--- a/src/tensor_ops/pool2d/cuda_kernel.rs
+++ b/src/tensor_ops/pool2d/cuda_kernel.rs
@@ -74,6 +74,25 @@ macro_rules! pool_impl {
     };
 }
 
+#[cfg(feature = "f16")]
+pool_impl!(
+    AvgPool2DKernel<half::f16>,
+    "avg_pool2d_fwd_f16",
+    "avg_pool2d_bwd_f16"
+);
+#[cfg(feature = "f16")]
+pool_impl!(
+    MaxPool2DKernel<half::f16>,
+    "max_pool2d_fwd_f16",
+    "max_pool2d_bwd_f16"
+);
+#[cfg(feature = "f16")]
+pool_impl!(
+    MinPool2DKernel<half::f16>,
+    "min_pool2d_fwd_f16",
+    "min_pool2d_bwd_f16"
+);
+
 pool_impl!(
     AvgPool2DKernel<f32>,
     "avg_pool2d_fwd_f32",
diff --git a/src/tensor_ops/pool2d/mod.rs b/src/tensor_ops/pool2d/mod.rs
index bf98f9d84..666f23e65 100644
--- a/src/tensor_ops/pool2d/mod.rs
+++ b/src/tensor_ops/pool2d/mod.rs
@@ -201,7 +201,9 @@ mod tests {
     #[test]
     fn test_pool2d_3d_max2d_eq_grads() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([[[1.0, 1., 0.5, 0.2], [0.2, 0.2, 0.5, 1.2]]]);
+        let x = dev
+            .tensor([[[1.0, 1., 0.5, 0.2], [0.2, 0.2, 0.5, 1.2]]])
+            .to_dtype::<TestDtype>();
         let r = x.leaky_trace().max_pool2d::<2, 1, 0>();
         assert_close_to_literal!(r, [[[1., 1., 1.2]]]);
         let g = r.sum().backward();
@@ -211,7 +213,9 @@
     #[test]
     fn test_pool2d_3d_min2d_eq_grads() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([[[1., 1., 0.5, 0.2], [0.2, 0.2, 0.5, 1.2]]]);
+        let x = dev
+            .tensor([[[1., 1., 0.5, 0.2], [0.2, 0.2, 0.5, 1.2]]])
+            .to_dtype::<TestDtype>();
         let r = x.leaky_trace().min_pool2d::<2, 1, 0>();
         assert_close_to_literal!(r, [[[0.2, 0.2, 0.2]]]);
         let g = r.sum().backward();
diff --git a/src/tensor_ops/pool2d/pool2d.cu b/src/tensor_ops/pool2d/pool2d.cu
index 8550c2b6f..a39d86d11 100644
--- a/src/tensor_ops/pool2d/pool2d.cu
+++ b/src/tensor_ops/pool2d/pool2d.cu
@@ -53,7 +53,9 @@ __device__ void avg_pool2d_fwd(
         }
     }
 
-    tmp /= static_cast<T>(op.kernel * op.kernel);
+    double num_f64 = op.kernel * op.kernel;
+    T num = num_f64;
+    tmp /= num;
     out[i] = tmp;
 }
 
@@ -105,7 +107,9 @@ __device__ void avg_pool2d_bwd(
         }
     }
 
-    grad_inp[i] += tmp / static_cast<T>(op.kernel * op.kernel);
+    double num_f64 = op.kernel * op.kernel;
+    T num = num_f64;
+    grad_inp[i] += tmp / num;
 }
 
 template<typename T>
@@ -330,6 +334,22 @@ extern "C" __global__ void bwd( \
     bwd_FN(op, inp_strides, out_strides, inp, grad_inp, out, grad_out); \
 }
 
+POOL_OP(
+    __half,
+    avg_pool2d_fwd_f16, avg_pool2d_bwd_f16,
+    avg_pool2d_fwd, avg_pool2d_bwd
+);
+POOL_OP(
+    __half,
+    min_pool2d_fwd_f16, min_pool2d_bwd_f16,
+    min_pool2d_fwd, min_pool2d_bwd
+);
+POOL_OP(
+    __half,
+    max_pool2d_fwd_f16, max_pool2d_bwd_f16,
+    max_pool2d_fwd, max_pool2d_bwd
+);
+
 POOL_OP(
     float,
     avg_pool2d_fwd_f32, avg_pool2d_bwd_f32,
diff --git a/src/tensor_ops/pow/cuda_kernel.rs b/src/tensor_ops/pow/cuda_kernel.rs
index 918d90f79..792a3bf04 100644
--- a/src/tensor_ops/pow/cuda_kernel.rs
+++ b/src/tensor_ops/pow/cuda_kernel.rs
@@ -6,11 +6,21 @@ use crate::{
 };
 use std::borrow::Cow;
 
+#[cfg(feature = "f16")]
+unsafe impl cudarc::driver::DeviceRepr for super::PowfKernelOp<half::f16> {}
 unsafe impl cudarc::driver::DeviceRepr for super::PowfKernelOp<f32> {}
 unsafe impl cudarc::driver::DeviceRepr for super::PowfKernelOp<f64> {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/pow.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(
+    PowfKernelOp<half::f16>,
+    half::f16,
+    PTX,
+    "pow_fwd_f16",
+    "pow_bwd_f16"
+);
 cuda_unary!(PowfKernelOp<f32>, f32, PTX, "pow_fwd_f32", "pow_bwd_f32");
 cuda_unary!(PowfKernelOp<f64>, f64, PTX, "pow_fwd_f64", "pow_bwd_f64");
diff --git a/src/tensor_ops/pow/mod.rs b/src/tensor_ops/pow/mod.rs
index 6cdc24580..ecef22110 100644
--- a/src/tensor_ops/pow/mod.rs
+++ b/src/tensor_ops/pow/mod.rs
@@ -23,19 +23,19 @@ pub struct PowfKernelOp<E>(E);
 /// ```
 pub fn powf<S: Shape, E: Dtype, D: UnaryKernel<PowfKernelOp<E>, E>, T: Tape<E, D>>(
     t: Tensor<S, E, D, T>,
-    exponent: impl Into<E>,
+    exponent: impl Into<f64>,
 ) -> Tensor<S, E, D, T> {
     t.powf(exponent)
 }
 
 impl<S: Shape, E: Dtype, D: UnaryKernel<PowfKernelOp<E>, E>, T: Tape<E, D>> Tensor<S, E, D, T> {
     /// See [powf]
-    pub fn powf(self, exponent: impl Into<E>) -> Self {
+    pub fn powf(self, exponent: impl Into<f64>) -> Self {
         self.try_powf(exponent).unwrap()
     }
     /// See [powf]
-    pub fn try_powf(self, exponent: impl Into<E>) -> Result<Self, D::Err> {
-        let exponent = exponent.into();
+    pub fn try_powf(self, exponent: impl Into<f64>) -> Result<Self, D::Err> {
+        let exponent = E::from_f64(exponent.into()).unwrap();
         try_unary_op(PowfKernelOp(exponent), self)
     }
 }
@@ -72,49 +72,55 @@ mod tests {
     #[test]
     fn test_powf_positive() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let t = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().powf(3.5);
         let r_array = r.array();
         assert!(r_array[0].is_nan());
         assert!(r_array[1].is_nan());
-        assert_close!(r_array[2], 0.0);
-        assert_close!(r_array[3], 1.0);
-        assert_close!(r_array[4], 11.313708);
+        assert_close!(r_array[2], NumCast::from(0.0).unwrap());
+        assert_close!(r_array[3], NumCast::from(1.0).unwrap());
+        assert_close!(r_array[4], NumCast::from(11.313708).unwrap());
 
         let g = r.sum().backward();
         let grad = g.get(&t).array();
         assert!(grad[0].is_nan());
         assert!(grad[1].is_nan());
-        assert_close!(grad[2], 0.0);
-        assert_close!(grad[3], 3.5);
-        assert_close!(grad[4], 19.79899);
+        assert_close!(grad[2], NumCast::from(0.0).unwrap());
+        assert_close!(grad[3], NumCast::from(3.5).unwrap());
+        assert_close!(grad[4], NumCast::from(19.79899).unwrap());
     }
 
     #[test]
     fn test_powf_negative() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let t = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().powf(-1.2);
         let r_array = r.array();
         assert!(r_array[0].is_nan());
         assert!(r_array[1].is_nan());
         assert_close!(r_array[2], TestDtype::INFINITY);
-        assert_close!(r_array[3], 1.0);
-        assert_close!(r_array[4], 0.43527526);
+        assert_close!(r_array[3], NumCast::from(1.0).unwrap());
+        assert_close!(r_array[4], NumCast::from(0.43527526).unwrap());
 
         let g = r.sum().backward();
         let grad = g.get(&t).array();
         assert!(grad[0].is_nan());
         assert!(grad[1].is_nan());
         assert_close!(grad[2], TestDtype::NEG_INFINITY);
-        assert_close!(grad[3], -1.2);
-        assert_close!(grad[4], -0.26116517);
+        assert_close!(grad[3], NumCast::from(-1.2).unwrap());
+        assert_close!(grad[4], NumCast::from(-0.26116517).unwrap());
     }
 
     #[test]
     fn test_powi_positive() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let t = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().powi(3);
         assert_close_to_literal!(r, [-8., -1., 0., 1., 8.]);
         let g = r.sum().backward();
@@ -124,7 +130,9 @@
     #[test]
     fn test_powi_negative() {
         let dev: TestDevice = Default::default();
-        let t: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let t = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = t.leaky_trace().powi(-3);
         assert_close_to_literal!(r, [-0.125, -1.0, f64::INFINITY, 1.0, 0.125]);
         let g = r.sum().backward();
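NOTE: the powf tests above can no longer compare against plain literals, since under test-f16 the array elements are half::f16; NumCast::from rounds the literal into TestDtype first. The assertion pattern, reduced to a standalone sketch:

    use num_traits::NumCast;

    /// Compare a dtype-generic value to an f64 literal by first rounding the
    /// literal into E (what the tests' NumCast::from(...).unwrap() does).
    fn close_to<E: NumCast + Copy + Into<f64>>(got: E, want: f64, tol: f64) -> bool {
        let want_e: E = NumCast::from(want).unwrap();
        (Into::<f64>::into(got) - Into::<f64>::into(want_e)).abs() <= tol
    }

    fn main() {
        let x = half::f16::from_f32(11.3125); // nearest f16 to 11.313708
        assert!(close_to(x, 11.313708, 1e-2));
    }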
one); +} + +UNARY_OP(__half, pow_fwd_f16, pow_bwd_f16, PowFKernelOp<__half>, + powg(x, op.rhs), + pow_bwd(op, x)) + UNARY_OP(float, pow_fwd_f32, pow_bwd_f32, PowFKernelOp, - powf(x, op.rhs), - op.rhs * powf(x, op.rhs - 1.0)) + powg(x, op.rhs), + pow_bwd(op, x)) UNARY_OP(double, pow_fwd_f64, pow_bwd_f64, PowFKernelOp, - pow(x, op.rhs), - op.rhs * pow(x, op.rhs - 1.0)) - \ No newline at end of file + powg(x, op.rhs), + pow_bwd(op, x)) diff --git a/src/tensor_ops/prelu.rs b/src/tensor_ops/prelu.rs index 1b0e4f891..485b9766b 100644 --- a/src/tensor_ops/prelu.rs +++ b/src/tensor_ops/prelu.rs @@ -91,8 +91,12 @@ mod tests { #[test] fn test_prelu() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); - let y: Tensor<_, TestDtype, _> = dev.tensor([0.05, 0.05, 0.05, 0.05, 0.05]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); + let y = dev + .tensor([0.05, 0.05, 0.05, 0.05, 0.05]) + .to_dtype::(); let r = x.leaky_trace().prelu(y.clone()); assert_close_to_literal!(r, [-0.1, -0.05, 0.0, 1.0, 2.0]); // NOTE: call .exp() to make sure we cover cases where .prelu() uses the result's gradient diff --git a/src/tensor_ops/recip/cuda_kernel.rs b/src/tensor_ops/recip/cuda_kernel.rs index 66b539b2f..145fc0eae 100644 --- a/src/tensor_ops/recip/cuda_kernel.rs +++ b/src/tensor_ops/recip/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for RecipKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/recip.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(df(f(x)) RecipKernelOp, half::f16, PTX, "recip_fwd_f16", "recip_bwd_f16"); cuda_unary!(df(f(x)) RecipKernelOp, f32, PTX, "recip_fwd_f32", "recip_bwd_f32"); cuda_unary!(df(f(x)) RecipKernelOp, f64, PTX, "recip_fwd_f64", "recip_bwd_f64"); diff --git a/src/tensor_ops/recip/mod.rs b/src/tensor_ops/recip/mod.rs index fa738b3a5..78eb28792 100644 --- a/src/tensor_ops/recip/mod.rs +++ b/src/tensor_ops/recip/mod.rs @@ -43,7 +43,9 @@ mod tests { #[test] fn test_recip() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = x.leaky_trace().recip(); assert_close_to_literal!(r, [-0.5, -1.0, f64::INFINITY, 1.0, 0.5]); let g = r.mean().backward(); diff --git a/src/tensor_ops/recip/recip.cu b/src/tensor_ops/recip/recip.cu index c40fa7871..3f1b3bb99 100644 --- a/src/tensor_ops/recip/recip.cu +++ b/src/tensor_ops/recip/recip.cu @@ -2,14 +2,20 @@ struct RecipKernelOp {}; +UNARY_OP( + __half, recip_fwd_f16, recip_bwd_f16, RecipKernelOp, + recipg(x), + -y * y +) + UNARY_OP( float, recip_fwd_f32, recip_bwd_f32, RecipKernelOp, - 1 / x, + recipg(x), -y * y ) UNARY_OP( double, recip_fwd_f64, recip_bwd_f64, RecipKernelOp, - 1 / x, + recipg(x), -y * y ) diff --git a/src/tensor_ops/relu/cuda_kernel.rs b/src/tensor_ops/relu/cuda_kernel.rs index f4d00f633..13a6fc80c 100644 --- a/src/tensor_ops/relu/cuda_kernel.rs +++ b/src/tensor_ops/relu/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for ReLUKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/relu.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(ReLUKernelOp, half::f16, PTX, "relu_fwd_f16", "relu_bwd_f16"); cuda_unary!(ReLUKernelOp, f32, PTX, "relu_fwd_f32", "relu_bwd_f32"); cuda_unary!(ReLUKernelOp, f64, PTX, "relu_fwd_f64", "relu_bwd_f64"); diff --git a/src/tensor_ops/relu/mod.rs b/src/tensor_ops/relu/mod.rs index cc7952a39..c1ac134db 100644 --- 
diff --git a/src/tensor_ops/relu/cuda_kernel.rs b/src/tensor_ops/relu/cuda_kernel.rs
index f4d00f633..13a6fc80c 100644
--- a/src/tensor_ops/relu/cuda_kernel.rs
+++ b/src/tensor_ops/relu/cuda_kernel.rs
@@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for ReLUKernelOp {}
 
 const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/relu.ptx"));
 
+#[cfg(feature = "f16")]
+cuda_unary!(ReLUKernelOp, half::f16, PTX, "relu_fwd_f16", "relu_bwd_f16");
 cuda_unary!(ReLUKernelOp, f32, PTX, "relu_fwd_f32", "relu_bwd_f32");
 cuda_unary!(ReLUKernelOp, f64, PTX, "relu_fwd_f64", "relu_bwd_f64");
diff --git a/src/tensor_ops/relu/mod.rs b/src/tensor_ops/relu/mod.rs
index cc7952a39..c1ac134db 100644
--- a/src/tensor_ops/relu/mod.rs
+++ b/src/tensor_ops/relu/mod.rs
@@ -46,7 +46,9 @@ mod tests {
     #[test]
     fn test_relu() {
         let dev: TestDevice = Default::default();
-        let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]);
+        let x = dev
+            .tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
+            .to_dtype::<TestDtype>();
         let r = x.leaky_trace().relu();
         assert_close_to_literal!(r, [0.0, 0.0, 0.0, 1.0, 2.0]);
         // NOTE: call .exp() to make sure we cover cases where .relu() uses the result's gradient
diff --git a/src/tensor_ops/relu/relu.cu b/src/tensor_ops/relu/relu.cu
index 079eeda3d..33dcbd20e 100644
--- a/src/tensor_ops/relu/relu.cu
+++ b/src/tensor_ops/relu/relu.cu
@@ -2,11 +2,27 @@
 struct ReLUKernelOp {};
 
+template<typename T>
+__device__ __forceinline__ T relu_fwd(T x) {
+    T zero = 0.0;
+    return maxg(x, zero);
+}
+
+template<typename T>
+__device__ __forceinline__ T relu_bwd(T x) {
+    T zero = 0.0;
+    T one = 1.0;
+    return x > zero ? one : zero;
+}
+
+UNARY_OP(__half, relu_fwd_f16, relu_bwd_f16, ReLUKernelOp,
+        relu_fwd(x),
+        relu_bwd(x))
+
 UNARY_OP(float, relu_fwd_f32, relu_bwd_f32, ReLUKernelOp,
-        fmaxf(x, 0.0),
-        x > 0.0 ? 1.0 : 0.0)
+        relu_fwd(x),
+        relu_bwd(x))
 
 UNARY_OP(double, relu_fwd_f64, relu_bwd_f64, ReLUKernelOp,
-        fmax(x, 0.0),
-        x > 0.0 ? 1.0 : 0.0)
-    
\ No newline at end of file
+        relu_fwd(x),
+        relu_bwd(x))
diff --git a/src/tensor_ops/reshape_to/cuda_kernel.rs b/src/tensor_ops/reshape_to/cuda_kernel.rs
index 53041bf2a..082234f39 100644
--- a/src/tensor_ops/reshape_to/cuda_kernel.rs
+++ b/src/tensor_ops/reshape_to/cuda_kernel.rs
@@ -21,6 +21,10 @@ impl super::ReshapeKernel for Cuda {
             let src = FWD_KERNEL.replace("$T", E::NAME);
             let opts = CompileOptions {
                 arch: Some(env!("CUDA_COMPUTE_CAP")),
+                include_paths: vec![
+                    env!("CUDA_INCLUDE_DIR").to_string(),
+                    env!("OUT_DIR").to_string(),
+                ],
                 ..Default::default()
             };
             let ptx = compile_ptx_with_opts(src, opts).unwrap();
@@ -64,6 +68,10 @@
             let src = BWD_KERNEL.replace("$T", E::NAME);
             let opts = CompileOptions {
                 arch: Some(env!("CUDA_COMPUTE_CAP")),
+                include_paths: vec![
+                    env!("CUDA_INCLUDE_DIR").to_string(),
+                    env!("OUT_DIR").to_string(),
+                ],
                 ..Default::default()
             };
             let ptx = compile_ptx_with_opts(src, opts).unwrap();
@@ -101,20 +109,7 @@ typedef long int intptr_t;
 typedef int intptr_t;
 #endif
 
-__device__ unsigned int get_strided_index(
-    unsigned int idx,
-    const size_t num_dims,
-    const size_t *dims,
-    const size_t *strides
-) {
-    unsigned int strided_i = 0;
-    for (unsigned int d = 0; d < num_dims; d++) {
-        unsigned int dim_idx = num_dims - 1 - d;
-        strided_i += (idx % dims[dim_idx]) * strides[dim_idx];
-        idx /= dims[dim_idx];
-    }
-    return strided_i;
-}
+#include \"cuda_utils.cuh\"
 
 extern \"C\" __global__ void reshape_fwd(
     const size_t numel,
@@ -148,20 +143,7 @@ typedef long int intptr_t;
 typedef int intptr_t;
 #endif
 
-__device__ unsigned int get_strided_index(
-    unsigned int idx,
-    const size_t num_dims,
-    const size_t *dims,
-    const size_t *strides
-) {
-    unsigned int strided_i = 0;
-    for (unsigned int d = 0; d < num_dims; d++) {
-        unsigned int dim_idx = num_dims - 1 - d;
-        strided_i += (idx % dims[dim_idx]) * strides[dim_idx];
-        idx /= dims[dim_idx];
-    }
-    return strided_i;
-}
+#include \"cuda_utils.cuh\"
 
 extern \"C\" __global__ void reshape_bwd(
     const size_t numel,
TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]); + let a = dev + .tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) + .to_dtype::(); let b = a.leaky_trace().reshape::>(); assert_close_to_literal!(b, [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]); let g = b.exp().mean().backward(); @@ -192,7 +194,9 @@ mod tests { #[test] fn test_1d_reshape_non_contiguous() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]); + let a = dev + .tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) + .to_dtype::(); let b = a .leaky_trace() .permute::, _>() @@ -211,7 +215,8 @@ mod tests { #[test] fn test_reshape_broadcasted() { let dev: TestDevice = Default::default(); - let a: Tensor, TestDtype, _> = dev.tensor([1., 2., 3.]).broadcast(); + let a: Tensor, TestDtype, _> = + dev.tensor([1., 2., 3.]).to_dtype::().broadcast(); let b: Tensor, TestDtype, _> = a.clone().reshape(); #[cfg(feature = "cuda")] @@ -219,19 +224,21 @@ mod tests { assert_eq!(b.data.len(), 6); assert_eq!(a.as_vec(), b.as_vec()); - assert_eq!(b.array(), [[1., 2.], [3., 1.], [2., 3.]]); + assert_close_to_literal!(b, [[1., 2.], [3., 1.], [2., 3.]]); } #[test] fn test_contiguous() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]); + let a = dev + .tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) + .to_dtype::(); let b1 = a.clone().contiguous(); assert_eq!(a.strides, b1.strides); - let b2: Tensor<_, TestDtype, _> = a.permute::, _>().contiguous(); + let b2 = a.permute::, _>().contiguous(); assert_eq!(b2.strides, [2, 1]); } } diff --git a/src/tensor_ops/roll/cuda_kernel.rs b/src/tensor_ops/roll/cuda_kernel.rs index 92667463a..2592fa372 100644 --- a/src/tensor_ops/roll/cuda_kernel.rs +++ b/src/tensor_ops/roll/cuda_kernel.rs @@ -12,6 +12,10 @@ const PTX_SRC: &str = include_str!(concat!(env!("OUT_DIR"), "/roll.ptx")); trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const FNS: &'static [&'static str] = &["roll_fwd_f16", "roll_bwd_f16"]; +} impl HasCudaKernel for Cuda { const FNS: &'static [&'static str] = &["roll_fwd_f32", "roll_bwd_f32"]; } diff --git a/src/tensor_ops/roll/mod.rs b/src/tensor_ops/roll/mod.rs index 59ff4d5a1..67dd3bff5 100644 --- a/src/tensor_ops/roll/mod.rs +++ b/src/tensor_ops/roll/mod.rs @@ -93,7 +93,9 @@ mod tests { #[test] fn test_roll_3d_axis_2() { let dev: TestDevice = Default::default(); - let t: Tensor, TestDtype, _> = dev.tensor([-0.3, -0.15, 0.0, 0.15, 0.2]); + let t = dev + .tensor([-0.3, -0.15, 0.0, 0.15, 0.2]) + .to_dtype::(); let y = t .leaky_trace() .broadcast::, _>() @@ -109,7 +111,9 @@ mod tests { #[test] fn test_roll_3d_first_two_axes() { let dev: TestDevice = Default::default(); - let t: Tensor, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0, 4.0, 5.0]); + let t = dev + .tensor([1.0, 2.0, 3.0, 4.0, 5.0]) + .to_dtype::(); let y0 = t .leaky_trace() .broadcast::, _>() diff --git a/src/tensor_ops/roll/roll.cu b/src/tensor_ops/roll/roll.cu index af810ed75..375e73b32 100644 --- a/src/tensor_ops/roll/roll.cu +++ b/src/tensor_ops/roll/roll.cu @@ -105,5 +105,6 @@ extern "C" __global__ void BWD( \ const TY *grad_out \ ) { roll_bwd(op, num_dims, numel, dims, inp_strides, out_strides, grad_inp, grad_out); } +ROLL(__half, roll_fwd_f16, roll_bwd_f16); ROLL(float, roll_fwd_f32, roll_bwd_f32); ROLL(double, roll_fwd_f64, roll_bwd_f64); diff --git a/src/tensor_ops/select_and_gather/cuda_kernel.rs 
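The roll, select, and gather additions all follow the same registration scheme: a C macro stamps out extern "C" kernels once per dtype with _f16/_f32/_f64 name suffixes, and the Rust HasCudaKernel impls refer to those symbols by name. A simplified, hypothetical miniature of the pattern (modeled on the fill_with_* kernels, not the actual ROLL macro):

#include <cstddef>
#include "cuda_fp16.h"

// Hypothetical miniature of the per-dtype macro pattern used by roll.cu,
// select.cu, gather.cu, etc.: one instantiation per dtype, suffixed symbols.
#define FILL_OP(TYPENAME, FWD) \
extern "C" __global__ void FWD(TYPENAME *buf, TYPENAME value, const size_t numel) { \
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
    if (i < numel) { buf[i] = value; } \
}

FILL_OP(__half, demo_fill_f16);
FILL_OP(float, demo_fill_f32);
FILL_OP(double, demo_fill_f64);

Adding a dtype then means one extra macro line on the CUDA side and one extra HasCudaKernel impl naming the new symbols on the Rust side, which is exactly the shape of the f16 hunks in this patch.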
b/src/tensor_ops/select_and_gather/cuda_kernel.rs index 5f1997644..0243a5ef9 100644 --- a/src/tensor_ops/select_and_gather/cuda_kernel.rs +++ b/src/tensor_ops/select_and_gather/cuda_kernel.rs @@ -187,6 +187,17 @@ macro_rules! impl_cuda_kernels { }; } +#[cfg(feature = "f16")] +impl_cuda_kernels!( + half::f16, + "gather_f16", + "gather_fwd_f16", + "gather_bwd_f16", + "select_f16", + "select_fwd_f16", + "select_bwd_f16" +); + impl_cuda_kernels!( f32, "gather_f32", diff --git a/src/tensor_ops/select_and_gather/gather.cu b/src/tensor_ops/select_and_gather/gather.cu index 1e9b369d3..747c033a4 100644 --- a/src/tensor_ops/select_and_gather/gather.cu +++ b/src/tensor_ops/select_and_gather/gather.cu @@ -127,5 +127,6 @@ extern "C" __global__ void BWD( \ gather_bwd(numel, grad_inp, inp_num_dims, inp_dims, inp_strides, idx, idx_num_dims, idx_dims, idx_strides, grad_out, out_num_dims); \ } +GATHER(__half, gather_fwd_f16, gather_bwd_f16); GATHER(float, gather_fwd_f32, gather_bwd_f32); GATHER(double, gather_fwd_f64, gather_bwd_f64); diff --git a/src/tensor_ops/select_and_gather/mod.rs b/src/tensor_ops/select_and_gather/mod.rs index 104a47ccb..fe7d0b41b 100644 --- a/src/tensor_ops/select_and_gather/mod.rs +++ b/src/tensor_ops/select_and_gather/mod.rs @@ -194,8 +194,7 @@ impl, T: Tape> GatherTo #[cfg(test)] mod tests { use super::*; - use crate::tensor_ops::*; - use crate::tests::*; + use crate::{tensor_ops::*, tests::*}; #[test] #[should_panic = "dimension 0 not the same"] @@ -269,7 +268,9 @@ mod tests { let t_array = t.array(); assert_eq!(r.array(), t_array[0]); let g = r.exp().backward(); - assert_eq!(g.get(&t).array(), [t_array[0].exp(), 0.0, 0.0, 0.0, 0.0]); + let mut expected = [TestDtype::zero(); 5]; + expected[0] = t_array[0].exp(); + assert_eq!(g.get(&t).array(), expected); } #[test] @@ -284,10 +285,10 @@ mod tests { g.get(&t).array(), [ t_array[0].exp(), - 2.0 * (t_array[1]).exp(), - 0.0, + t_array[1].exp() + t_array[1].exp(), + TestDtype::zero(), t_array[3].exp(), - 0.0 + TestDtype::zero() ] ); } @@ -323,7 +324,9 @@ mod tests { #[test] fn test_select_2d_axis_0() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-1.0, -2.0, -3.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0], [-1.0, -2.0, -3.0]]) + .to_dtype::(); let r = t.leaky_trace().select(dev.tensor(0)); assert_close_to_literal!(r, [1.0, 2.0, 3.0]); let g = r.mean().backward(); @@ -333,7 +336,9 @@ mod tests { #[test] fn test_select_2d_axis_1() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-1.0, -2.0, -3.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0], [-1.0, -2.0, -3.0]]) + .to_dtype::(); let r = t.leaky_trace().select(dev.tensor([1, 1])); assert_close_to_literal!(r, [2.0, -2.0]); let g = r.mean().backward(); @@ -343,7 +348,7 @@ mod tests { #[test] fn test_select_2d_broadcasted() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); + let t = dev.tensor([1.0, 2.0, 3.0]).to_dtype::(); let r = t .leaky_trace() .broadcast::, _>() @@ -356,7 +361,7 @@ mod tests { #[test] fn test_gather_2d_broadcasted() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); + let t = dev.tensor([1.0, 2.0, 3.0]).to_dtype::(); let idx: Tensor, usize, _> = dev.tensor([[0, 1], [1, 2]]); let r: Tensor, _, _, _> = t.leaky_trace().broadcast::, _>().gather(idx); @@ -391,12 +396,10 @@ mod tests { let g = r.exp().mean().backward(); let sub_g = 
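Worth calling out for the gather_f16 registration above: gather's backward is a scatter-add, where several output gradients may target the same input element, which is why this PR also ships an atomicAdd(__half*, __half) fallback in compatibility.cuh further down. A stripped-down sketch of the pattern, not the actual gather_bwd:

#include <cstddef>
#include "cuda_fp16.h"

// Many threads may share the same idx[i], so the accumulate must be atomic;
// for __half on pre-sm_70 devices this resolves to the CAS shim below.
template<typename T>
__global__ void scatter_add(T *grad_inp, const size_t *idx, const T *grad_out, size_t numel) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < numel) {
        atomicAdd(grad_inp + idx[i], grad_out[i]);
    }
}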
dev.tensor(sub_t).exp() / 8.0; let sub_g = sub_g.array(); + let z = TestDtype::zero(); assert_close!( g.get(&t).array(), - [ - [[0.0; 4], sub_g[0], [0.0; 4]], - [[0.0; 4], [0.0; 4], sub_g[1]], - ] + [[[z; 4], sub_g[0], [z; 4]], [[z; 4], [z; 4], sub_g[1]],] ); } @@ -414,18 +417,19 @@ let g = r.exp().mean().backward(); let sub_g = dev.tensor(sub_t).exp() / 6.0; let sub_g = sub_g.array(); + let z = TestDtype::zero(); assert_close!( g.get(&t).array(), [ [ - [0.0, 0.0, sub_g[0][0], 0.0], - [0.0, 0.0, 0.0, sub_g[0][1]], - [0.0, 0.0, sub_g[0][2], 0.0], + [z, z, sub_g[0][0], z], + [z, z, z, sub_g[0][1]], + [z, z, sub_g[0][2], z], ], [ - [0.0, sub_g[1][0], 0.0, 0.0], - [0.0, sub_g[1][1], 0.0, 0.0], - [sub_g[1][2], 0.0, 0.0, 0.0], + [z, sub_g[1][0], z, z], + [z, sub_g[1][1], z, z], + [sub_g[1][2], z, z, z], ], ] ); diff --git a/src/tensor_ops/select_and_gather/select.cu b/src/tensor_ops/select_and_gather/select.cu index 3f790e88a..21242c0d6 100644 --- a/src/tensor_ops/select_and_gather/select.cu +++ b/src/tensor_ops/select_and_gather/select.cu @@ -117,5 +117,6 @@ extern "C" __global__ void BWD( \ select_bwd(numel, grad_inp, inp_num_dims, inp_dims, inp_strides, idx, idx_num_dims, idx_dims, idx_strides, grad_out, out_dims, out_strides); \ } +SELECT(__half, select_fwd_f16, select_bwd_f16); SELECT(float, select_fwd_f32, select_bwd_f32); SELECT(double, select_fwd_f64, select_bwd_f64) diff --git a/src/tensor_ops/sigmoid/cuda_kernel.rs b/src/tensor_ops/sigmoid/cuda_kernel.rs index b379f47df..6d3b55110 100644 --- a/src/tensor_ops/sigmoid/cuda_kernel.rs +++ b/src/tensor_ops/sigmoid/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for Sigmoid {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/sigmoid.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(df(f(x)) Sigmoid, half::f16, PTX, "sigmoid_fwd_f16", "sigmoid_bwd_f16"); cuda_unary!(df(f(x)) Sigmoid, f32, PTX, "sigmoid_fwd_f32", "sigmoid_bwd_f32"); cuda_unary!(df(f(x)) Sigmoid, f64, PTX, "sigmoid_fwd_f64", "sigmoid_bwd_f64"); diff --git a/src/tensor_ops/sigmoid/mod.rs b/src/tensor_ops/sigmoid/mod.rs index d789c8afe..6e3a0661c 100644 --- a/src/tensor_ops/sigmoid/mod.rs +++ b/src/tensor_ops/sigmoid/mod.rs @@ -45,7 +45,9 @@ mod tests { #[test] fn test_sigmoid() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::<TestDtype>(); let r = x.leaky_trace().sigmoid(); assert_close_to_literal!(r, [0.11920292, 0.26894143, 0.5, 0.7310586, 0.880797]); let g = r.mean().backward(); diff --git a/src/tensor_ops/sigmoid/sigmoid.cu b/src/tensor_ops/sigmoid/sigmoid.cu index d867b7773..0f9a8882b 100644 --- a/src/tensor_ops/sigmoid/sigmoid.cu +++ b/src/tensor_ops/sigmoid/sigmoid.cu @@ -1,15 +1,28 @@ #include "unary_op_macros.cuh" -#define SIGMOID_f32(X) (1.0 / (1.0 + expf(-X))) -#define SIGMOID_f64(X) (1.0 / (1.0 + exp(-X))) - struct SigmoidKernelOp {}; +template<typename T> +__device__ __forceinline__ T sigmoid_fwd(T x) { + T one = 1.0; + return one / (one + expg(-x)); +} + +template<typename T> +__device__ __forceinline__ T sigmoid_bwd(T y) { + T one = 1.0; + return y * (one - y); +} + +UNARY_OP(__half, sigmoid_fwd_f16, sigmoid_bwd_f16, SigmoidKernelOp, + sigmoid_fwd(x), + sigmoid_bwd(y)) + UNARY_OP(float, sigmoid_fwd_f32, sigmoid_bwd_f32, SigmoidKernelOp, - SIGMOID_f32(x), - y * (1.0 - y)) + sigmoid_fwd(x), + sigmoid_bwd(y)) UNARY_OP(double, sigmoid_fwd_f64, sigmoid_bwd_f64, SigmoidKernelOp, - SIGMOID_f64(x), - y * (1.0 - y)) + sigmoid_fwd(x), 
+ sigmoid_bwd(y)) \ No newline at end of file diff --git a/src/tensor_ops/sin/cuda_kernel.rs b/src/tensor_ops/sin/cuda_kernel.rs index 97af74db2..9fd33010c 100644 --- a/src/tensor_ops/sin/cuda_kernel.rs +++ b/src/tensor_ops/sin/cuda_kernel.rs @@ -4,5 +4,13 @@ unsafe impl cudarc::driver::DeviceRepr for super::SinKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/sin.ptx")); +#[cfg(feature = "f16")] +cuda_unary!( + super::SinKernelOp, + half::f16, + PTX, + "sin_fwd_f16", + "sin_bwd_f16" +); cuda_unary!(super::SinKernelOp, f32, PTX, "sin_fwd_f32", "sin_bwd_f32"); cuda_unary!(super::SinKernelOp, f64, PTX, "sin_fwd_f64", "sin_bwd_f64"); diff --git a/src/tensor_ops/sin/mod.rs b/src/tensor_ops/sin/mod.rs index 2fa7d9334..5a3fe2f05 100644 --- a/src/tensor_ops/sin/mod.rs +++ b/src/tensor_ops/sin/mod.rs @@ -46,7 +46,9 @@ mod tests { #[test] fn test_sin() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = x.leaky_trace().sin(); assert_close_to_literal!(r, [-0.9092974, -0.84147096, 0.0, 0.84147096, 0.9092974]); let g = r.mean().backward(); diff --git a/src/tensor_ops/sin/sin.cu b/src/tensor_ops/sin/sin.cu index 1110c6884..168fc85b8 100644 --- a/src/tensor_ops/sin/sin.cu +++ b/src/tensor_ops/sin/sin.cu @@ -2,11 +2,15 @@ struct SinKernelOp {}; +UNARY_OP(__half, sin_fwd_f16, sin_bwd_f16, SinKernelOp, + sing(x), + cosg(x)) + UNARY_OP(float, sin_fwd_f32, sin_bwd_f32, SinKernelOp, - sinf(x), - cosf(x)) + sing(x), + cosg(x)) UNARY_OP(double, sin_fwd_f64, sin_bwd_f64, SinKernelOp, - sin(x), - cos(x)) + sing(x), + cosg(x)) \ No newline at end of file diff --git a/src/tensor_ops/slice/cuda_kernel.rs b/src/tensor_ops/slice/cuda_kernel.rs index 88db9284f..58f9a681e 100644 --- a/src/tensor_ops/slice/cuda_kernel.rs +++ b/src/tensor_ops/slice/cuda_kernel.rs @@ -25,6 +25,12 @@ macro_rules! 
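The sigmoid rewrite above leans on the identity sigmoid'(x) = y * (1 - y) with y = sigmoid(x), which is why the backward macro argument is sigmoid_bwd(y): the kernel reuses the forward output instead of recomputing the exponential. A quick host-side check of the identity against a finite difference:

#include <cmath>
#include <cstdio>

int main() {
    double x = 0.7, h = 1e-6;
    double y = 1.0 / (1.0 + std::exp(-x));
    double yh = 1.0 / (1.0 + std::exp(-(x + h)));
    // Analytic y * (1 - y) should agree with the finite difference to ~1e-6.
    std::printf("analytic %.8f vs numeric %.8f\n", y * (1.0 - y), (yh - y) / h);
}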
has_kernels { has_kernels!(u8, u16, u32, u64, usize, i8, i16, i32, i64, isize, bool); +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "slice_f16"; + const FNS: &'static [&'static str] = &["slice_fwd_f16", "slice_bwd_f16"]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "slice_f32"; const FNS: &'static [&'static str] = &["slice_fwd_f32", "slice_bwd_f32"]; diff --git a/src/tensor_ops/slice/mod.rs b/src/tensor_ops/slice/mod.rs index 09adf7190..b5b172a2c 100644 --- a/src/tensor_ops/slice/mod.rs +++ b/src/tensor_ops/slice/mod.rs @@ -89,12 +89,14 @@ mod tests { #[test] fn test_slice() { let dev = TestDevice::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([ - [1., 2., 3., 4.], - [5., 6., 7., 8.], - [9., 10., 11., 12.], - [13., 14., 15., 16.], - ]); + let a = dev + .tensor([ + [1., 2., 3., 4.], + [5., 6., 7., 8.], + [9., 10., 11., 12.], + [13., 14., 15., 16.], + ]) + .to_dtype::(); let b: Tensor, _, _> = a.clone().slice((2.., 2..)).realize().unwrap(); assert_close_to_literal!(b, [[11., 12.], [15., 16.]]); @@ -124,7 +126,10 @@ mod tests { #[test] fn test_slice_broadcast_top() { let dev = TestDevice::default(); - let a: Tensor, TestDtype, _> = dev.tensor([1., 2., 3., 4.]).broadcast(); + let a = dev + .tensor([1., 2., 3., 4.]) + .to_dtype::() + .broadcast::, _>(); let b: Tensor, _, _> = a.clone().slice((..3, ..)).realize().unwrap(); assert_close_to_literal!(b, [[1., 2., 3., 4.]; 3]); @@ -142,7 +147,10 @@ mod tests { #[test] fn test_slice_broadcast_bottom() { let dev = TestDevice::default(); - let a: Tensor, TestDtype, _> = dev.tensor([1., 2., 3., 4.]).broadcast(); + let a: Tensor, TestDtype, _> = dev + .tensor([1., 2., 3., 4.]) + .to_dtype::() + .broadcast(); let b: Tensor, _, _> = a.clone().slice((1..3, ..)).realize().unwrap(); assert_close_to_literal!(b, [[2.; 5], [3.; 5]]); @@ -160,12 +168,14 @@ mod tests { #[test] fn test_slice_backward() { let dev = TestDevice::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([ - [1., 2., 3., 4.], - [5., 6., 7., 8.], - [9., 10., 11., 12.], - [13., 14., 15., 16.], - ]); + let a = dev + .tensor([ + [1., 2., 3., 4.], + [5., 6., 7., 8.], + [9., 10., 11., 12.], + [13., 14., 15., 16.], + ]) + .to_dtype::(); let b: Tensor, _, _, _> = a.leaky_trace().slice((2.., 2..)).realize().unwrap(); assert_close_to_literal!(b, [[11., 12.], [15., 16.]]); diff --git a/src/tensor_ops/slice/slice.cu b/src/tensor_ops/slice/slice.cu index 5a520cc25..cb4f8325a 100644 --- a/src/tensor_ops/slice/slice.cu +++ b/src/tensor_ops/slice/slice.cu @@ -68,6 +68,7 @@ extern "C" __global__ void BWD( \ slice_bwd(numel, num_dims, dims, strides, offset, grad_inp, grad_out); \ } +SLICE(__half, slice_fwd_f16, slice_bwd_f16); SLICE(float, slice_fwd_f32, slice_bwd_f32); SLICE(double, slice_fwd_f64, slice_bwd_f64); SLICE_FWD(uint8_t, slice_fwd_u8); diff --git a/src/tensor_ops/softmax.rs b/src/tensor_ops/softmax.rs index 125f5a0d4..18cf28721 100644 --- a/src/tensor_ops/softmax.rs +++ b/src/tensor_ops/softmax.rs @@ -113,13 +113,17 @@ mod tests { #[test] fn test_softmax_1d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let a = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = a.leaky_trace().softmax(); assert_close_to_literal!( r, [0.011656232, 0.031684924, 0.086128555, 0.23412168, 0.6364087] ); - let l = r * dev.tensor([0.0, 0.0, 1.0, 0.0, 0.0]); + let l = r * dev + .tensor([0.0, 0.0, 1.0, 0.0, 0.0]) + .to_dtype::(); assert_close_to_literal!(l, [0.0, 
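One reason the softmax tests below matter for the f16 path: __half saturates just above 65504, so exponentials should be taken on max-shifted inputs to stay finite. Purely as an illustration of that trick (a hypothetical single-thread kernel, not dfdx's implementation):

#include <cstddef>

// Hypothetical single-thread row kernel: subtract the row max before exp so
// the exponentials stay representable; a real kernel would parallelize this.
extern "C" __global__ void softmax_row(const float *inp, float *out, size_t n) {
    float m = inp[0];
    for (size_t i = 1; i < n; i++) m = fmaxf(m, inp[i]);
    float denom = 0.0f;
    for (size_t i = 0; i < n; i++) denom += expf(inp[i] - m);
    for (size_t i = 0; i < n; i++) out[i] = expf(inp[i] - m) / denom;
}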
0.0, 0.086128555, 0.0, 0.0]); let g = l.mean().backward(); assert_close_to_literal!( @@ -137,7 +141,9 @@ mod tests { #[test] fn test_softmax_2d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]); + let a = dev + .tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]) + .to_dtype::(); let r = a.leaky_trace().softmax::>(); assert_close_to_literal!( r, @@ -146,7 +152,9 @@ mod tests { [0.002355633, 0.047314156, 0.9503302], ] ); - let l = r * dev.tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]); + let l = r * dev + .tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + .to_dtype::(); assert_close_to_literal!(l, [[0.09003058, 0.0, 0.0], [0.0, 0.047314156, 0.0]]); let g = l.mean().backward(); assert_close_to_literal!( @@ -161,7 +169,9 @@ mod tests { #[test] fn test_softmax_2d_0th_axis() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]); + let a = dev + .tensor([[-2.0, -1.0, 0.0], [1.0, 4.0, 7.0]]) + .to_dtype::(); let r = a.leaky_trace().softmax::>(); assert_close_to_literal!( r, @@ -170,7 +180,9 @@ mod tests { [0.95257413, 0.9933072, 0.9990892], ] ); - let l = r * dev.tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]); + let l = r * dev + .tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + .to_dtype::(); assert_close_to_literal!(l, [[0.047425874, 0.0, 0.0], [0.0, 0.9933072, 0.0]]); let g = l.mean().backward(); assert_close_to_literal!( diff --git a/src/tensor_ops/sqrt/cuda_kernel.rs b/src/tensor_ops/sqrt/cuda_kernel.rs index 9990a67b4..6bd0ea39c 100644 --- a/src/tensor_ops/sqrt/cuda_kernel.rs +++ b/src/tensor_ops/sqrt/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for SqrtKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/sqrt.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(df(f(x)) SqrtKernelOp, half::f16, PTX, "sqrt_fwd_f16", "sqrt_bwd_f16"); cuda_unary!(df(f(x)) SqrtKernelOp, f32, PTX, "sqrt_fwd_f32", "sqrt_bwd_f32"); cuda_unary!(df(f(x)) SqrtKernelOp, f64, PTX, "sqrt_fwd_f64", "sqrt_bwd_f64"); diff --git a/src/tensor_ops/sqrt/mod.rs b/src/tensor_ops/sqrt/mod.rs index 254b89885..bbe03cb9a 100644 --- a/src/tensor_ops/sqrt/mod.rs +++ b/src/tensor_ops/sqrt/mod.rs @@ -45,13 +45,24 @@ mod tests { #[test] fn test_sqrt() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-1.0, 0.0, 1.0, 4.0]); + let x = dev.tensor([-1.0, 0.0, 1.0, 4.0]).to_dtype::(); let r = x.leaky_trace().sqrt(); - assert!(r.array()[0].is_nan()); - assert_eq!(r.array()[1..], [0.0, 1.0, 2.0]); + let r_array = r.array(); + assert!(r_array[0].is_nan()); + assert_eq!( + &r_array[1..], + [0.0, 1.0, 2.0] + .map(NumCast::from) + .map(Option::::unwrap) + ); let g = r.mean().backward(); let g = g.get(&x).array(); assert!(g[0].is_nan()); - assert_eq!(g[1..], [TestDtype::INFINITY, 0.5 / 4.0, 0.25 / 4.0]); + assert_eq!( + &g[1..], + [f64::INFINITY, 0.5 / 4.0, 0.25 / 4.0] + .map(NumCast::from) + .map(Option::::unwrap) + ); } } diff --git a/src/tensor_ops/sqrt/sqrt.cu b/src/tensor_ops/sqrt/sqrt.cu index f3db1c0fe..21e87ac5f 100644 --- a/src/tensor_ops/sqrt/sqrt.cu +++ b/src/tensor_ops/sqrt/sqrt.cu @@ -2,11 +2,15 @@ struct SqrtKernelOp {}; +UNARY_OP(__half, sqrt_fwd_f16, sqrt_bwd_f16, SqrtKernelOp, + sqrtg(x), + recipg(y + y)) + UNARY_OP(float, sqrt_fwd_f32, sqrt_bwd_f32, SqrtKernelOp, - sqrtf(x), - 1 / (y + y)) + sqrtg(x), + recipg(y + y)) UNARY_OP(double, sqrt_fwd_f64, sqrt_bwd_f64, SqrtKernelOp, - sqrt(x), - 1 / (y + y)) + sqrtg(x), + recipg(y + 
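The sqrt backward above follows the same output-reuse pattern: d/dx sqrt(x) = 1/(2*sqrt(x)) = 1/(y + y), spelled recipg(y + y) because __half has no implicit division by an integer literal. The identity in isolation, with recipg again re-declared locally for illustration:

#include "cuda_fp16.h"

__device__ __forceinline__ float recipg(float a) { return 1.0 / a; }
__device__ __forceinline__ __half recipg(__half a) { __half one = 1.0; return one / a; }

// d/dx sqrt(x) = 1 / (2 * sqrt(x)) = 1 / (y + y), with y the forward output.
template<typename T>
__device__ __forceinline__ T sqrt_bwd(T y) {
    return recipg(y + y);
}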
y)) \ No newline at end of file diff --git a/src/tensor_ops/square/cuda_kernel.rs b/src/tensor_ops/square/cuda_kernel.rs index b85cef0cc..4f8a887a3 100644 --- a/src/tensor_ops/square/cuda_kernel.rs +++ b/src/tensor_ops/square/cuda_kernel.rs @@ -5,5 +5,13 @@ unsafe impl cudarc::driver::DeviceRepr for SquareKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/square.ptx")); +#[cfg(feature = "f16")] +cuda_unary!( + SquareKernelOp, + half::f16, + PTX, + "square_fwd_f16", + "square_bwd_f16" +); cuda_unary!(SquareKernelOp, f32, PTX, "square_fwd_f32", "square_bwd_f32"); cuda_unary!(SquareKernelOp, f64, PTX, "square_fwd_f64", "square_bwd_f64"); diff --git a/src/tensor_ops/square/mod.rs b/src/tensor_ops/square/mod.rs index 36838290d..d7361b12b 100644 --- a/src/tensor_ops/square/mod.rs +++ b/src/tensor_ops/square/mod.rs @@ -45,7 +45,9 @@ mod tests { #[test] fn test_square() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::(); let r = x.leaky_trace().square(); assert_close_to_literal!(r, [4.0, 1.0, 0.0, 1.0, 4.0]); let g = r.mean().backward(); diff --git a/src/tensor_ops/square/square.cu b/src/tensor_ops/square/square.cu index a272451da..85318f107 100644 --- a/src/tensor_ops/square/square.cu +++ b/src/tensor_ops/square/square.cu @@ -2,11 +2,15 @@ struct SquareKernelOp {}; +UNARY_OP(__half, square_fwd_f16, square_bwd_f16, SquareKernelOp, + x * x, + x + x) + UNARY_OP(float, square_fwd_f32, square_bwd_f32, SquareKernelOp, x * x, - 2.0 * x) + x + x) UNARY_OP(double, square_fwd_f64, square_bwd_f64, SquareKernelOp, x * x, - 2.0 * x) + x + x) \ No newline at end of file diff --git a/src/tensor_ops/stack/cuda_kernel.rs b/src/tensor_ops/stack/cuda_kernel.rs index b3ecf10a9..a3508f502 100644 --- a/src/tensor_ops/stack/cuda_kernel.rs +++ b/src/tensor_ops/stack/cuda_kernel.rs @@ -60,6 +60,7 @@ impl super::StackKernel for Cuda { let src = BWD_KERNEL.replace("$Ty", E::NAME); let opts = CompileOptions { arch: Some(env!("CUDA_COMPUTE_CAP")), + include_paths: vec![env!("CUDA_INCLUDE_DIR").to_string()], ..Default::default() }; let ptx = compile_ptx_with_opts(src, opts).unwrap(); @@ -81,6 +82,7 @@ impl super::StackKernel for Cuda { } const BWD_KERNEL: &str = " +#include \"cuda_fp16.h\" extern \"C\" __global__ void stack_bwd(const size_t numel, const $Ty *inp, $Ty *out) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < numel) { out[i] += inp[i]; } diff --git a/src/tensor_ops/stddev_to.rs b/src/tensor_ops/stddev_to.rs index 5b7c03da1..5c2811c64 100644 --- a/src/tensor_ops/stddev_to.rs +++ b/src/tensor_ops/stddev_to.rs @@ -15,7 +15,7 @@ pub trait StddevTo: HasErr + HasShape { /// let r = t.stddev::, _>(0.0); // or `stddev::<_, Axis<1>>(0.0)` /// assert_eq!(r.array(), [0.6666667_f32.sqrt(), 6.0_f32.sqrt()]); /// ``` - fn stddev(self, epsilon: impl Into) -> Self::WithShape + fn stddev(self, epsilon: impl Into) -> Self::WithShape where Self::Shape: HasAxes + ReduceShapeTo, { @@ -24,7 +24,7 @@ pub trait StddevTo: HasErr + HasShape { /// Fallible version of [StddevTo::stddev] fn try_stddev( self, - epsilon: impl Into, + epsilon: impl Into, ) -> Result, Self::Err> where Self::Shape: HasAxes + ReduceShapeTo; @@ -33,12 +33,14 @@ pub trait StddevTo: HasErr + HasShape { impl, T: Tape> StddevTo for Tensor { fn try_stddev( self, - epsilon: impl Into, + epsilon: impl Into, ) -> Result, Self::Err> where Self::Shape: HasAxes + ReduceShapeTo, { - 
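The square change above from 2.0 * x to x + x is more than style: a 2.0 literal is a double, which silently promotes the f32 kernel's arithmetic to double and does not compile against __half at all, while x + x is exact and cheap for every dtype:

// d/dx (x * x) = 2x, written without any floating-point literal so the same
// text instantiates cleanly for __half, float, and double.
template<typename T>
__device__ __forceinline__ T square_bwd(T x) {
    return x + x;
}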
self.try_var()?.try_add(epsilon.into())?.try_sqrt() + self.try_var()? + .try_add(E::from_f64(epsilon.into()).unwrap())? + .try_sqrt() } } @@ -50,7 +52,9 @@ mod tests { #[test] fn test_std_axis_0_2d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]) + .to_dtype::(); let r = t.leaky_trace().stddev::, _>(1e-8); assert_close_to_literal!(r, [0.5, 0.0001, 1.0, 3.0]); let g = r.mean().backward(); @@ -63,7 +67,9 @@ mod tests { #[test] fn test_std_axis_1_2d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]) + .to_dtype::(); let r = t.leaky_trace().stddev::, _>(0.0); assert_close_to_literal!(r, [1.118034, 3.7666297]); let g = r.mean().backward(); diff --git a/src/tensor_ops/sub/binary_sub.cu b/src/tensor_ops/sub/binary_sub.cu index 2512bbd6d..44b93302d 100644 --- a/src/tensor_ops/sub/binary_sub.cu +++ b/src/tensor_ops/sub/binary_sub.cu @@ -2,6 +2,11 @@ struct BinarySubKernelOp {}; +BINARY_OP(__half, bsub_fwd_f16, bsub_bwd_lhs_f16, bsub_bwd_rhs_f16, BinarySubKernelOp, + x - y, + 1.0, + -1.0) + BINARY_OP(float, bsub_fwd_f32, bsub_bwd_lhs_f32, bsub_bwd_rhs_f32, BinarySubKernelOp, x - y, 1.0, diff --git a/src/tensor_ops/sub/cuda_kernel.rs b/src/tensor_ops/sub/cuda_kernel.rs index 033ab4150..60cb86ebd 100644 --- a/src/tensor_ops/sub/cuda_kernel.rs +++ b/src/tensor_ops/sub/cuda_kernel.rs @@ -1,6 +1,8 @@ use super::{BinarySubKernelOp as Binary, ScalarSubKernelOp as Scalar}; use crate::tensor_ops::cuda_kernels::{cuda_binary, cuda_unary}; +#[cfg(feature = "f16")] +unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Scalar {} unsafe impl cudarc::driver::DeviceRepr for Binary {} @@ -8,8 +10,19 @@ unsafe impl cudarc::driver::DeviceRepr for Binary {} const SCALAR_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/scalar_sub.ptx")); const BINARY_PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/binary_sub.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(const_df() Scalar, half::f16, SCALAR_PTX, "ssub_fwd_f16", "ssub_bwd_f16"); cuda_unary!(const_df() Scalar, f32, SCALAR_PTX, "ssub_fwd_f32", "ssub_bwd_f32"); cuda_unary!(const_df() Scalar, f64, SCALAR_PTX, "ssub_fwd_f64", "ssub_bwd_f64"); +#[cfg(feature = "f16")] +cuda_binary!( + const_df() Binary, + half::f16, + BINARY_PTX, + "bsub_fwd_f16", + "bsub_bwd_lhs_f16", + "bsub_bwd_rhs_f16" +); cuda_binary!( const_df() Binary, f32, diff --git a/src/tensor_ops/sub/mod.rs b/src/tensor_ops/sub/mod.rs index a54c8bd70..30e4de5a4 100644 --- a/src/tensor_ops/sub/mod.rs +++ b/src/tensor_ops/sub/mod.rs @@ -69,6 +69,16 @@ impl, E>, T: Tape> } } +#[cfg(feature = "f16")] +impl, half::f16>, T: Tape> + TrySub for Tensor +{ + fn try_sub(self, rhs: f32) -> Result { + let scalar = half::f16::from_f32(rhs); + try_unary_op(ScalarSubKernelOp { scalar }, self) + } +} + impl, Rhs> std::ops::Sub for Tensor where @@ -90,8 +100,8 @@ mod tests { fn test_sub_0d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor(1.0); - let b: Tensor<_, TestDtype, _> = dev.tensor(1.0); + let a = dev.tensor(1.0).to_dtype::(); + let b = dev.tensor(1.0).to_dtype::(); let r = b.leaky_trace() - a.clone(); assert_close_to_literal!(r, 0.0); @@ -103,8 +113,8 @@ mod tests { 
#[test] fn test_sub_1d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); - let b: Tensor<_, TestDtype, _> = dev.tensor([1.0, -1.0, 0.0]); + let a = dev.tensor([1.0, 2.0, 3.0]).to_dtype::(); + let b = dev.tensor([1.0, -1.0, 0.0]).to_dtype::(); let r = b.leaky_trace() - a.clone(); assert_close_to_literal!(r, [0.0, -3.0, -3.0]); @@ -116,10 +126,12 @@ mod tests { #[test] fn test_sub_2d() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = - dev.tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]); - let b: Tensor<_, TestDtype, _> = - dev.tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]); + let a = dev + .tensor([[0.6570, 0.1708, 0.1500], [0.5658, 0.7010, 0.8342]]) + .to_dtype::(); + let b = dev + .tensor([[0.5199, 0.3844, 0.3759], [0.8259, 0.3682, 0.0388]]) + .to_dtype::(); let r = b.leaky_trace() - a.clone(); assert_close_to_literal!(r, [[-0.1371, 0.2136, 0.2259], [0.2601, -0.3328, -0.7954]]); @@ -131,7 +143,7 @@ mod tests { #[test] fn test_scalar_sub_0d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor(0.0); + let x = dev.tensor(0.0).to_dtype::(); let r = x.leaky_trace() - 1.0; assert_close_to_literal!(r, -1.0); let g = r.exp().backward(); @@ -141,7 +153,7 @@ mod tests { #[test] fn test_scalar_sub_1d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([0.0, 1.0, 2.0]); + let x = dev.tensor([0.0, 1.0, 2.0]).to_dtype::(); let r = x.leaky_trace() - 1.0; assert_close_to_literal!(r, [-1.0, 0.0, 1.0]); let g = r.exp().sum().backward(); @@ -151,7 +163,7 @@ mod tests { #[test] fn test_scalar_sub_2d() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([[0.0; 2]; 3]); + let x = dev.tensor([[0.0; 2]; 3]).to_dtype::(); let r = x.leaky_trace() - 1.0; assert_close_to_literal!(r, [[-1.0; 2]; 3]); let g = r.exp().sum().backward(); diff --git a/src/tensor_ops/sub/scalar_sub.cu b/src/tensor_ops/sub/scalar_sub.cu index 67c334dc4..5dc49a334 100644 --- a/src/tensor_ops/sub/scalar_sub.cu +++ b/src/tensor_ops/sub/scalar_sub.cu @@ -5,6 +5,10 @@ struct ScalarSubKernelOp { F scalar; }; +UNARY_OP(__half, ssub_fwd_f16, ssub_bwd_f16, ScalarSubKernelOp<__half>, + x - op.scalar, + 1.0); + UNARY_OP(float, ssub_fwd_f32, ssub_bwd_f32, ScalarSubKernelOp, x - op.scalar, 1.0); diff --git a/src/tensor_ops/sum_to/cuda_kernel.rs b/src/tensor_ops/sum_to/cuda_kernel.rs index 69a8eda44..fc007b29a 100644 --- a/src/tensor_ops/sum_to/cuda_kernel.rs +++ b/src/tensor_ops/sum_to/cuda_kernel.rs @@ -15,6 +15,12 @@ trait HasCudaKernel { const FNS: &'static [&'static str]; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const MOD: &'static str = "sum_f16"; + const FNS: &'static [&'static str] = &["sum_to_fwd_f16", "sum_to_bwd_f16"]; +} + impl HasCudaKernel for Cuda { const MOD: &'static str = "sum_f32"; const FNS: &'static [&'static str] = &["sum_to_fwd_f32", "sum_to_bwd_f32"]; diff --git a/src/tensor_ops/sum_to/mod.rs b/src/tensor_ops/sum_to/mod.rs index a000c32e1..095f8eaf9 100644 --- a/src/tensor_ops/sum_to/mod.rs +++ b/src/tensor_ops/sum_to/mod.rs @@ -86,7 +86,7 @@ mod tests { #[test] fn test_sum_1d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); + let t = dev.tensor([1.0, 2.0, 3.0]).to_dtype::(); let r = t.leaky_trace().sum::(); let e = 6.0f64; assert_close_to_literal!(r, e); @@ -98,7 +98,9 @@ mod tests { #[test] fn test_sum_axis_0_2d() { let 
dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]) + .to_dtype::<TestDtype>(); let r = t.leaky_trace().sum::<Rank1<3>, _>(); let e = [-1.0f64, 6.0, -3.0]; assert_close_to_literal!(r, e); @@ -109,7 +111,9 @@ #[test] fn test_sum_axis_1_2d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0], [-2.0, 4.0, -6.0]]) + .to_dtype::<TestDtype>(); let r = t.leaky_trace().sum::<Rank1<2>, _>(); let e = [6.0f64, -4.0]; assert_close_to_literal!(r, e); @@ -144,7 +148,7 @@ #[test] fn test_sum_chunking() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0; 100]; 60]); + let t = dev.tensor([[1.0; 100]; 60]).to_dtype::<TestDtype>(); let r = t.leaky_trace().sum::<Rank1<60>, _>(); assert_close_to_literal!(r, [100.0; 60]); let g = r.sum().backward(); @@ -154,7 +158,7 @@ #[test] fn test_sum_reduce_to_more_than_physical_elements() { let dev: TestDevice = Default::default(); - let a: Tensor<_, TestDtype, _> = dev.tensor([1.0, 2.0, 3.0]); + let a = dev.tensor([1.0, 2.0, 3.0]).to_dtype::<TestDtype>(); let b = a.broadcast::<Rank3<4, 2, 3>, _>(); let c = b.sum::<Rank2<4, 3>, _>(); assert_close_to_literal!(c, [[2.0, 4.0, 6.0]; 4]); diff --git a/src/tensor_ops/sum_to/sum_to.cu b/src/tensor_ops/sum_to/sum_to.cu index 776afa37d..d0c9c7f43 100644 --- a/src/tensor_ops/sum_to/sum_to.cu +++ b/src/tensor_ops/sum_to/sum_to.cu @@ -77,5 +77,6 @@ extern "C" __global__ void BWD( \ sum_to_bwd(numel, num_dims, elems_per_thread, info, grad_inp, grad_out); \ } +SUM(__half, sum_to_fwd_f16, sum_to_bwd_f16); SUM(float, sum_to_fwd_f32, sum_to_bwd_f32); SUM(double, sum_to_fwd_f64, sum_to_bwd_f64); diff --git a/src/tensor_ops/tanh/cuda_kernel.rs b/src/tensor_ops/tanh/cuda_kernel.rs index a2e325110..c4ce875ae 100644 --- a/src/tensor_ops/tanh/cuda_kernel.rs +++ b/src/tensor_ops/tanh/cuda_kernel.rs @@ -5,5 +5,7 @@ unsafe impl cudarc::driver::DeviceRepr for TanhKernelOp {} const PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/tanh.ptx")); +#[cfg(feature = "f16")] +cuda_unary!(df(f(x)) TanhKernelOp, half::f16, PTX, "tanh_fwd_f16", "tanh_bwd_f16"); cuda_unary!(df(f(x)) TanhKernelOp, f32, PTX, "tanh_fwd_f32", "tanh_bwd_f32"); cuda_unary!(df(f(x)) TanhKernelOp, f64, PTX, "tanh_fwd_f64", "tanh_bwd_f64"); diff --git a/src/tensor_ops/tanh/mod.rs b/src/tensor_ops/tanh/mod.rs index 09d8e0317..09c67e53e 100644 --- a/src/tensor_ops/tanh/mod.rs +++ b/src/tensor_ops/tanh/mod.rs @@ -45,7 +45,9 @@ mod tests { #[test] fn test_tanh() { let dev: TestDevice = Default::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]); + let x = dev + .tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + .to_dtype::<TestDtype>(); let r = x.leaky_trace().tanh(); assert_close_to_literal!(r, [-0.9640276, -0.7615942, 0., 0.7615942, 0.9640276]); let g = r.mean().backward(); diff --git a/src/tensor_ops/tanh/tanh.cu b/src/tensor_ops/tanh/tanh.cu index b40716b30..0a50c1e01 100644 --- a/src/tensor_ops/tanh/tanh.cu +++ b/src/tensor_ops/tanh/tanh.cu @@ -2,11 +2,21 @@ struct TanhKernelOp {}; +template<typename T> +__device__ __forceinline__ T tanh_bwd(T y) { + T one = 1.0; + return one - y * y; +} + +UNARY_OP(__half, tanh_fwd_f16, tanh_bwd_f16, TanhKernelOp, + tanhg(x), + tanh_bwd(y)) + UNARY_OP(float, tanh_fwd_f32, tanh_bwd_f32, TanhKernelOp, - tanhf(x), - 1 - y * y) + tanhg(x), + tanh_bwd(y)) UNARY_OP(double, tanh_fwd_f64, tanh_bwd_f64, TanhKernelOp, - tanh(x), 
- 1 - y * y) + tanhg(x), + tanh_bwd(y)) \ No newline at end of file diff --git a/src/tensor_ops/to_dtype/cuda_kernel.rs b/src/tensor_ops/to_dtype/cuda_kernel.rs index ba4a2246a..b5597a422 100644 --- a/src/tensor_ops/to_dtype/cuda_kernel.rs +++ b/src/tensor_ops/to_dtype/cuda_kernel.rs @@ -4,7 +4,7 @@ use crate::{ }; use cudarc::{ driver::{DeviceSlice, LaunchAsync}, - nvrtc::compile_ptx, + nvrtc::{compile_ptx_with_opts, CompileOptions}, types::CudaTypeName, }; @@ -14,6 +14,7 @@ typedef long int intptr_t; #else typedef int intptr_t; #endif +#include \"cuda_fp16.h\" extern \"C\" __global__ void kernel(const size_t n, const $Src *inp, $Dst *out) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { out[i] = inp[i]; } @@ -26,7 +27,12 @@ impl super::ToDtypeKernel = dev.tensor( - [[[ - [1., 2., 3., 4., 5., 6.], - [1., 2., 3., 4., 5., 6.], - [1., 2., 3., 4., 5., 6.], - [1., 2., 3., 4., 5., 6.], - [1., 2., 3., 4., 5., 6.], - ]; 4]; 3], - ); + let t = dev + .tensor( + [[[ + [1., 2., 3., 4., 5., 6.], + [1., 2., 3., 4., 5., 6.], + [1., 2., 3., 4., 5., 6.], + [1., 2., 3., 4., 5., 6.], + [1., 2., 3., 4., 5., 6.], + ]; 4]; 3], + ) + .to_dtype::(); assert_close_to_literal!( t.clone().lower_tri(None), [[[ diff --git a/src/tensor_ops/upscale2d/cuda_kernel.rs b/src/tensor_ops/upscale2d/cuda_kernel.rs index 2cc7e9049..19bba285c 100644 --- a/src/tensor_ops/upscale2d/cuda_kernel.rs +++ b/src/tensor_ops/upscale2d/cuda_kernel.rs @@ -25,6 +25,16 @@ trait HasCudaKernel { const FWD: &'static str; const BWD: &'static str; } +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const FWD: &'static str = "nearest_upscale2d_fwd_f16"; + const BWD: &'static str = "nearest_upscale2d_bwd_f16"; +} +#[cfg(feature = "f16")] +impl HasCudaKernel for Cuda { + const FWD: &'static str = "bilinear_upscale2d_fwd_f16"; + const BWD: &'static str = "bilinear_upscale2d_bwd_f16"; +} impl HasCudaKernel for Cuda { const FWD: &'static str = "nearest_upscale2d_fwd_f32"; const BWD: &'static str = "nearest_upscale2d_bwd_f32"; diff --git a/src/tensor_ops/upscale2d/mod.rs b/src/tensor_ops/upscale2d/mod.rs index 8e60acd10..ea3a0e599 100644 --- a/src/tensor_ops/upscale2d/mod.rs +++ b/src/tensor_ops/upscale2d/mod.rs @@ -254,7 +254,9 @@ mod tests { fn test_upscale2d_nearest_even() { let dev = TestDevice::default(); - let x = dev.tensor([[[1.0, 0.0], [2.0, 3.0]]]); + let x = dev + .tensor([[[1.0, 0.0], [2.0, 3.0]]]) + .to_dtype::(); let y = x.leaky_trace().upscale2d::<4, 4, _>(NearestNeighbor); assert_close_to_literal!( y, @@ -277,7 +279,9 @@ mod tests { fn test_upscale2d_nearest_uneven() { let dev = TestDevice::default(); - let x = dev.tensor([[[1.0, 0.0, 2.0], [2.0, 3.0, 4.0]]]); + let x = dev + .tensor([[[1.0, 0.0, 2.0], [2.0, 3.0, 4.0]]]) + .to_dtype::(); let y = x.leaky_trace().upscale2d::<2, 7, _>(NearestNeighbor); assert_close_to_literal!( y, @@ -301,24 +305,23 @@ mod tests { fn test_upscale2d_nearest_batched() { let dev = TestDevice::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]); + let x = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + .to_dtype::(); let x: Tensor, _, _> = [x.clone(), x.clone(), x].stack(); let x: Tensor, _, _> = [x.clone(), x.clone(), x.clone(), x.clone(), x].stack(); let y = x.leaky_trace().upscale2d::<5, 6, _>(NearestNeighbor); - let y_array = y.array(); - for img in y_array { - assert_eq!( - img, - [[ - [1., 1., 2., 2., 3., 3.], - [1., 1., 2., 2., 3., 3.], - [1., 1., 2., 2., 3., 3.], - [4., 4., 5., 5., 6., 6.], - [4., 4., 5., 5., 6., 6.] 
- ]; 3] - ); - } + assert_close_to_literal!( + y, + [[[ + [1., 1., 2., 2., 3., 3.], + [1., 1., 2., 2., 3., 3.], + [1., 1., 2., 2., 3., 3.], + [4., 4., 5., 5., 6., 6.], + [4., 4., 5., 5., 6., 6.] + ]; 3]; 5] + ); let grads = y.exp().mean().backward(); assert_close_to_literal!( @@ -335,7 +338,9 @@ mod tests { fn test_upscale2d_bilinear_even() { let dev = TestDevice::default(); - let x = dev.tensor([[[1.0, 0.0], [2.0, 3.0]]]); + let x = dev + .tensor([[[1.0, 0.0], [2.0, 3.0]]]) + .to_dtype::(); let y = x.leaky_trace().upscale2d::<4, 4, _>(Bilinear); assert_close_to_literal!( y, @@ -358,7 +363,9 @@ mod tests { fn test_upscale2d_bilinear_uneven() { let dev = TestDevice::default(); - let x = dev.tensor([[[1.0, 0.0, 2.0], [2.0, 3.0, 4.0]]]); + let x = dev + .tensor([[[1.0, 0.0, 2.0], [2.0, 3.0, 4.0]]]) + .to_dtype::(); let y = x.leaky_trace().upscale2d::<2, 7, _>(Bilinear); assert_close_to_literal!( y, @@ -382,7 +389,9 @@ mod tests { fn test_bilinear_upscale2d_batched() { let dev = TestDevice::default(); - let x: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]); + let x = dev + .tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + .to_dtype::(); let x: Tensor, _, _> = [x.clone(), x.clone(), x].stack(); let x: Tensor, _, _> = [x.clone(), x.clone(), x.clone(), x.clone(), x].stack(); diff --git a/src/tensor_ops/upscale2d/upscale2d.cu b/src/tensor_ops/upscale2d/upscale2d.cu index 49ad1b91a..2953960ea 100644 --- a/src/tensor_ops/upscale2d/upscale2d.cu +++ b/src/tensor_ops/upscale2d/upscale2d.cu @@ -106,9 +106,11 @@ __device__ void bilinear_upscale2d_fwd( inp += b * inp_strides[0] + c * inp_strides[1]; - T ll = inp[y0 * inp_strides[2] + x0 * inp_strides[3]] * (1-hs) * (1-ws); - T lh = inp[y0 * inp_strides[2] + x1 * inp_strides[3]] * (1-hs) * ws; - T hl = inp[y1 * inp_strides[2] + x0 * inp_strides[3]] * hs * (1-ws); + T one = 1.0; + + T ll = inp[y0 * inp_strides[2] + x0 * inp_strides[3]] * (one-hs) * (one-ws); + T lh = inp[y0 * inp_strides[2] + x1 * inp_strides[3]] * (one-hs) * ws; + T hl = inp[y1 * inp_strides[2] + x0 * inp_strides[3]] * hs * (one-ws); T hh = inp[y1 * inp_strides[2] + x1 * inp_strides[3]] * hs * ws; out[i] = ll + lh + hl + hh; @@ -150,9 +152,11 @@ __device__ void bilinear_upscale2d_bwd( grad_inp += b * inp_strides[0] + c * inp_strides[1]; - atomicAdd(grad_inp + y0 * inp_strides[2] + x0 * inp_strides[3], go * (1-hs) * (1-ws)); - atomicAdd(grad_inp + y0 * inp_strides[2] + x1 * inp_strides[3], go * (1-hs) * ws); - atomicAdd(grad_inp + y1 * inp_strides[2] + x0 * inp_strides[3], go * hs * (1-ws)); + const T one = 1.0; + + atomicAdd(grad_inp + y0 * inp_strides[2] + x0 * inp_strides[3], go * (one-hs) * (one-ws)); + atomicAdd(grad_inp + y0 * inp_strides[2] + x1 * inp_strides[3], go * (one-hs) * ws); + atomicAdd(grad_inp + y1 * inp_strides[2] + x0 * inp_strides[3], go * hs * (one-ws)); atomicAdd(grad_inp + y1 * inp_strides[2] + x1 * inp_strides[3], go * hs * ws); } @@ -175,13 +179,19 @@ extern "C" __global__ void bwd( \ } UPSCALE_OP( - float, - nearest_upscale2d_fwd_f32, nearest_upscale2d_bwd_f32, + __half, + nearest_upscale2d_fwd_f16, nearest_upscale2d_bwd_f16, nearest_upscale2d_fwd, nearest_upscale2d_bwd ); UPSCALE_OP( - double, - nearest_upscale2d_fwd_f64, nearest_upscale2d_bwd_f64, + __half, + bilinear_upscale2d_fwd_f16, bilinear_upscale2d_bwd_f16, + bilinear_upscale2d_fwd, bilinear_upscale2d_bwd +); + +UPSCALE_OP( + float, + nearest_upscale2d_fwd_f32, nearest_upscale2d_bwd_f32, nearest_upscale2d_fwd, nearest_upscale2d_bwd ); UPSCALE_OP( @@ -189,6 +199,11 @@ UPSCALE_OP( 
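The bilinear hunks above introduce a local one so the four corner weights stay in T rather than promoting to double. Those weights, (1-hs)(1-ws), (1-hs)ws, hs(1-ws), and hs*ws, always sum to one, so the forward pass is a convex blend of the neighbors and the backward pass scatters go with exactly the same weights. A host-side sanity check with hypothetical fractional offsets:

#include <cstdio>

int main() {
    // Hypothetical fractional offsets within a source cell.
    double hs = 0.25, ws = 0.6;
    double w[4] = {(1 - hs) * (1 - ws), (1 - hs) * ws, hs * (1 - ws), hs * ws};
    // The four corner weights always sum to exactly 1.
    std::printf("sum = %f\n", w[0] + w[1] + w[2] + w[3]);
}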
bilinear_upscale2d_fwd_f32, bilinear_upscale2d_bwd_f32, bilinear_upscale2d_fwd, bilinear_upscale2d_bwd ); +UPSCALE_OP( + double, + nearest_upscale2d_fwd_f64, nearest_upscale2d_bwd_f64, + nearest_upscale2d_fwd, nearest_upscale2d_bwd +); UPSCALE_OP( double, bilinear_upscale2d_fwd_f64, bilinear_upscale2d_bwd_f64, diff --git a/src/tensor_ops/utilities/binary_op_macros.cuh b/src/tensor_ops/utilities/binary_op_macros.cuh index 0212c215f..9878c3239 100644 --- a/src/tensor_ops/utilities/binary_op_macros.cuh +++ b/src/tensor_ops/utilities/binary_op_macros.cuh @@ -68,8 +68,9 @@ extern "C" __global__ void BACKWARD_LHS( \ tmp_i /= dims[d]; \ } \ unsigned int lhs_i = i / chunk_len; \ - TYPENAME x = lhs ? lhs[lhs_i] : 0; \ - TYPENAME y = rhs ? rhs[rhs_i] : 0; \ + TYPENAME zero = 0.0; \ + TYPENAME x = lhs ? lhs[lhs_i] : zero; \ + TYPENAME y = rhs ? rhs[rhs_i] : zero; \ TYPENAME go = grad_out[out_i]; \ \ TYPENAME dfdx = (DFDX); \ @@ -107,8 +108,9 @@ extern "C" __global__ void BACKWARD_RHS( \ } \ unsigned int rhs_i = i / chunk_len; \ \ - TYPENAME x = lhs ? lhs[lhs_i] : 0; \ - TYPENAME y = rhs ? rhs[rhs_i] : 0; \ + TYPENAME zero = 0.0; \ + TYPENAME x = lhs ? lhs[lhs_i] : zero; \ + TYPENAME y = rhs ? rhs[rhs_i] : zero; \ TYPENAME go = grad_out[out_i]; \ \ TYPENAME dfdy = (DFDY); \ diff --git a/src/tensor_ops/utilities/compatibility.cuh b/src/tensor_ops/utilities/compatibility.cuh new file mode 100644 index 000000000..93195a7f4 --- /dev/null +++ b/src/tensor_ops/utilities/compatibility.cuh @@ -0,0 +1,171 @@ +#include "cuda_fp16.h" + +// Table showing which features are supported on which compute capability +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications + +// FIXME: the minimum compute capabilities are just guesses since the table is not specific enough + +#if __CUDA_ARCH__ < 600 +__device__ __forceinline__ __half __hmax(__half a, __half b) { + return __float2half(fmaxf(__half2float(a), __half2float(b))); +} +__device__ __forceinline__ __half __hmin(__half a, __half b) { + return __float2half(fminf(__half2float(a), __half2float(b))); +} +#endif + +#if __CUDA_ARCH__ < 700 +__device__ __forceinline__ __half __hmax_nan(__half a, __half b) { + return __hisnan(a) ? a : (__hisnan(b) ? b : __hmax(a, b)); +} +__device__ __forceinline__ __half __hmin_nan(__half a, __half b) { + return __hisnan(a) ? a : (__hisnan(b) ? b : __hmin(a, b)); +} +#endif + +#if __CUDA_ARCH__ < 600 +// Copied from https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions +__device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif + + +#if __CUDA_ARCH__ < 700 +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicadd +// The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher. 
+// Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119 +__device__ __half atomicAdd(__half *address, __half val) { + unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + bool unaligned = (size_t) address & 2; + do { + assumed = old; + unsigned int hsum; + hsum = unaligned ? (old >> 16) : (old & 0xffff); + hsum = __half_as_ushort(__ushort_as_half(hsum) + val); + old = atomicCAS(address_as_ui, assumed, + unaligned ? (old & 0xffff) | (hsum << 16) : (old & 0xffff0000) | hsum + ); + + } while (assumed != old); + return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff)); +} +#endif + + +__device__ __forceinline__ __half atomicMaxf(__half* address, __half val) { +#if __CUDA_ARCH__ < 700 + // On older GPUs we do not have access to atomicCAS for shorts, so we have to do some trickery. + // Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119 + unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + bool unaligned = (size_t) address & 2; + do { + assumed = old; + unsigned int hmax; + hmax = unaligned ? (old >> 16) : (old & 0xffff); + hmax = __half_as_ushort(__hmax_nan(val, __ushort_as_half(hmax))); + old = atomicCAS(address_as_ui, assumed, + unaligned ? (old & 0xffff) | (hmax << 16) : (old & 0xffff0000) | hmax + ); + + } while (assumed != old); + return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff)); +#else + // Based on https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions + unsigned short int* casted_address = (unsigned short int*)address; + unsigned short int old = *casted_address; + unsigned short int assumed; + do { + assumed = old; + old = atomicCAS(casted_address, assumed, __half_as_ushort(__hmax_nan(val, __ushort_as_half(assumed)))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __ushort_as_half(old); +#endif +} + +// atomicMax is not implemented for floats, +// solution copied https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ float atomicMaxf(float * addr, float value) { + if (signbit(value)) { + return __uint_as_float(atomicMin((unsigned int *)addr, __float_as_uint(value))); + } else { + return __int_as_float(atomicMax((int *)addr, __float_as_int(value))); + } +} + +__device__ __forceinline__ double atomicMaxf(double * addr, double value) { + if (signbit(value)) { + return __longlong_as_double(atomicMin((unsigned long long int *)addr, __double_as_longlong(value))); + } else { + return __longlong_as_double(atomicMax((long long int *)addr, __double_as_longlong(value))); + } +} + + +__device__ __forceinline__ __half atomicMinf(__half* address, __half val) { +#if __CUDA_ARCH__ < 700 + // On older GPUs we do not have access to atomicCAS for shorts, so we have to do some trickery. + // Solution adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh#L96-L119 + unsigned int *address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + bool unaligned = (size_t) address & 2; + do { + assumed = old; + unsigned int hmin; + hmin = unaligned ? 
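A small usage sketch for the emulated half-precision atomicAdd above, the accumulation primitive the f16 backward kernels and sum_to depend on. This is a hypothetical demo; on compute capability 7.0 and higher the native intrinsic is used instead:

#include <cstdio>
#include "cuda_fp16.h"

extern "C" __global__ void accumulate_ones(__half *acc) {
    atomicAdd(acc, __float2half(1.0f));
}

int main() {
    __half *acc;
    cudaMallocManaged(&acc, sizeof(__half));
    *acc = __float2half(0.0f);
    accumulate_ones<<<4, 64>>>(acc); // 256 increments; 256 is exact in f16
    cudaDeviceSynchronize();
    std::printf("%f\n", __half2float(*acc));
    cudaFree(acc);
}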
(old >> 16) : (old & 0xffff); + hmin = __half_as_ushort(__hmin_nan(val, __ushort_as_half(hmin))); + old = atomicCAS(address_as_ui, assumed, + unaligned ? (old & 0xffff) | (hmin << 16) : (old & 0xffff0000) | hmin + ); + + } while (assumed != old); + return __ushort_as_half(unaligned ? (old >> 16) : (old & 0xffff)); +#else + // Based on https://docs.nvidia.com/cuda/cuda-c-programming-guide/#atomic-functions + unsigned short int* casted_address = (unsigned short int*)address; + unsigned short int old = *casted_address; + unsigned short int assumed; + do { + assumed = old; + old = atomicCAS(casted_address, assumed, __half_as_ushort(__hmin_nan(val, __ushort_as_half(assumed)))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __ushort_as_half(old); +#endif +} + +// atomicMin is not implemented for floats, +// solution copied https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ float atomicMinf(float * addr, float value) { + if (signbit(value)) { + return __uint_as_float(atomicMax((unsigned int *)addr, __float_as_uint(value))); + } else { + return __int_as_float(atomicMin((int *)addr, __float_as_int(value))); + } +} + +__device__ __forceinline__ double atomicMinf(double * addr, double value) { + if (signbit(value)) { + return __longlong_as_double(atomicMax((unsigned long long int *)addr, __double_as_longlong(value))); + } else { + return __longlong_as_double(atomicMin((long long int *)addr, __double_as_longlong(value))); + } +} \ No newline at end of file diff --git a/src/tensor_ops/utilities/cuda_utils.cuh b/src/tensor_ops/utilities/cuda_utils.cuh index 0ebe1470a..5915107f8 100644 --- a/src/tensor_ops/utilities/cuda_utils.cuh +++ b/src/tensor_ops/utilities/cuda_utils.cuh @@ -1,4 +1,5 @@ #include "cuda_fp16.h" +#include "compatibility.cuh" __device__ unsigned int get_strided_index( unsigned int idx, @@ -93,6 +94,14 @@ __device__ void chunk_sum( } } +extern "C" __global__ void fill_with_f16(__half *buf, __half value, const size_t numel) { + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= numel) { + return; + } + buf[i] = value; +} + extern "C" __global__ void fill_with_f32(float *buf, float value, const size_t numel) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= numel) { @@ -109,21 +118,43 @@ extern "C" __global__ void fill_with_f64(double *buf, double value, const size_t buf[i] = value; } + +__device__ __forceinline__ bool isnang(float a) { return isnan(a); } +__device__ __forceinline__ bool isnang(double a) { return isnan(a); } +__device__ __forceinline__ bool isnang(__half a) { return __hisnan(a); } +__device__ __forceinline__ float recipg(float a) { return 1.0 / a; } +__device__ __forceinline__ double recipg(double a) { return 1.0 / a; } +__device__ __forceinline__ __half recipg(__half a) { __half one = 1.0; return one / a; } +__device__ __forceinline__ float cosg(float a) { return cosf(a); } +__device__ __forceinline__ double cosg(double a) { return cos(a); } +__device__ __forceinline__ __half cosg(__half a) { return hcos(a); } +__device__ __forceinline__ float sing(float a) { return sinf(a); } +__device__ __forceinline__ double sing(double a) { return sin(a); } +__device__ __forceinline__ __half sing(__half a) { return hsin(a); } __device__ __forceinline__ float sqrtg(float a) { return sqrtf(a); } __device__ __forceinline__ double sqrtg(double a) { return sqrt(a); } +__device__ __forceinline__ __half 
sqrtg(__half a) { return hsqrt(a); } __device__ __forceinline__ float powg(float a, float b) { return powf(a, b); } __device__ __forceinline__ double powg(double a, double b) { return pow(a, b); } __device__ __forceinline__ __half powg(__half a, __half b) { return __float2half(powf(__half2float(a), __half2float(b))); } __device__ __forceinline__ float tanhg(float a) { return tanhf(a); } __device__ __forceinline__ double tanhg(double a) { return tanh(a); } __device__ __forceinline__ __half tanhg(__half a) { return __float2half(tanhf(__half2float(a))); } __device__ __forceinline__ float maxg(float a, float b) { return fmaxf(a, b); } __device__ __forceinline__ double maxg(double a, double b) { return fmax(a, b); } __device__ __forceinline__ __half maxg(__half a, __half b) { return __hmax_nan(a, b); } __device__ __forceinline__ float ming(float a, float b) { return fminf(a, b); } __device__ __forceinline__ double ming(double a, double b) { return fmin(a, b); } __device__ __forceinline__ __half ming(__half a, __half b) { return __hmin_nan(a, b); } __device__ __forceinline__ float logg(float a) { return logf(a); } __device__ __forceinline__ double logg(double a) { return log(a); } __device__ __forceinline__ __half logg(__half a) { return hlog(a); } __device__ __forceinline__ float expg(float a) { return expf(a); } __device__ __forceinline__ double expg(double a) { return exp(a); } __device__ __forceinline__ __half expg(__half a) { return hexp(a); } __device__ __forceinline__ float absg(float a) { return fabsf(a); } __device__ __forceinline__ double absg(double a) { return fabs(a); } __device__ __forceinline__ __half absg(__half a) { return __habs(a); } __device__ __forceinline__ float copysigng(float a, float b) { return copysignf(a, b); } __device__ __forceinline__ double copysigng(double a, double b) { return copysign(a, b); } __device__ __forceinline__ __half copysigng(__half a, __half b) { return __float2half(copysignf(__half2float(a), __half2float(b))); } diff --git a/src/tensor_ops/utilities/device.rs b/src/tensor_ops/utilities/device.rs index c209bd9fd..e20cee99d 100644 --- a/src/tensor_ops/utilities/device.rs +++ b/src/tensor_ops/utilities/device.rs @@ -102,9 +102,14 @@ pub trait Device<E: Dtype>: { } +#[cfg(feature = "f16")] +impl Device<half::f16> for crate::tensor::Cpu {} impl Device<f32> for crate::tensor::Cpu {} impl Device<f64> for crate::tensor::Cpu {} +#[cfg(all(feature = "cuda", feature = "f16"))] +impl Device<half::f16> for crate::tensor::Cuda {} #[cfg(feature = "cuda")] impl Device<f32> for crate::tensor::Cuda {} diff --git a/src/tensor_ops/utilities/unary_op_macros.cuh b/src/tensor_ops/utilities/unary_op_macros.cuh index e86322e61..4fafc7a13 100644 --- a/src/tensor_ops/utilities/unary_op_macros.cuh +++ b/src/tensor_ops/utilities/unary_op_macros.cuh @@ -1,3 +1,5 @@ +#include "cuda_utils.cuh" + #define LONG_UNARY_OP(TYPENAME, FORWARD, BACKWARD, OP_STRUCT, FUNC, DERIVATIVE) \ extern "C" __global__ void FORWARD( \ const OP_STRUCT op, \ @@ -26,8 +28,9 @@ extern "C" __global__ void BACKWARD( \ return; \ } \ \ - TYPENAME x = inp ? inp[i] : 0; \ - TYPENAME y = out ? out[i] : 0; \ + TYPENAME zero = 0.0; \ + TYPENAME x = inp ? inp[i] : zero; \ + TYPENAME y = out ? 
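The *g suffix seen throughout cuda_utils.cuh is what lets one macro body serve all three dtypes: each op resolves by overload to the best implementation, a native __half intrinsic where one exists (hsin, hexp, hsqrt) or a float round-trip where none does (powg, tanhg). Extending the convention would look like this hypothetical log2 trio:

#include "cuda_fp16.h"

// Hypothetical new entry following the cuda_utils.cuh overload convention.
__device__ __forceinline__ float log2g(float a) { return log2f(a); }
__device__ __forceinline__ double log2g(double a) { return log2(a); }
__device__ __forceinline__ __half log2g(__half a) { return hlog2(a); }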
out[i] : zero; \ TYPENAME dx; \ DERIVATIVE \ grad_inp[i] += dx * grad_out[i]; \ diff --git a/src/tensor_ops/var_to.rs b/src/tensor_ops/var_to.rs index 423cf8b59..8dd5f003d 100644 --- a/src/tensor_ops/var_to.rs +++ b/src/tensor_ops/var_to.rs @@ -48,7 +48,9 @@ mod tests { #[test] fn test_var_axis_0_2d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]) + .to_dtype::<TestDtype>(); let r = t.leaky_trace().var::<Rank1<4>, _>(); assert_close_to_literal!(r, [0.25, 0.0, 1.0, 9.0]); let g = r.mean().backward(); @@ -61,7 +63,9 @@ #[test] fn test_var_axis_1_2d() { let dev: TestDevice = Default::default(); - let t: Tensor<_, TestDtype, _> = dev.tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]); + let t = dev + .tensor([[1.0, 2.0, 3.0, 4.0], [0.0, 2.0, 5.0, 10.0]]) + .to_dtype::<TestDtype>(); let r = t.leaky_trace().var::<Rank1<2>, _>(); assert_close_to_literal!(r, [1.25, 14.1875]); let g = r.mean().backward();
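Finally, a back-of-envelope note on why these tests compare with assert_close_to_literal once f16 is in the mix: half has 10 explicit mantissa bits, so machine epsilon is 2^-10 (about 9.8e-4), and a handful of chained ops lands near the 1e-2 default tolerance the test harness uses for half::f16:

#include <cstdio>

int main() {
    // f16: 10 explicit mantissa bits -> machine epsilon 2^-10.
    double eps = 1.0 / (1 << 10);
    std::printf("f16 eps ~ %g; ten chained ops ~ %g\n", eps, 10 * eps);
}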