diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp
index 9ccf988492d22..262550c5ed930 100644
--- a/SYCL/BFloat16/bfloat16_builtins.cpp
+++ b/SYCL/BFloat16/bfloat16_builtins.cpp
@@ -12,7 +12,7 @@
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
-using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi;
 
 constexpr int N = 60; // divisible by all tested array sizes
 constexpr float bf16_eps = 0.00390625;
@@ -222,7 +222,7 @@ bool check(float a, float b) {
 
 int main() {
   queue q;
-  if (q.get_device().has(aspect::ext_oneapi_bfloat16)) {
+  if (q.get_device().has(aspect::ext_oneapi_bfloat16_math_functions)) {
     std::vector<float> a(N), b(N), c(N);
     int err = 0;
 
diff --git a/SYCL/BFloat16/bfloat16_conversions.cpp b/SYCL/BFloat16/bfloat16_conversions.cpp
new file mode 100755
index 0000000000000..ed5d7ae583a09
--- /dev/null
+++ b/SYCL/BFloat16/bfloat16_conversions.cpp
@@ -0,0 +1,70 @@
+// UNSUPPORTED: hip
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
+// Currently the feature is supported only on CPU and GPU, natively or by
+// software emulation.
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+//==---------- bfloat16_conversions.cpp - SYCL bfloat16 type test ---------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <typename T> T calculate(T a, T b) {
+  sycl::ext::oneapi::bfloat16 x = -a;
+  sycl::ext::oneapi::bfloat16 y = b;
+  sycl::ext::oneapi::bfloat16 z = x + y;
+  T result = z;
+  return result;
+}
+
+template <typename T> int test_device(queue Q) {
+  T data[3] = {-7.0f, 8.1f, 0.0f};
+
+  buffer<T, 1> buf{data, 3};
+  Q.submit([&](handler &cgh) {
+    accessor numbers{buf, cgh, read_write};
+    cgh.single_task([=]() { numbers[2] = calculate(numbers[0], numbers[1]); });
+  });
+
+  host_accessor hostOutAcc{buf, read_only};
+  std::cout << "Device Result = " << hostOutAcc[2] << std::endl;
+  if (hostOutAcc[2] == 15.125f)
+    return 0;
+  return 1;
+}
+
+template <typename T> int test_host() {
+  T a{-5.6f};
+  T b{-1.1f};
+  T result = calculate(a, b);
+  std::cout << "Host Result = " << result << std::endl;
+  if (result == 4.5f)
+    return 0;
+  return 1;
+}
+
+int main() {
+  queue Q;
+  int result;
+  result = test_host<float>();
+  result |= test_host<sycl::half>();
+  if (Q.get_device().has(aspect::fp16))
+    result |= test_device<sycl::half>(Q);
+  result |= test_device<float>(Q);
+  if (result)
+    std::cout << "FAIL\n";
+  else
+    std::cout << "PASS\n";
+
+  return result;
+}
diff --git a/SYCL/BFloat16/bfloat16_example.cpp b/SYCL/BFloat16/bfloat16_example.cpp
new file mode 100755
index 0000000000000..fab3795679b9d
--- /dev/null
+++ b/SYCL/BFloat16/bfloat16_example.cpp
@@ -0,0 +1,83 @@
+///
+/// Check if bfloat16 example works using fallback libraries
+///
+
+// REQUIRES: opencl-aot, ocloc, cpu, gpu-intel-gen9
+// UNSUPPORTED: cuda
+// CUDA is not compatible with SPIR.
+
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using sycl::ext::oneapi::bfloat16;
+
+float foo(float a, float b) {
+  // Convert from float to bfloat16.
+  bfloat16 A{a};
+  bfloat16 B{b};
+
+  // Convert A and B from bfloat16 to float, do addition on floating-point
+  // numbers, then convert the result to bfloat16 and store it in C.
+  bfloat16 C = A + B;
+
+  // Return the result converted from bfloat16 to float.
+  return C;
+}
+
+int main(int argc, char *argv[]) {
+  float data[3] = {7.0f, 8.1f, 0.0f};
+
+  float result_host = foo(7.0f, 8.1f);
+  std::cout << "CPU Result = " << result_host << std::endl;
+  if (std::abs(15.1f - result_host) > 0.1f) {
+    std::cout << "Test failed. Expected CPU Result ~= 15.1" << std::endl;
+    return 1;
+  }
+
+  queue deviceQueue;
+  buffer<float, 1> buf{data, 3};
+
+  deviceQueue.submit([&](handler &cgh) {
+    accessor numbers{buf, cgh, read_write};
+    cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); });
+  });
+
+  host_accessor hostOutAcc{buf, read_only};
+  float result_device = hostOutAcc[2];
+  std::cout << "GPU Result = " << result_device << std::endl;
+  if (std::abs(result_host - result_device) > 0.1f) {
+    std::cout << "Test failed. CPU Result !~= GPU result" << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index 25bb8ac15cf70..28f1bf621b670 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -1,10 +1,11 @@
-// UNSUPPORTED: cuda || hip
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// TODO currently the feature isn't supported on most of the devices
-// need to enable the test when the aspect and device_if feature are
-// introduced
-// RUNx: %CPU_RUN_PLACEHOLDER %t.out
-// RUNx: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: hip
+// RUN: %if cuda %{%clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 %s -o %t.out %}
+// TODO enable the below when CI supports >=sm_80
+// RUNx: %if cuda %{%GPU_RUN_PLACEHOLDER %t.out %}
+// RUN: %clangxx -fsycl %s -o %t.out
+// TODO currently the feature isn't supported on FPGA.
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUNx: %ACC_RUN_PLACEHOLDER %t.out
 
 //==----------- bfloat16_type.cpp - SYCL bfloat16 type test ----------------==//
diff --git a/SYCL/BFloat16/bfloat16_type.hpp b/SYCL/BFloat16/bfloat16_type.hpp
index 97e1ccb7fcf01..570755bf677af 100644
--- a/SYCL/BFloat16/bfloat16_type.hpp
+++ b/SYCL/BFloat16/bfloat16_type.hpp
@@ -1,5 +1,5 @@
 #include <sycl/sycl.hpp>
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
 
 #include <cmath>
 #include <iostream>
@@ -11,8 +11,7 @@ constexpr size_t N = 100;
 template <typename T> void assert_close(const T &C, const float ref) {
   for (size_t i = 0; i < N; i++) {
     auto diff = C[i] - ref;
-    assert(std::fabs(static_cast<float>(diff)) <
-           std::numeric_limits<float>::epsilon());
+    assert(std::fabs(static_cast<float>(diff)) < 0.1);
   }
 }
 
@@ -21,7 +20,7 @@ void verify_conv_implicit(queue &q, buffer<float, 1> &a, range<1> &r,
   q.submit([&](handler &cgh) {
     auto A = a.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
       A[index] = AVal;
     });
   });
@@ -34,9 +33,8 @@ void verify_conv_explicit(queue &q, buffer<float, 1> &a, range<1> &r,
   q.submit([&](handler &cgh) {
     auto A = a.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      uint16_t AVal =
-          sycl::ext::oneapi::experimental::bfloat16::from_float(A[index]);
-      A[index] = sycl::ext::oneapi::experimental::bfloat16::to_float(AVal);
+      sycl::ext::oneapi::bfloat16 AVal = A[index];
+      A[index] = float(AVal);
     });
   });
 
@@ -52,9 +50,9 @@ void verify_add(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      sycl::ext::oneapi::experimental::bfloat16 CVal = AVal + BVal;
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 BVal{B[index]};
+      sycl::ext::oneapi::bfloat16 CVal = AVal + BVal;
       C[index] = CVal;
     });
   });
@@ -71,9 +69,9 @@ void verify_sub(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      sycl::ext::oneapi::experimental::bfloat16 CVal = AVal - BVal;
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 BVal{B[index]};
+      sycl::ext::oneapi::bfloat16 CVal = AVal - BVal;
       C[index] = CVal;
     });
   });
@@ -88,8 +86,8 @@ void verify_minus(queue &q, buffer<float, 1> &a, range<1> &r, const float ref) {
     auto A = a.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 CVal = -AVal;
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 CVal = -AVal;
       C[index] = CVal;
     });
   });
@@ -106,9 +104,9 @@ void verify_mul(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      sycl::ext::oneapi::experimental::bfloat16 CVal = AVal * BVal;
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 BVal{B[index]};
+      sycl::ext::oneapi::bfloat16 CVal = AVal * BVal;
       C[index] = CVal;
     });
   });
@@ -125,9 +123,9 @@ void verify_div(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      sycl::ext::oneapi::experimental::bfloat16 CVal = AVal / BVal;
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 BVal{B[index]};
+      sycl::ext::oneapi::bfloat16 CVal = AVal / BVal;
       C[index] = CVal;
     });
   });
@@ -144,19 +142,18 @@ void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
     auto B = b.get_access(cgh);
     auto C = c.get_access(cgh);
     cgh.parallel_for(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 BVal{B[index]};
       if (AVal) {
         if (AVal > BVal || AVal >= BVal || AVal < BVal || AVal <= BVal ||
             !BVal) {
-          sycl::ext::oneapi::experimental::bfloat16 CVal =
-              AVal != BVal ? AVal : BVal;
+          sycl::ext::oneapi::bfloat16 CVal = AVal != BVal ? AVal : BVal;
           CVal--;
           CVal++;
           if (AVal == BVal) {
             CVal -= AVal;
-            CVal *= 3.0;
-            CVal /= 2.0;
+            CVal *= 3.0f;
+            CVal /= 2.0f;
           } else
             CVal += BVal;
           C[index] = CVal;
@@ -179,9 +176,9 @@ int run_tests() {
     return 0;
   }
 
-  std::vector<float> vec_a(N, 5.0);
-  std::vector<float> vec_b(N, 2.0);
-  std::vector<float> vec_b_neg(N, -2.0);
+  std::vector<float> vec_a(N, 5.0f);
+  std::vector<float> vec_b(N, 2.0f);
+  std::vector<float> vec_b_neg(N, -2.0f);
 
   range<1> r(N);
   buffer<float, 1> a{vec_a.data(), r};
@@ -190,19 +187,32 @@ int run_tests() {
 
   queue q{dev};
 
-  verify_conv_implicit(q, a, r, 5.0);
-  verify_conv_explicit(q, a, r, 5.0);
-  verify_add(q, a, b, r, 7.0);
-  verify_sub(q, a, b, r, 3.0);
-  verify_mul(q, a, b, r, 10.0);
-  verify_div(q, a, b, r, 2.5);
-  verify_logic(q, a, b, r, 7.0);
-  verify_add(q, a, b_neg, r, 3.0);
-  verify_sub(q, a, b_neg, r, 7.0);
-  verify_minus(q, a, r, -5.0);
-  verify_mul(q, a, b_neg, r, -10.0);
-  verify_div(q, a, b_neg, r, -2.5);
-  verify_logic(q, a, b_neg, r, 3.0);
+  verify_conv_implicit(q, a, r, 5.0f);
+  std::cout << "PASS verify_conv_implicit\n";
+  verify_conv_explicit(q, a, r, 5.0f);
+  std::cout << "PASS verify_conv_explicit\n";
+  verify_add(q, a, b, r, 7.0f);
+  std::cout << "PASS verify_add\n";
+  verify_sub(q, a, b, r, 3.0f);
+  std::cout << "PASS verify_sub\n";
+  verify_mul(q, a, b, r, 10.0f);
+  std::cout << "PASS verify_mul\n";
+  verify_div(q, a, b, r, 2.5f);
+  std::cout << "PASS verify_div\n";
+  verify_logic(q, a, b, r, 7.0f);
+  std::cout << "PASS verify_logic\n";
+  verify_add(q, a, b_neg, r, 3.0f);
+  std::cout << "PASS verify_add\n";
+  verify_sub(q, a, b_neg, r, 7.0f);
+  std::cout << "PASS verify_sub\n";
+  verify_minus(q, a, r, -5.0f);
+  std::cout << "PASS verify_minus\n";
+  verify_mul(q, a, b_neg, r, -10.0f);
+  std::cout << "PASS verify_mul\n";
+  verify_div(q, a, b_neg, r, -2.5f);
+  std::cout << "PASS verify_div\n";
+  verify_logic(q, a, b_neg, r, 3.0f);
+  std::cout << "PASS verify_logic\n";
 
   return 0;
 }
diff --git a/SYCL/BFloat16/bfloat16_type_cuda.cpp b/SYCL/BFloat16/bfloat16_type_cuda.cpp
index 30d1f122a2534..81c4a08f128e4 100644
--- a/SYCL/BFloat16/bfloat16_type_cuda.cpp
+++ b/SYCL/BFloat16/bfloat16_type_cuda.cpp
@@ -12,13 +12,4 @@
 
 #include "bfloat16_type.hpp"
 
-int main() {
-  bool has_bfloat16_aspect = false;
-  for (const auto &plt : sycl::platform::get_platforms()) {
-    if (plt.has(aspect::ext_oneapi_bfloat16))
-      has_bfloat16_aspect = true;
-  }
-
-  if (has_bfloat16_aspect)
-    return run_tests();
-}
+int main() { return run_tests(); }
diff --git a/SYCL/BFloat16/bfloat_hw.cpp b/SYCL/BFloat16/bfloat_hw.cpp
index 29d63c7fa9b3e..bb1a2a1e1f7b9 100644
--- a/SYCL/BFloat16/bfloat_hw.cpp
+++ b/SYCL/BFloat16/bfloat_hw.cpp
@@ -16,7 +16,7 @@ using get_uint_type_of_size = typename std::conditional_t<
     std::conditional_t>>>;
 
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 using Bfloat16StorageT = get_uint_type_of_size<2>;
 
 bool test(float Val, Bfloat16StorageT Bits) {
diff --git a/SYCL/Basic/aspects.cpp b/SYCL/Basic/aspects.cpp
index 2f9938b58d018..bb1adf0bbdf1e 100644
--- a/SYCL/Basic/aspects.cpp
+++ b/SYCL/Basic/aspects.cpp
@@ -54,8 +54,8 @@ int main() {
     if (plt.has(aspect::fp64)) {
      std::cout << " fp64" << std::endl;
     }
-    if (plt.has(aspect::ext_oneapi_bfloat16)) {
-      std::cout << " ext_oneapi_bfloat16" << std::endl;
+    if (plt.has(aspect::ext_oneapi_bfloat16_math_functions)) {
+      std::cout << " ext_oneapi_bfloat16_math_functions" << std::endl;
     }
     if (plt.has(aspect::int64_base_atomics)) {
      std::cout << " base atomic operations" << std::endl;
diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp
index ca0da71199d67..e150081005512 100644
--- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp
+++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp
@@ -30,7 +30,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 using tfloat32 = sycl::ext::intel::experimental::esimd::tfloat32;
 
 template class TestID;
diff --git a/SYCL/ESIMD/api/replicate_smoke.cpp b/SYCL/ESIMD/api/replicate_smoke.cpp
index 159b842a94be3..d8d4d1eb9baeb 100644
--- a/SYCL/ESIMD/api/replicate_smoke.cpp
+++ b/SYCL/ESIMD/api/replicate_smoke.cpp
@@ -23,7 +23,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 using tfloat32 = sycl::ext::intel::experimental::esimd::tfloat32;
 
 template struct char_to_int {
diff --git a/SYCL/ESIMD/api/simd_copy_to_from.cpp b/SYCL/ESIMD/api/simd_copy_to_from.cpp
index 95be590cb73b8..d3f21cce9ee86 100644
--- a/SYCL/ESIMD/api/simd_copy_to_from.cpp
+++ b/SYCL/ESIMD/api/simd_copy_to_from.cpp
@@ -38,7 +38,7 @@
 using namespace sycl;
 using namespace sycl::ext::intel;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 using tfloat32 = sycl::ext::intel::experimental::esimd::tfloat32;
 
 template
diff --git a/SYCL/ESIMD/api/simd_subscript_operator.cpp b/SYCL/ESIMD/api/simd_subscript_operator.cpp
index d081640de4aa9..79d1c75482e02 100644
--- a/SYCL/ESIMD/api/simd_subscript_operator.cpp
+++ b/SYCL/ESIMD/api/simd_subscript_operator.cpp
@@ -24,7 +24,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 using tfloat32 = sycl::ext::intel::experimental::esimd::tfloat32;
 
 template bool test(queue &q) {
diff --git a/SYCL/ESIMD/api/simd_view_subscript_operator.cpp b/SYCL/ESIMD/api/simd_view_subscript_operator.cpp
index f9f81db148db5..c7da212153581 100644
--- a/SYCL/ESIMD/api/simd_view_subscript_operator.cpp
+++ b/SYCL/ESIMD/api/simd_view_subscript_operator.cpp
@@ -25,7 +25,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 using tfloat32 = sycl::ext::intel::experimental::esimd::tfloat32;
 
 template class TestID;
diff --git a/SYCL/ESIMD/api/svm_gather_scatter.cpp b/SYCL/ESIMD/api/svm_gather_scatter.cpp
index 09cf3e9d30b70..0775389908998 100644
--- a/SYCL/ESIMD/api/svm_gather_scatter.cpp
+++ b/SYCL/ESIMD/api/svm_gather_scatter.cpp
@@ -25,7 +25,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 using tfloat32 = sycl::ext::intel::experimental::esimd::tfloat32;
 
 template bool test(queue &Q) {
diff --git a/SYCL/ESIMD/api/unary_ops_heavy.cpp b/SYCL/ESIMD/api/unary_ops_heavy.cpp
index 71206d0ed5db0..51891e1da674c 100644
--- a/SYCL/ESIMD/api/unary_ops_heavy.cpp
+++ b/SYCL/ESIMD/api/unary_ops_heavy.cpp
@@ -30,7 +30,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 template class TestID;
diff --git a/SYCL/ESIMD/esimd_test_utils.hpp b/SYCL/ESIMD/esimd_test_utils.hpp
index 07c03a32e496f..06df7b4dff522 100644
--- a/SYCL/ESIMD/esimd_test_utils.hpp
+++ b/SYCL/ESIMD/esimd_test_utils.hpp
@@ -553,7 +553,7 @@ TID(uint32_t)
 TID(int64_t)
 TID(uint64_t)
 TID(half)
-TID(sycl::ext::oneapi::experimental::bfloat16)
+TID(sycl::ext::oneapi::bfloat16)
 TID(sycl::ext::intel::experimental::esimd::tfloat32)
 TID(float)
 TID(double)
diff --git a/SYCL/KernelAndProgram/kernel-bundle-merge-options.hpp b/SYCL/KernelAndProgram/kernel-bundle-merge-options.hpp
index a3661cab569c1..7d0c7dcc5f5d6 100644
--- a/SYCL/KernelAndProgram/kernel-bundle-merge-options.hpp
+++ b/SYCL/KernelAndProgram/kernel-bundle-merge-options.hpp
@@ -35,4 +35,6 @@ int main() {
   } catch (...) {
     // Ignore all exceptions
   }
+
+  return 0;
 }
diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
index bc11434cef7e6..35cbef3632b5d 100644
--- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
@@ -15,7 +15,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using sycl::ext::oneapi::experimental::bfloat16;
+using sycl::ext::oneapi::bfloat16;
 
 #define SG_SZ 32
 constexpr size_t nWGperDim = 2;
diff --git a/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp b/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp
index 71a7f9fa668b0..5222b8d0255c4 100644
--- a/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp
@@ -14,7 +14,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using sycl::ext::oneapi::experimental::bfloat16;
+using sycl::ext::oneapi::bfloat16;
 
 #define SG_SZ 32
 constexpr size_t nWGperDim = 2;
diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp
index 0464f79f57561..558451a3dbcae 100644
--- a/SYCL/Matrix/element_wise_wi_marray.cpp
+++ b/SYCL/Matrix/element_wise_wi_marray.cpp
@@ -14,7 +14,7 @@
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
-using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
 #define SG_SZ 32
diff --git a/SYCL/Matrix/joint_matrix_bfloat16.cpp b/SYCL/Matrix/joint_matrix_bfloat16.cpp
index 4d35b5f5a53a4..e665617156f0b 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16.cpp
@@ -16,7 +16,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 #define SG_SZ 16
 
diff --git a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
index 483b15b567f4a..456b040b3659f 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
@@ -18,7 +18,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 #define SG_SZ 16
 
@@ -144,13 +144,13 @@ int main() {
     for (int j = 0; j < MATRIX_K; j++) {
      // bfloat16 is created using unsigned short since conversion from float to
      // bfloat16 is not supported on the host side yet
-      A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j)));
+      A[i][j] = make_bf16(1.0f * (i + j));
       Aref[i][j] = make_bf16(1.0f * (i + j));
     }
   }
   for (int i = 0; i < MATRIX_K / 2; i++) {
     for (int j = 0; j < MATRIX_N * 2; j++) {
-      B[i][j] = bfloat16::from_bits((make_bf16(2.0f * i + 3.0f * j)));
+      B[i][j] = make_bf16(2.0f * i + 3.0f * j);
       Bref[i][j] = make_bf16(2.0f * i + 3.0f * j);
     }
   }
diff --git a/SYCL/Matrix/joint_matrix_bfloat16_use.cpp b/SYCL/Matrix/joint_matrix_bfloat16_use.cpp
index 5c110336e3fa7..aa6412195e107 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16_use.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16_use.cpp
@@ -17,7 +17,7 @@
 #include <sycl/sycl.hpp>
 
 using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 #define SG_SZ 16
 
diff --git a/SYCL/Matrix/joint_matrix_tensorcores_legacy.cpp b/SYCL/Matrix/joint_matrix_tensorcores_legacy.cpp
index d98a389f1ede9..84b399fadf152 100644
--- a/SYCL/Matrix/joint_matrix_tensorcores_legacy.cpp
+++ b/SYCL/Matrix/joint_matrix_tensorcores_legacy.cpp
@@ -11,7 +11,7 @@
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
-using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr float bf16_eps = 0.00390625;
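
The user-visible change exercised throughout this patch is the namespace move (sycl::ext::oneapi::experimental::bfloat16 becomes sycl::ext::oneapi::bfloat16) together with implicit float conversions replacing the old from_float/to_float/from_bits helpers. A minimal host-side sketch of the updated API, mirroring the 15.125 expectation checked in bfloat16_conversions.cpp above (this file is illustrative only and not part of the patch; it assumes a DPC++ compiler recent enough to ship the non-experimental bfloat16):

// bfloat16_namespace_sketch.cpp -- illustrative, not part of this patch.
// Shows implicit float -> bfloat16 construction and bfloat16 -> float
// conversion via the non-experimental sycl::ext::oneapi::bfloat16 type.
#include <iostream>
#include <sycl/sycl.hpp>

using bfloat16 = sycl::ext::oneapi::bfloat16;

int main() {
  bfloat16 a = 7.0f; // exact in bfloat16
  bfloat16 b = 8.1f; // rounds to 8.125 (bfloat16 keeps an 8-bit mantissa)
  float sum = a + b; // operands widen to float for the add, result narrows
  std::cout << "sum = " << sum << "\n";
  // 7.0 + 8.125 == 15.125, the same value bfloat16_conversions.cpp checks.
  return sum == 15.125f ? 0 : 1;
}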