diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp
index 360069998f198..1780a553d73d1 100644
--- a/aten/src/ATen/native/Copy.cpp
+++ b/aten/src/ATen/native/Copy.cpp
@@ -13,6 +13,11 @@
 #include
 #include
 
+#ifdef USE_FBGEMM
+#include <fbgemm/Fbgemm.h>
+#include <fbgemm/FbgemmConvert.h>
+#endif
+
 namespace {
 
 using namespace at;
@@ -94,6 +99,31 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
   TORCH_CHECK(self.defined(), "self is undefined");
   TORCH_CHECK(src.defined(), "src is undefined");
 
+  // The FBGEMM conversion kernel is used only when all of the following hold:
+  // 1. The memory format of both the source and destination tensors is contiguous.
+  // 2. Both the source and destination tensors are on CPU.
+  // 3. The dtype conversion is FP32->FP16 or FP16->FP32.
+  #ifdef USE_FBGEMM
+    if (((self.dtype() == at::kFloat && src.dtype() == at::kHalf) ||
+         (self.dtype() == at::kHalf && src.dtype() == at::kFloat)) &&
+        (self.device().is_cpu() && src.device().is_cpu()) &&
+        !self.is_sparse() && !src.is_sparse() &&
+        ((self.is_contiguous() && src.is_contiguous()) ||
+         (self.is_non_overlapping_and_dense() && self.strides() == src.strides()))) {
+      if (src.dtype() == at::kFloat && self.dtype() == at::kHalf) {
+        auto* output_ptr = reinterpret_cast<fbgemm::float16*>(
+            self.data_ptr<at::Half>());
+        fbgemm::FloatToFloat16_simd(src.data_ptr<float>(), output_ptr, self.numel());
+      } else {
+        auto in_data = reinterpret_cast<fbgemm::float16*>(
+            src.data_ptr<at::Half>());
+        auto* output_ptr = self.data_ptr<float>();
+        fbgemm::Float16ToFloat_simd(in_data, output_ptr, self.numel());
+      }
+      return self;
+    }
+  #endif
+
   if (self.is_sparse() && src.is_sparse()) {
     return at::copy_sparse_to_sparse_(self, src, non_blocking);
   } else if (self.is_sparse() || src.is_sparse()) {
diff --git a/benchmarks/operator_benchmark/pt/tensor_to_test.py b/benchmarks/operator_benchmark/pt/tensor_to_test.py
new file mode 100644
index 0000000000000..7f4c440c2c391
--- /dev/null
+++ b/benchmarks/operator_benchmark/pt/tensor_to_test.py
@@ -0,0 +1,39 @@
+import operator_benchmark as op_bench
+import torch
+
+tensor_conversion_short_configs = op_bench.cross_product_configs(
+    M=(8, 16, 32,),
+    N=(16, 64, 128,),
+    device=['cpu', 'cuda'],
+    tags=['short'],
+)
+
+tensor_conversion_long_configs = op_bench.cross_product_configs(
+    M=(64, 128, 256, 512,),
+    N=(256, 512, 1024, 2048,),
+    device=['cpu', 'cuda'],
+    tags=['long'],
+)
+
+class FloatToHalfTensorConversionBenchmark(op_bench.TorchBenchmarkBase):
+    def init(self, M, N, device):
+        self.input = torch.rand(M, N, device=device, requires_grad=False, dtype=torch.float)
+
+    def forward(self):
+        return self.input.to(torch.half)
+
+class HalfToFloatTensorConversionBenchmark(op_bench.TorchBenchmarkBase):
+    def init(self, M, N, device):
+        self.input = torch.rand(M, N, device=device, requires_grad=False, dtype=torch.half)
+
+    def forward(self):
+        return self.input.to(torch.float)
+
+
+op_bench.generate_pt_test(tensor_conversion_short_configs, FloatToHalfTensorConversionBenchmark)
+op_bench.generate_pt_test(tensor_conversion_long_configs, FloatToHalfTensorConversionBenchmark)
+op_bench.generate_pt_test(tensor_conversion_short_configs, HalfToFloatTensorConversionBenchmark)
+op_bench.generate_pt_test(tensor_conversion_long_configs, HalfToFloatTensorConversionBenchmark)
+
+if __name__ == "__main__":
+    op_bench.benchmark_runner.main()
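As a rough illustration of what the FBGEMM fast path above is expected to affect (assuming a CPU build of PyTorch with USE_FBGEMM enabled; the sizes and iteration counts below are arbitrary and not part of this patch), a standalone timing sketch contrasting an eligible contiguous tensor with a non-contiguous view:

import timeit

import torch

# Contiguous fp32 tensor on CPU: meets the contiguity, device and dtype
# conditions of the new fast path.
x = torch.rand(512, 2048, dtype=torch.float)
# Transposed view: non-contiguous, so the generic copy kernel handles it.
xt = x.t()

contig = timeit.timeit(lambda: x.to(torch.half), number=100)
strided = timeit.timeit(lambda: xt.to(torch.half), number=100)
print(f"contiguous: {contig:.4f}s  transposed view: {strided:.4f}s")

The committed tensor_to_test.py benchmark above measures the same FP32->FP16 and FP16->FP32 conversions through the operator_benchmark harness on both CPU and CUDA.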
diff --git a/test/test_torch.py b/test/test_torch.py
index ba9492c500f4c..8d6e4c13ad11a 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -2951,20 +2951,34 @@ def test_parsing_intlist(self):
                                lambda: torch.tensor().new_zeros((5, 5), 0))
 
     def test_half_tensor(self):
-        x = torch.randn(5, 5).float()
-        y = torch.randn(5, 5).float()
-        xh, yh = x.half(), y.half()
-
-        self.assertEqual(x.half().float(), x, atol=1e-3, rtol=0)
+        devices = ["cpu"]
+        if torch.cuda.is_available():
+            devices.append("cuda")
 
-        z = torch.Tensor(5, 5)
-        self.assertEqual(z.copy_(xh), x, atol=1e-3, rtol=0)
+        # contiguous tensor
+        # non-contiguous tensor
+        # dense non-overlapping tensor
+        # non-dense non-overlapping sliced tensor
+        # non-dense overlapping equal strides
+        for device in devices:
+            tset = (
+                torch.randn(4, 3, 2, device=device, dtype=torch.float).contiguous(),
+                torch.randn(4, 3, 2, device=device, dtype=torch.float).transpose(0, 1),
+                torch.randn(4, 3, 2, device=device, dtype=torch.float),
+                torch.randn(4, 3, 2, device=device, dtype=torch.float)[:, :, ::2],
+                torch.empty_strided(
+                    (4, 2, 3), (10, 3, 3), device=device, dtype=torch.float
+                ).copy_(torch.rand((4, 2, 3), dtype=torch.float, device=device)),
+            )
 
-        with tempfile.NamedTemporaryFile() as f:
-            torch.save(xh, f)
-            f.seek(0)
-            xh2 = torch.load(f)
-            self.assertEqual(xh.float(), xh2.float())
+            for x in tset:
+                self.assertEqual(x.half().float(), x, atol=1e-3, rtol=0)
+                xh = x.half()
+                with tempfile.NamedTemporaryFile() as f:
+                    torch.save(xh, f)
+                    f.seek(0)
+                    xh2 = torch.load(f)
+                    self.assertEqual(xh.float(), xh2.float())
 
     def test_from_buffer(self):
         a = bytearray([1, 2, 3, 4])
@@ -17347,8 +17361,11 @@ def _test_copysign_numpy(a, b):
            # Use double copysign to verify the correctnes of 0.0 and -0.0, since
            # it always True for self.assertEqual(0.0 == -0.0). So, we use 1 as the
            # magnitude to verify the sign between torch and numpy results, elementwise.
-            self.assertEqual(torch.copysign(torch.tensor(1.0), torch_result),
-                             torch.copysign(torch.tensor(1.0), expected))
+            # Special case: NaN conversion between FP32 and FP16 is not bitwise
+            # equivalent, so this assertion is skipped for half-precision inputs.
+            if a.dtype != torch.float16 and b.dtype != torch.float16:
+                self.assertEqual(torch.copysign(torch.tensor(1.0), torch_result),
+                                 torch.copysign(torch.tensor(1.0), expected))
 
        # Compare Result with NumPy
        # Type promotion
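The guard added around the copysign assertion reflects that the sign bit of a NaN is not guaranteed to be preserved across an FP32 <-> FP16 round trip. A minimal sketch (not part of this patch) of the behaviour being avoided:

import torch

# An fp32 NaN with the sign bit set.
neg_nan = -torch.tensor(float("nan"), dtype=torch.float32)
# copysign with magnitude 1 exposes the sign of the NaN.
sign_fp32 = torch.copysign(torch.tensor(1.0), neg_nan)
# Round-trip through half precision; the conversion kernel may or may not
# carry the NaN's sign bit along.
roundtrip = neg_nan.half().float()
sign_fp16 = torch.copysign(torch.tensor(1.0), roundtrip)
# The two signs can disagree, which is why the assertion is skipped whenever
# either input dtype is torch.float16.
print(sign_fp32.item(), sign_fp16.item())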