From 3c21c9d019fe4fdb62c286606f323f3549ea9b4d Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Wed, 23 Sep 2020 13:44:27 +0800
Subject: [PATCH] Wrap Array1<T> as torch::Tensor. (#173)

* Wrap Array1<T> as torch::Tensor.

Fix k2host test cases.

* interpret arc.weight from a float to an int.

* update the comment for torch.h/torch.cu

* fix linker errors for release build.
---
 .flake8                                       |   8 +
 .github/workflows/style_check.yml             |   7 +-
 CMakeLists.txt                                |   2 +-
 cmake/pybind11.cmake                          |   6 +
 k2/csrc/CMakeLists.txt                        |   2 +-
 k2/csrc/default_context.cu                    |   4 +-
 k2/csrc/host/CMakeLists.txt                   |   2 +-
 k2/csrc/host/fsa.cc                           |   3 +-
 k2/csrc/pytorch_context.cu                    |  23 ++
 k2/csrc/pytorch_context.h                     |  39 ++-
 k2/python/CMakeLists.txt                      |   1 +
 k2/python/csrc/CMakeLists.txt                 |  33 +-
 k2/python/csrc/aux_labels.h                   |  14 -
 k2/python/csrc/fsa_algo.h                     |  14 -
 k2/python/csrc/fsa_equivalent.h               |  14 -
 k2/python/csrc/fsa_util.h                     |  14 -
 k2/python/csrc/k2.cc                          |  30 --
 k2/python/csrc/k2.cu                          |  18 +
 k2/python/csrc/k2.h                           |  15 +-
 k2/python/csrc/properties.h                   |  14 -
 k2/python/csrc/torch.cu                       |  23 ++
 k2/python/csrc/torch.h                        |  18 +
 k2/python/csrc/torch/CMakeLists.txt           |  12 +
 k2/python/csrc/torch/array.cu                 |  77 ++++
 k2/python/csrc/torch/array.h                  |  18 +
 k2/python/csrc/torch/torch_util.cu            |  27 ++
 k2/python/csrc/torch/torch_util.h             |  67 ++++
 k2/python/csrc/weights.h                      |  14 -
 k2/python/host/CMakeLists.txt                 |   2 +
 k2/python/host/csrc/CMakeLists.txt            |  16 +
 k2/python/{ => host}/csrc/CPPLINT.cfg         |   0
 k2/python/{ => host}/csrc/README.md           |   0
 k2/python/{ => host}/csrc/array.cc            |   5 +-
 k2/python/{ => host}/csrc/array.h             |  10 +-
 k2/python/{ => host}/csrc/aux_labels.cc       |   4 +-
 k2/python/host/csrc/aux_labels.h              |  14 +
 k2/python/{ => host}/csrc/dlpack.h            |   0
 k2/python/{ => host}/csrc/fsa.cc              |   7 +-
 k2/python/{ => host}/csrc/fsa.h               |  10 +-
 k2/python/{ => host}/csrc/fsa_algo.cc         |   6 +-
 k2/python/host/csrc/fsa_algo.h                |  14 +
 k2/python/{ => host}/csrc/fsa_equivalent.cc   |   4 +-
 k2/python/host/csrc/fsa_equivalent.h          |  14 +
 k2/python/{ => host}/csrc/fsa_util.cc         |   4 +-
 k2/python/host/csrc/fsa_util.h                |  14 +
 k2/python/host/csrc/k2.cc                     |  30 ++
 k2/python/host/csrc/k2.h                      |  16 +
 k2/python/{ => host}/csrc/properties.cc       |   4 +-
 k2/python/host/csrc/properties.h              |  14 +
 k2/python/{ => host}/csrc/tensor.cc           |   4 +-
 k2/python/{ => host}/csrc/tensor.h            |  12 +-
 k2/python/{ => host}/csrc/weights.cc          |   4 +-
 k2/python/host/csrc/weights.h                 |  14 +
 k2/python/host/k2host/__init__.py             |  10 +
 k2/python/host/k2host/array.py                | 107 ++++++
 k2/python/{k2 => host/k2host}/aux_labels.py   |   8 +-
 k2/python/{k2 => host/k2host}/fsa.py          |  25 +-
 k2/python/{k2 => host/k2host}/fsa_algo.py     |  48 +--
 .../{k2 => host/k2host}/fsa_equivalent.py     |  32 +-
 k2/python/{k2 => host/k2host}/fsa_util.py     |  15 +-
 k2/python/{k2 => host/k2host}/properties.py   |  18 +-
 k2/python/{k2 => host/k2host}/weights.py      |  10 +-
 k2/python/host/tests/CMakeLists.txt           |  36 ++
 k2/python/host/tests/arcsort_test.py          | 110 ++++++
 k2/python/host/tests/array_test.py            | 107 ++++++
 k2/python/{ => host}/tests/aux_labels_test.py | 152 ++++----
 k2/python/host/tests/connect_test.py          | 132 +++++++
 k2/python/host/tests/determinize_test.py      | 105 ++++++
 .../{ => host}/tests/fsa_equivalent_test.py   | 144 ++++----
 k2/python/{ => host}/tests/fsa_test.py        |  38 +-
 k2/python/host/tests/intersect_test.py        |  91 +++++
 k2/python/host/tests/properties_test.py       | 330 ++++++++++++++++++
 k2/python/host/tests/rmepsilon_test.py        | 110 ++++++
 k2/python/host/tests/topsort_test.py          | 110 ++++++
 k2/python/{ => host}/tests/weights_test.py    |  31 +-
 k2/python/k2/__init__.py                      |  13 +-
 k2/python/k2/array.py                         | 158 +++------
 k2/python/tests/CMakeLists.txt                |  13 +-
 k2/python/tests/arcsort_test.py               | 101 ------
 k2/python/tests/array_test.py                 | 186 +++++-----
 k2/python/tests/connect_test.py               | 131 -------
 k2/python/tests/determinize_test.py           | 106 ------
 k2/python/tests/intersect_test.py             |  90 -----
 k2/python/tests/properties_test.py            | 330 ------------------
 k2/python/tests/rmepsilon_test.py             | 110 ------
 k2/python/tests/topsort_test.py               | 109 ------
 86 files changed, 2226 insertions(+), 1631 deletions(-)
 delete mode 100644 k2/python/csrc/aux_labels.h
 delete mode 100644 k2/python/csrc/fsa_algo.h
 delete mode 100644 k2/python/csrc/fsa_equivalent.h
 delete mode 100644 k2/python/csrc/fsa_util.h
 delete mode 100644 k2/python/csrc/k2.cc
 create mode 100644 k2/python/csrc/k2.cu
 delete mode 100644 k2/python/csrc/properties.h
 create mode 100644 k2/python/csrc/torch.cu
 create mode 100644 k2/python/csrc/torch.h
 create mode 100644 k2/python/csrc/torch/CMakeLists.txt
 create mode 100644 k2/python/csrc/torch/array.cu
 create mode 100644 k2/python/csrc/torch/array.h
 create mode 100644 k2/python/csrc/torch/torch_util.cu
 create mode 100644 k2/python/csrc/torch/torch_util.h
 delete mode 100644 k2/python/csrc/weights.h
 create mode 100644 k2/python/host/CMakeLists.txt
 create mode 100644 k2/python/host/csrc/CMakeLists.txt
 rename k2/python/{ => host}/csrc/CPPLINT.cfg (100%)
 rename k2/python/{ => host}/csrc/README.md (100%)
 rename k2/python/{ => host}/csrc/array.cc (98%)
 rename k2/python/{ => host}/csrc/array.h (53%)
 rename k2/python/{ => host}/csrc/aux_labels.cc (95%)
 create mode 100644 k2/python/host/csrc/aux_labels.h
 rename k2/python/{ => host}/csrc/dlpack.h (100%)
 rename k2/python/{ => host}/csrc/fsa.cc (95%)
 rename k2/python/{ => host}/csrc/fsa.h (53%)
 rename k2/python/{ => host}/csrc/fsa_algo.cc (97%)
 create mode 100644 k2/python/host/csrc/fsa_algo.h
 rename k2/python/{ => host}/csrc/fsa_equivalent.cc (96%)
 create mode 100644 k2/python/host/csrc/fsa_equivalent.h
 rename k2/python/{ => host}/csrc/fsa_util.cc (77%)
 create mode 100644 k2/python/host/csrc/fsa_util.h
 create mode 100644 k2/python/host/csrc/k2.cc
 create mode 100644 k2/python/host/csrc/k2.h
 rename k2/python/{ => host}/csrc/properties.cc (93%)
 create mode 100644 k2/python/host/csrc/properties.h
 rename k2/python/{ => host}/csrc/tensor.cc (98%)
 rename k2/python/{ => host}/csrc/tensor.h (91%)
 rename k2/python/{ => host}/csrc/weights.cc (95%)
 create mode 100644 k2/python/host/csrc/weights.h
 create mode 100644 k2/python/host/k2host/__init__.py
 create mode 100644 k2/python/host/k2host/array.py
 rename k2/python/{k2 => host/k2host}/aux_labels.py (91%)
 rename k2/python/{k2 => host/k2host}/fsa.py (70%)
 rename k2/python/{k2 => host/k2host}/fsa_algo.py (71%)
 rename k2/python/{k2 => host/k2host}/fsa_equivalent.py (62%)
 rename k2/python/{k2 => host/k2host}/fsa_util.py (79%)
 rename k2/python/{k2 => host/k2host}/properties.py (75%)
 rename k2/python/{k2 => host/k2host}/weights.py (71%)
 create mode 100644 k2/python/host/tests/CMakeLists.txt
 create mode 100644 k2/python/host/tests/arcsort_test.py
 create mode 100644 k2/python/host/tests/array_test.py
 rename k2/python/{ => host}/tests/aux_labels_test.py (55%)
 create mode 100644 k2/python/host/tests/connect_test.py
 create mode 100644 k2/python/host/tests/determinize_test.py
 rename k2/python/{ => host}/tests/fsa_equivalent_test.py (53%)
 rename k2/python/{ => host}/tests/fsa_test.py (55%)
 create mode 100644 k2/python/host/tests/intersect_test.py
 create mode 100644 k2/python/host/tests/properties_test.py
 create mode 100644 k2/python/host/tests/rmepsilon_test.py
 create mode 100644 k2/python/host/tests/topsort_test.py
 rename k2/python/{ => host}/tests/weights_test.py (67%)
 delete mode 100644 k2/python/tests/arcsort_test.py
 delete mode 100644 k2/python/tests/connect_test.py
 delete mode 100644 k2/python/tests/determinize_test.py
 delete mode 100644 k2/python/tests/intersect_test.py
 delete mode 100644 k2/python/tests/properties_test.py
 delete mode 100644 k2/python/tests/rmepsilon_test.py
 delete mode 100644 k2/python/tests/topsort_test.py
diff --git a/.flake8 b/.flake8
index cc56566f7..636561126 100644
--- a/.flake8
+++ b/.flake8
@@ -2,3 +2,11 @@
 show-source=true
 statistics=true
 max-line-length=80
+exclude =
+  .git,
+  build,
+  k2/python/host
+
+ignore =
+  # E127 continuation line over-indented for visual indent
+  E127,
diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml
index 34752981a..aab9c07d1 100644
--- a/.github/workflows/style_check.yml
+++ b/.github/workflows/style_check.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.5, 3.6, 3.7, 3.8]
+        python-version: [3.7, 3.8]
 
     steps:
       - uses: actions/checkout@v2
@@ -32,7 +32,7 @@ jobs:
       - name: Install Python dependencies
         run: |
           python3 -m pip install --upgrade pip
-          python3 -m pip install --upgrade flake8
+          python3 -m pip install --upgrade flake8==3.8.3
 
       - name: Run flake8
         shell: bash
@@ -40,8 +40,7 @@ jobs:
         run: |
           # stop the build if there are Python syntax errors or undefined names
           flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings.
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=79 --statistics
+          flake8 .
 
           # TODO(fangjun): build a docker for style check
           #      - name: Install cppcheck
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 00bc05437..178f342dc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,10 +97,10 @@ enable_testing()
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
 include(pybind11)
 if(USE_PYTORCH)
+  add_definitions(-DK2_USE_PYTORCH)
   include(torch)
 endif()
 include(cub)
 include(googletest)
 
-
 add_subdirectory(k2)
diff --git a/cmake/pybind11.cmake b/cmake/pybind11.cmake
index 78cb28a1e..36b480a7c 100644
--- a/cmake/pybind11.cmake
+++ b/cmake/pybind11.cmake
@@ -11,9 +11,15 @@ function(download_pybind11)
   set(pybind11_URL  "https://github.com/pybind/pybind11/archive/v2.5.0.tar.gz")
   set(pybind11_HASH "SHA256=97504db65640570f32d3fdf701c25a340c8643037c3b69aec469c10c93dc8504")
 
+  set(double_quotes "\"")
+  set(dollar "\$")
+  # NOTE: an unescaped ";" would split the sed script into two arguments.
   FetchContent_Declare(pybind11
     URL               ${pybind11_URL}
     URL_HASH          ${pybind11_HASH}
+    PATCH_COMMAND
+      sed -i s/\\${double_quotes}-flto\\\\${dollar}/\\${double_quotes}-Xcompiler=-flto${dollar}/g "tools/pybind11Tools.cmake" &&
+      sed -i s/-fno-fat-lto-objects/-Xcompiler=-fno-fat-lto-objects/g "tools/pybind11Tools.cmake"
   )
 
   FetchContent_GetProperties(pybind11)
diff --git a/k2/csrc/CMakeLists.txt b/k2/csrc/CMakeLists.txt
index f63c783bb..177fb2b31 100644
--- a/k2/csrc/CMakeLists.txt
+++ b/k2/csrc/CMakeLists.txt
@@ -28,7 +28,7 @@ else()
 endif()
 
 # the target
-add_library(context STATIC ${context_srcs})
+add_library(context SHARED ${context_srcs})
 set_target_properties(context PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
 # lib deps
diff --git a/k2/csrc/default_context.cu b/k2/csrc/default_context.cu
index 8e60bde6a..80d84cc56 100644
--- a/k2/csrc/default_context.cu
+++ b/k2/csrc/default_context.cu
@@ -39,7 +39,7 @@ class CpuContext : public Context {
       int32_t ret = posix_memalign(&p, kAlignment, bytes);
       K2_CHECK_EQ(ret, 0);
     }
-    if (deleter_context) *deleter_context = nullptr;
+    if (deleter_context != nullptr) *deleter_context = nullptr;
     return p;
   }
 
@@ -75,7 +75,7 @@ class CudaContext : public Context {
       auto ret = cudaMalloc(&p, bytes);
       K2_CHECK_CUDA_ERROR(ret);
     }
-    if (deleter_context) *deleter_context = nullptr;
+    if (deleter_context != nullptr) *deleter_context = nullptr;
     return p;
   }
 
diff --git a/k2/csrc/host/CMakeLists.txt b/k2/csrc/host/CMakeLists.txt
index 3616d2358..640ab85f5 100644
--- a/k2/csrc/host/CMakeLists.txt
+++ b/k2/csrc/host/CMakeLists.txt
@@ -4,7 +4,7 @@
 
 # the target
 # please sort the source files alphabetically
-add_library(fsa
+add_library(fsa SHARED
     arcsort.cc
     aux_labels.cc
     connect.cc
diff --git a/k2/csrc/host/fsa.cc b/k2/csrc/host/fsa.cc
index 78f436949..85ca278d2 100644
--- a/k2/csrc/host/fsa.cc
+++ b/k2/csrc/host/fsa.cc
@@ -30,7 +30,8 @@ inline std::size_t AlignTo(std::size_t b, std::size_t alignment) {
 namespace k2host {
 
 std::ostream &operator<<(std::ostream &os, const Arc &arc) {
-  os << arc.src_state << " " << arc.dest_state << " " << arc.label;
+  os << arc.src_state << " " << arc.dest_state << " " << arc.label << " "
+     << arc.weight;
   return os;
 }
 
diff --git a/k2/csrc/pytorch_context.cu b/k2/csrc/pytorch_context.cu
index 25bd5dd91..304108203 100644
--- a/k2/csrc/pytorch_context.cu
+++ b/k2/csrc/pytorch_context.cu
@@ -23,4 +23,27 @@ ContextPtr GetCudaContext(int32_t gpu_id /*= -1*/) {
   return std::make_shared<PytorchCudaContext>(gpu_id);
 }
 
+RegionPtr NewRegion(torch::Tensor &tensor) {
+  auto ans = std::make_shared<Region>();
+  if (tensor.device().type() == torch::kCPU) {
+    ans->context = GetCpuContext();
+  } else if (tensor.is_cuda()) {
+    ans->context = GetCudaContext(tensor.device().index());
+  } else {
+    K2_LOG(FATAL) << "Unsupported device: " << tensor.device()
+                  << "\nOnly CPU and CUDA are supported";
+  }
+
+  // NOTE: the tensor is passed from Python and we have
+  // to retain it to avoid potential segmentation fault.
+  //
+  // It will be freed in `Context::Deallocate`.
+  auto *managed_tensor = new ManagedTensor(tensor);
+  ans->data = tensor.data_ptr();
+  ans->deleter_context = managed_tensor;
+  ans->num_bytes = tensor.nbytes();
+  ans->bytes_used = ans->num_bytes;
+  return ans;
+}
+
 }  // namespace k2
diff --git a/k2/csrc/pytorch_context.h b/k2/csrc/pytorch_context.h
index 9d4f4de57..3f7682045 100644
--- a/k2/csrc/pytorch_context.h
+++ b/k2/csrc/pytorch_context.h
@@ -16,12 +16,21 @@
 #include <memory>
 
 #include "c10/cuda/CUDACachingAllocator.h"
+#include "c10/cuda/CUDAFunctions.h"
 #include "k2/csrc/context.h"
 #include "k2/csrc/log.h"
 #include "torch/torch.h"
 
 namespace k2 {
 
+class ManagedTensor {
+ public:
+  explicit ManagedTensor(torch::Tensor &tensor) : handle_(tensor) {}
+
+ private:
+  torch::Tensor handle_;  // retain a copy of the tensor passed from Python
+};
+
 class PytorchCpuContext : public Context {
  private:
   PytorchCpuContext() {
@@ -46,12 +55,18 @@ class PytorchCpuContext : public Context {
 
   void *Allocate(std::size_t bytes, void **deleter_context) override {
     void *p = allocator_->raw_allocate(bytes);
-    if (deleter_context) *deleter_context = nullptr;
+    if (deleter_context != nullptr) *deleter_context = nullptr;
     return p;
   }
 
-  void Deallocate(void *data, void * /*deleter_context*/) override {
-    allocator_->raw_deallocate(data);
+  void Deallocate(void *data, void *deleter_context) override {
+    if (deleter_context != nullptr) {
+      // A non-null `deleter_context` is a `ManagedTensor *` (see `NewRegion`):
+      // deleting it drops the retained `torch::Tensor` that backs `data`.
+      delete reinterpret_cast<ManagedTensor *>(deleter_context);
+    } else {
+      allocator_->raw_deallocate(data);
+    }
   }
 
   bool IsCompatible(const Context &other) const override {
@@ -94,12 +109,18 @@ class PytorchCudaContext : public Context {
 
   void *Allocate(std::size_t bytes, void **deleter_context) override {
     void *p = allocator_->raw_allocate(bytes);
-    if (deleter_context) *deleter_context = nullptr;
+    if (deleter_context != nullptr) *deleter_context = nullptr;
     return p;
   }
 
-  void Deallocate(void *data, void * /*deleter_context*/) override {
-    allocator_->raw_deallocate(data);
+  void Deallocate(void *data, void *deleter_context) override {
+    if (deleter_context != nullptr) {
+      // A non-null `deleter_context` is a `ManagedTensor *` (see `NewRegion`):
+      // deleting it drops the retained `torch::Tensor` that backs `data`.
+      delete reinterpret_cast<ManagedTensor *>(deleter_context);
+    } else {
+      allocator_->raw_deallocate(data);
+    }
   }
 
   bool IsCompatible(const Context &other) const override {
@@ -116,6 +137,12 @@ class PytorchCudaContext : public Context {
   int32_t gpu_id_;
 };
 
+// Construct a region from a `torch::Tensor`.
+//
+// The resulting region shares the underlying memory with
+// the given tensor.
+RegionPtr NewRegion(torch::Tensor &tensor);
+
 }  // namespace k2
 
 #endif  // K2_CSRC_PYTORCH_CONTEXT_H_
diff --git a/k2/python/CMakeLists.txt b/k2/python/CMakeLists.txt
index 60d6382f6..64f7b3d23 100644
--- a/k2/python/CMakeLists.txt
+++ b/k2/python/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(csrc)
 add_subdirectory(tests)
+add_subdirectory(host)
diff --git a/k2/python/csrc/CMakeLists.txt b/k2/python/csrc/CMakeLists.txt
index e32f0205b..ee0da564b 100644
--- a/k2/python/csrc/CMakeLists.txt
+++ b/k2/python/csrc/CMakeLists.txt
@@ -1,16 +1,23 @@
-# please sort the files alphabetically
-pybind11_add_module(_k2
-  array.cc
-  aux_labels.cc
-  fsa.cc
-  fsa_algo.cc
-  fsa_equivalent.cc
-  fsa_util.cc
-  k2.cc
-  properties.cc
-  tensor.cc
-  weights.cc
+# please keep the list sorted
+set(k2_srcs
+  k2.cu
+  torch.cu
 )
 
-target_include_directories(_k2 PRIVATE ${CMAKE_SOURCE_DIR})
+if(USE_PYTORCH)
+  add_definitions(-DTORCH_API_INCLUDE_EXTENSION_H)
+  add_subdirectory(torch)
+  set(k2_srcs ${k2_srcs} ${torch_srcs})
+  set(k2_deps
+    ${TORCH_LIBRARIES}
+    ${TORCH_DIR}/lib/libtorch_python.so
+  )
+else()
+  message(FATAL_ERROR "Please select a framework.")
+endif()
+
+pybind11_add_module(_k2 ${k2_srcs})
+target_link_libraries(_k2 PRIVATE ${k2_deps})
+target_link_libraries(_k2 PRIVATE context)
 target_link_libraries(_k2 PRIVATE fsa)
+target_include_directories(_k2 PRIVATE ${CMAKE_SOURCE_DIR})
diff --git a/k2/python/csrc/aux_labels.h b/k2/python/csrc/aux_labels.h
deleted file mode 100644
index e2f0883e1..000000000
--- a/k2/python/csrc/aux_labels.h
+++ /dev/null
@@ -1,14 +0,0 @@
-// k2/python/csrc/aux_labels.h
-
-// Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-
-// See ../../../LICENSE for clarification regarding multiple authors
-
-#ifndef K2_PYTHON_CSRC_AUX_LABELS_H_
-#define K2_PYTHON_CSRC_AUX_LABELS_H_
-
-#include "k2/python/csrc/k2.h"
-
-void PybindAuxLabels(py::module &m);
-
-#endif  // K2_PYTHON_CSRC_AUX_LABELS_H_
diff --git a/k2/python/csrc/fsa_algo.h b/k2/python/csrc/fsa_algo.h
deleted file mode 100644
index 531f86590..000000000
--- a/k2/python/csrc/fsa_algo.h
+++ /dev/null
@@ -1,14 +0,0 @@
-// k2/python/csrc/fsa_algo.h
-
-// Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-
-// See ../../../LICENSE for clarification regarding multiple authors
-
-#ifndef K2_PYTHON_CSRC_FSA_ALGO_H_
-#define K2_PYTHON_CSRC_FSA_ALGO_H_
-
-#include "k2/python/csrc/k2.h"
-
-void PybindFsaAlgo(py::module &m);
-
-#endif  // K2_PYTHON_CSRC_FSA_ALGO_H_
diff --git a/k2/python/csrc/fsa_equivalent.h b/k2/python/csrc/fsa_equivalent.h
deleted file mode 100644
index 11c1c8aa5..000000000
--- a/k2/python/csrc/fsa_equivalent.h
+++ /dev/null
@@ -1,14 +0,0 @@
-// k2/python/csrc/fsa_equivalent.h
-
-// Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-
-// See ../../../LICENSE for clarification regarding multiple authors
-
-#ifndef K2_PYTHON_CSRC_FSA_EQUIVALENT_H_
-#define K2_PYTHON_CSRC_FSA_EQUIVALENT_H_
-
-#include "k2/python/csrc/k2.h"
-
-void PybindFsaEquivalent(py::module &m);
-
-#endif  // K2_PYTHON_CSRC_FSA_EQUIVALENT_H_
diff --git a/k2/python/csrc/fsa_util.h b/k2/python/csrc/fsa_util.h
deleted file mode 100644
index c97d8b8a8..000000000
--- a/k2/python/csrc/fsa_util.h
+++ /dev/null
@@ -1,14 +0,0 @@
-// k2/python/csrc/fsa_util.h
-
-// Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
-
-// See ../../../LICENSE for clarification regarding multiple authors
-
-#ifndef K2_PYTHON_CSRC_FSA_UTIL_H_
-#define K2_PYTHON_CSRC_FSA_UTIL_H_
-
-#include "k2/python/csrc/k2.h"
-
-void PybindFsaUtil(py::module &m);
-
-#endif  // K2_PYTHON_CSRC_FSA_UTIL_H_
diff --git a/k2/python/csrc/k2.cc b/k2/python/csrc/k2.cc
deleted file mode 100644
index 5968ce227..000000000
--- a/k2/python/csrc/k2.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-// k2/python/csrc/k2.cc
-
-// Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
-
-// See ../../../LICENSE for clarification regarding multiple authors
-
-#include "k2/python/csrc/k2.h"
-
-#include "k2/python/csrc/array.h"
-#include "k2/python/csrc/aux_labels.h"
-#include "k2/python/csrc/fsa.h"
-#include "k2/python/csrc/fsa_algo.h"
-#include "k2/python/csrc/fsa_equivalent.h"
-#include "k2/python/csrc/fsa_util.h"
-#include "k2/python/csrc/properties.h"
-#include "k2/python/csrc/weights.h"
-
-PYBIND11_MODULE(_k2, m) {
-  m.doc() = "pybind11 binding of k2";
-  PybindArc(m);
-  PybindArray(m);
-  PybindArray2Size(m);
-  PybindFsa(m);
-  PybindFsaUtil(m);
-  PybindFsaAlgo(m);
-  PybindFsaEquivalent(m);
-  PybindProperties(m);
-  PybindAuxLabels(m);
-  PybindWeights(m);
-}
diff --git a/k2/python/csrc/k2.cu b/k2/python/csrc/k2.cu
new file mode 100644
index 000000000..1467a4912
--- /dev/null
+++ b/k2/python/csrc/k2.cu
@@ -0,0 +1,18 @@
+/**
+ * @brief python wrappers for k2.
+ *
+ * @copyright
+ * Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+ *
+ * @copyright
+ * See LICENSE for clarification regarding multiple authors
+ */
+
+#include "k2/python/csrc/k2.h"
+
+#include "k2/python/csrc/torch.h"
+
+PYBIND11_MODULE(_k2, m) {
+  m.doc() = "pybind11 binding of k2";
+  PybindTorch(m);
+}
diff --git a/k2/python/csrc/k2.h b/k2/python/csrc/k2.h
index 764299d32..2aaac5489 100644
--- a/k2/python/csrc/k2.h
+++ b/k2/python/csrc/k2.h
@@ -1,15 +1,18 @@
-// k2/python/csrc/k2.h
-
-// Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
-
-// See ../../../LICENSE for clarification regarding multiple authors
+/**
+ * @brief python wrappers for k2.
+ *
+ * @copyright
+ * Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+ *
+ * @copyright
+ * See LICENSE for clarification regarding multiple authors
+ */
 
 #ifndef K2_PYTHON_CSRC_K2_H_
 #define K2_PYTHON_CSRC_K2_H_
 
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
-#include "k2/csrc/log.h"
 
 namespace py = pybind11;
 
diff --git a/k2/python/csrc/properties.h b/k2/python/csrc/properties.h
deleted file mode 100644
index 04cd2148e..000000000
--- a/k2/python/csrc/properties.h
+++ /dev/null
@@ -1,14 +0,0 @@
-// k2/python/csrc/properties.h
-
-// Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-
-// See ../../../LICENSE for clarification regarding multiple authors
-
-#ifndef K2_PYTHON_CSRC_PROPERTIES_H_
-#define K2_PYTHON_CSRC_PROPERTIES_H_
-
-#include "k2/python/csrc/k2.h"
-
-void PybindProperties(py::module &m);
-
-#endif  // K2_PYTHON_CSRC_PROPERTIES_H_
diff --git a/k2/python/csrc/torch.cu b/k2/python/csrc/torch.cu
new file mode 100644
index 000000000..5ecd1d170
--- /dev/null
+++ b/k2/python/csrc/torch.cu
@@ -0,0 +1,23 @@
+/**
+ * @brief Everything related to PyTorch for k2 Python wrappers.
+ *
+ * @copyright
+ * Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+ *
+ * @copyright
+ * See LICENSE for clarification regarding multiple authors
+ */
+
+#include "k2/python/csrc/torch.h"
+
+#if defined(K2_USE_PYTORCH)
+
+#include "k2/python/csrc/torch/array.h"
+
+void PybindTorch(py::module &m) { PybindArray(m); }
+
+#else
+
+void PybindTorch(py::module &) {}
+
+#endif
diff --git a/k2/python/csrc/torch.h b/k2/python/csrc/torch.h
new file mode 100644
index 000000000..1f85b2d1b
--- /dev/null
+++ b/k2/python/csrc/torch.h
@@ -0,0 +1,18 @@
+/**
+ * @brief Everything related to PyTorch for k2 Python wrappers.
+ *
+ * @copyright
+ * Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+ *
+ * @copyright
+ * See LICENSE for clarification regarding multiple authors
+ */
+
+#ifndef K2_PYTHON_CSRC_TORCH_H_
+#define K2_PYTHON_CSRC_TORCH_H_
+
+#include "k2/python/csrc/k2.h"
+
+void PybindTorch(py::module &m);
+
+#endif  // K2_PYTHON_CSRC_TORCH_H_
diff --git a/k2/python/csrc/torch/CMakeLists.txt b/k2/python/csrc/torch/CMakeLists.txt
new file mode 100644
index 000000000..166e8e0b8
--- /dev/null
+++ b/k2/python/csrc/torch/CMakeLists.txt
@@ -0,0 +1,12 @@
+# please keep the list sorted
+set(torch_srcs
+  array.cu
+  torch_util.cu
+)
+
+set(torch_srcs_with_prefix)
+foreach(src IN LISTS torch_srcs)
+  list(APPEND torch_srcs_with_prefix "torch/${src}")
+endforeach()
+
+set(torch_srcs ${torch_srcs_with_prefix} PARENT_SCOPE)
diff --git a/k2/python/csrc/torch/array.cu b/k2/python/csrc/torch/array.cu
new file mode 100644
index 000000000..5ef86dd97
--- /dev/null
+++ b/k2/python/csrc/torch/array.cu
@@ -0,0 +1,77 @@
+/**
+ * @brief python wrappers for Array.
+ *
+ * @copyright
+ * Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+ *
+ * @copyright
+ * See LICENSE for clarification regarding multiple authors
+ */
+
+#include <type_traits>
+
+#include "c10/core/ScalarType.h"
+#include "k2/csrc/array.h"
+#include "k2/csrc/pytorch_context.h"
+#include "k2/python/csrc/torch/array.h"
+#include "k2/python/csrc/torch/torch_util.h"
+#include "torch/extension.h"
+
+namespace k2 {
+
+template <typename T>
+static void PybindArray1Tpl(py::module &m, const char *name) {
+  using PyClass = Array1<T>;
+  py::class_<PyClass> pyclass(m, name);
+  pyclass.def(py::init<>());
+  pyclass.def("tensor", [](PyClass &self) { return ToTensor(self); });
+
+  pyclass.def_static(
+      "from_tensor",
+      [](torch::Tensor &tensor) { return FromTensor<T>(tensor); },
+      py::arg("tensor"));
+
+  // the following functions are for testing only
+  pyclass.def(
+      "get", [](const PyClass &self, int32_t i) { return self[i]; },
+      py::arg("i"));
+  pyclass.def("__str__", [](const PyClass &self) {
+    std::ostringstream os;
+    os << self;
+    return os.str();
+  });
+}
+
+static void PybindArrayImpl(py::module &m) {
+  // users should not use classes with prefix `_` in Python.
+  PybindArray1Tpl<float>(m, "_FloatArray1");
+  PybindArray1Tpl<int>(m, "_Int32Array1");
+
+  // the following functions are for testing purposes
+  // and they can be removed later.
+  m.def("get_cpu_float_array1", []() {
+    return Array1<float>(GetCpuContext(), {1, 2, 3, 4});
+  });
+
+  m.def("get_cpu_int_array1", []() {
+    return Array1<int32_t>(GetCpuContext(), {1, 2, 3, 4});
+  });
+
+  m.def(
+      "get_cuda_float_array1",
+      [](int32_t gpu_id = -1) {
+        return Array1<float>(GetCudaContext(gpu_id), {0, 1, 2, 3});
+      },
+      py::arg("gpu_id") = -1);
+
+  m.def(
+      "get_cuda_int_array1",
+      [](int32_t gpu_id = -1) {
+        return Array1<int32_t>(GetCudaContext(gpu_id), {0, 1, 2, 3});
+      },
+      py::arg("gpu_id") = -1);
+}
+
+}  // namespace k2
+
+void PybindArray(py::module &m) { k2::PybindArrayImpl(m); }
diff --git a/k2/python/csrc/torch/array.h b/k2/python/csrc/torch/array.h
new file mode 100644
index 000000000..eef25e043
--- /dev/null
+++ b/k2/python/csrc/torch/array.h
@@ -0,0 +1,18 @@
+/**
+ * @brief python wrappers for Array.
+ *
+ * @copyright
+ * Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+ *
+ * @copyright
+ * See LICENSE for clarification regarding multiple authors
+ */
+
+#ifndef K2_PYTHON_CSRC_TORCH_ARRAY_H_
+#define K2_PYTHON_CSRC_TORCH_ARRAY_H_
+
+#include "k2/python/csrc/k2.h"
+
+void PybindArray(py::module &m);
+
+#endif  // K2_PYTHON_CSRC_TORCH_ARRAY_H_
diff --git a/k2/python/csrc/torch/torch_util.cu b/k2/python/csrc/torch/torch_util.cu
new file mode 100644
index 000000000..db70fc86a
--- /dev/null
+++ b/k2/python/csrc/torch/torch_util.cu
@@ -0,0 +1,27 @@
+/**
+ * @copyright
+ * Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+ *
+ * @copyright
+ * See LICENSE for clarification regarding multiple authors
+ */
+
+#include "k2/python/csrc/torch/torch_util.h"
+#include "torch/extension.h"
+
+namespace k2 {
+
+torch::DeviceType ToTorchDeviceType(DeviceType type) {
+  switch (type) {
+    case kCuda:
+      return torch::kCUDA;
+    case kCpu:
+      return torch::kCPU;
+    case kUnk:  // fall-through
+    default:
+      K2_LOG(FATAL) << "kUnk is not supported!";
+      return torch::kCPU;  // unreachable code
+  }
+}
+
+}  // namespace k2
diff --git a/k2/python/csrc/torch/torch_util.h b/k2/python/csrc/torch/torch_util.h
new file mode 100644
index 000000000..b1acfdb6a
--- /dev/null
+++ b/k2/python/csrc/torch/torch_util.h
@@ -0,0 +1,67 @@
+/**
+ * @copyright
+ * Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+ *
+ * @copyright
+ * See LICENSE for clarification regarding multiple authors
+ */
+
+#ifndef K2_PYTHON_CSRC_TORCH_TORCH_UTIL_H_
+#define K2_PYTHON_CSRC_TORCH_TORCH_UTIL_H_
+
+#include "k2/csrc/array.h"
+#include "k2/csrc/log.h"
+#include "k2/csrc/pytorch_context.h"
+#include "torch/extension.h"
+
+namespace k2 {
+
+torch::DeviceType ToTorchDeviceType(DeviceType type);
+
+// Some versions of PyTorch do not have `c10::CppTypeToScalarType`,
+// so we implement our own here.
+template <typename T>
+struct ToScalarType;
+
+#define TO_SCALAR_TYPE(cpp_type, scalar_type) \
+  template <>                                 \
+  struct ToScalarType<cpp_type>               \
+      : std::integral_constant<torch::ScalarType, scalar_type> {};
+
+// TODO(fangjun): add other types if needed
+TO_SCALAR_TYPE(float, torch::kFloat);
+TO_SCALAR_TYPE(int, torch::kInt);
+
+#undef TO_SCALAR_TYPE
+
+template <typename T>
+torch::Tensor ToTensor(Array1<T> &array) {
+  auto device_type = ToTorchDeviceType(array.Context()->GetDeviceType());
+  int32_t device_id = array.Context()->GetDeviceId();
+  auto device = torch::Device(device_type, device_id);
+  auto scalar_type = ToScalarType<T>::value;
+  auto options = torch::device(device).dtype(scalar_type);
+
+  // NOTE: we keep a copy of `array` inside the lambda
+  // so that `torch::Tensor` always accesses valid memory.
+  return torch::from_blob(
+      array.Data(), array.Dim(), [array](void *p) {}, options);
+}
+
+template <typename T>
+Array1<T> FromTensor(torch::Tensor &tensor) {
+  K2_CHECK_EQ(tensor.dim(), 1) << "Expected dim: 1. Given: " << tensor.dim();
+  K2_CHECK_EQ(tensor.scalar_type(), ToScalarType<T>::value)
+      << "Expected scalar type: " << ToScalarType<T>::value
+      << ". Given: " << tensor.scalar_type();
+  K2_CHECK_EQ(tensor.strides()[0], 1)
+      << "Expected stride: 1. Given: " << tensor.strides()[0];
+
+  auto region = NewRegion(tensor);
+  Array1<T> ans(tensor.numel(), region, 0);
+  return ans;
+}
+
+}  // namespace k2
+
+#endif  // K2_PYTHON_CSRC_TORCH_TORCH_UTIL_H_
diff --git a/k2/python/csrc/weights.h b/k2/python/csrc/weights.h
deleted file mode 100644
index 7c1bd04cd..000000000
--- a/k2/python/csrc/weights.h
+++ /dev/null
@@ -1,14 +0,0 @@
-// k2/python/csrc/weights.h
-
-// Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-
-// See ../../../LICENSE for clarification regarding multiple authors
-
-#ifndef K2_PYTHON_CSRC_WEIGHTS_H_
-#define K2_PYTHON_CSRC_WEIGHTS_H_
-
-#include "k2/python/csrc/k2.h"
-
-void PybindWeights(py::module &m);
-
-#endif  // K2_PYTHON_CSRC_WEIGHTS_H_
diff --git a/k2/python/host/CMakeLists.txt b/k2/python/host/CMakeLists.txt
new file mode 100644
index 000000000..60d6382f6
--- /dev/null
+++ b/k2/python/host/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(csrc)
+add_subdirectory(tests)
diff --git a/k2/python/host/csrc/CMakeLists.txt b/k2/python/host/csrc/CMakeLists.txt
new file mode 100644
index 000000000..b6bc530d1
--- /dev/null
+++ b/k2/python/host/csrc/CMakeLists.txt
@@ -0,0 +1,16 @@
+# please sort the files alphabetically
+pybind11_add_module(_k2host
+  array.cc
+  aux_labels.cc
+  fsa.cc
+  fsa_algo.cc
+  fsa_equivalent.cc
+  fsa_util.cc
+  k2.cc
+  properties.cc
+  tensor.cc
+  weights.cc
+)
+
+target_include_directories(_k2host PRIVATE ${CMAKE_SOURCE_DIR})
+target_link_libraries(_k2host PRIVATE fsa)
diff --git a/k2/python/csrc/CPPLINT.cfg b/k2/python/host/csrc/CPPLINT.cfg
similarity index 100%
rename from k2/python/csrc/CPPLINT.cfg
rename to k2/python/host/csrc/CPPLINT.cfg
diff --git a/k2/python/csrc/README.md b/k2/python/host/csrc/README.md
similarity index 100%
rename from k2/python/csrc/README.md
rename to k2/python/host/csrc/README.md
diff --git a/k2/python/csrc/array.cc b/k2/python/host/csrc/array.cc
similarity index 98%
rename from k2/python/csrc/array.cc
rename to k2/python/host/csrc/array.cc
index 3f9ffa418..04aee9a92 100644
--- a/k2/python/csrc/array.cc
+++ b/k2/python/host/csrc/array.cc
@@ -1,15 +1,16 @@
+// k2/python/host/csrc/array.cc
 // Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#include "k2/python/csrc/array.h"
+#include "k2/python/host/csrc/array.h"
 
 #include <memory>
 #include <utility>
 
 #include "k2/csrc/host/array.h"
 #include "k2/csrc/host/determinize_impl.h"
-#include "k2/python/csrc/tensor.h"
+#include "k2/python/host/csrc/tensor.h"
 
 namespace k2host {
 
diff --git a/k2/python/csrc/array.h b/k2/python/host/csrc/array.h
similarity index 53%
rename from k2/python/csrc/array.h
rename to k2/python/host/csrc/array.h
index 14d5d267f..76cba74d1 100644
--- a/k2/python/csrc/array.h
+++ b/k2/python/host/csrc/array.h
@@ -1,15 +1,15 @@
-// k2/python/csrc/array.h
+// k2/python/host/csrc/array.h
 
 // Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#ifndef K2_PYTHON_CSRC_ARRAY_H_
-#define K2_PYTHON_CSRC_ARRAY_H_
+#ifndef K2_PYTHON_HOST_CSRC_ARRAY_H_
+#define K2_PYTHON_HOST_CSRC_ARRAY_H_
 
-#include "k2/python/csrc/k2.h"
+#include "k2/python/host/csrc/k2.h"
 
 void PybindArray(py::module &m);
 void PybindArray2Size(py::module &m);
 
-#endif  // K2_PYTHON_CSRC_ARRAY_H_
+#endif  // K2_PYTHON_HOST_CSRC_ARRAY_H_
diff --git a/k2/python/csrc/aux_labels.cc b/k2/python/host/csrc/aux_labels.cc
similarity index 95%
rename from k2/python/csrc/aux_labels.cc
rename to k2/python/host/csrc/aux_labels.cc
index b73b03676..c4db30f20 100644
--- a/k2/python/csrc/aux_labels.cc
+++ b/k2/python/host/csrc/aux_labels.cc
@@ -1,10 +1,10 @@
-// k2/python/csrc/aux_labels.cc
+// k2/python/host/csrc/aux_labels.cc
 
 // Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#include "k2/python/csrc/aux_labels.h"
+#include "k2/python/host/csrc/aux_labels.h"
 
 #include "k2/csrc/host/aux_labels.h"
 
diff --git a/k2/python/host/csrc/aux_labels.h b/k2/python/host/csrc/aux_labels.h
new file mode 100644
index 000000000..b331c5e42
--- /dev/null
+++ b/k2/python/host/csrc/aux_labels.h
@@ -0,0 +1,14 @@
+// k2/python/host/csrc/aux_labels.h
+
+// Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+
+// See ../../../../LICENSE for clarification regarding multiple authors
+
+#ifndef K2_PYTHON_HOST_CSRC_AUX_LABELS_H_
+#define K2_PYTHON_HOST_CSRC_AUX_LABELS_H_
+
+#include "k2/python/host/csrc/k2.h"
+
+void PybindAuxLabels(py::module &m);
+
+#endif  // K2_PYTHON_HOST_CSRC_AUX_LABELS_H_
diff --git a/k2/python/csrc/dlpack.h b/k2/python/host/csrc/dlpack.h
similarity index 100%
rename from k2/python/csrc/dlpack.h
rename to k2/python/host/csrc/dlpack.h
diff --git a/k2/python/csrc/fsa.cc b/k2/python/host/csrc/fsa.cc
similarity index 95%
rename from k2/python/csrc/fsa.cc
rename to k2/python/host/csrc/fsa.cc
index 3f13a5bb2..35b9564ae 100644
--- a/k2/python/csrc/fsa.cc
+++ b/k2/python/host/csrc/fsa.cc
@@ -1,17 +1,17 @@
-// k2/python/csrc/fsa.cc
+// k2/python/host/csrc/fsa.cc
 
 // Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
 //                      Xiaomi Corporation (author: Haowen Qiu)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#include "k2/python/csrc/fsa.h"
+#include "k2/python/host/csrc/fsa.h"
 
 #include <memory>
 #include <sstream>
 
 #include "k2/csrc/host/fsa.h"
-#include "k2/python/csrc/tensor.h"
+#include "k2/python/host/csrc/tensor.h"
 
 namespace k2host {
 
@@ -58,6 +58,7 @@ void PybindArc(py::module &m) {
       .def_readwrite("src_state", &PyClass::src_state)
       .def_readwrite("dest_state", &PyClass::dest_state)
       .def_readwrite("label", &PyClass::label)
+      .def_readwrite("weight", &PyClass::weight)
       .def("__str__", [](const PyClass &self) {
         std::ostringstream os;
         os << self;
diff --git a/k2/python/csrc/fsa.h b/k2/python/host/csrc/fsa.h
similarity index 53%
rename from k2/python/csrc/fsa.h
rename to k2/python/host/csrc/fsa.h
index 9090a76c8..10f891649 100644
--- a/k2/python/csrc/fsa.h
+++ b/k2/python/host/csrc/fsa.h
@@ -1,15 +1,15 @@
-// k2/python/csrc/fsa.h
+// k2/python/host/csrc/fsa.h
 
 // Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#ifndef K2_PYTHON_CSRC_FSA_H_
-#define K2_PYTHON_CSRC_FSA_H_
+#ifndef K2_PYTHON_HOST_CSRC_FSA_H_
+#define K2_PYTHON_HOST_CSRC_FSA_H_
 
-#include "k2/python/csrc/k2.h"
+#include "k2/python/host/csrc/k2.h"
 
 void PybindArc(py::module &m);
 void PybindFsa(py::module &m);
 
-#endif  // K2_PYTHON_CSRC_FSA_H_
+#endif  // K2_PYTHON_HOST_CSRC_FSA_H_
diff --git a/k2/python/csrc/fsa_algo.cc b/k2/python/host/csrc/fsa_algo.cc
similarity index 97%
rename from k2/python/csrc/fsa_algo.cc
rename to k2/python/host/csrc/fsa_algo.cc
index 205d6470f..a3618633a 100644
--- a/k2/python/csrc/fsa_algo.cc
+++ b/k2/python/host/csrc/fsa_algo.cc
@@ -1,10 +1,10 @@
-// k2/python/csrc/fsa_algo.cc
+// k2/python/host/csrc/fsa_algo.cc
 
 // Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#include "k2/python/csrc/fsa_algo.h"
+#include "k2/python/host/csrc/fsa_algo.h"
 
 #include <memory>
 #include <utility>
@@ -19,7 +19,7 @@
 #include "k2/csrc/host/rmepsilon.h"
 #include "k2/csrc/host/topsort.h"
 #include "k2/csrc/host/weights.h"
-#include "k2/python/csrc/array.h"
+#include "k2/python/host/csrc/array.h"
 
 void PyBindArcSort(py::module &m) {
   using PyClass = k2host::ArcSorter;
diff --git a/k2/python/host/csrc/fsa_algo.h b/k2/python/host/csrc/fsa_algo.h
new file mode 100644
index 000000000..34a294a79
--- /dev/null
+++ b/k2/python/host/csrc/fsa_algo.h
@@ -0,0 +1,14 @@
+// k2/python/host/csrc/fsa_algo.h
+
+// Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+
+// See ../../../../LICENSE for clarification regarding multiple authors
+
+#ifndef K2_PYTHON_HOST_CSRC_FSA_ALGO_H_
+#define K2_PYTHON_HOST_CSRC_FSA_ALGO_H_
+
+#include "k2/python/host/csrc/k2.h"
+
+void PybindFsaAlgo(py::module &m);
+
+#endif  // K2_PYTHON_HOST_CSRC_FSA_ALGO_H_
diff --git a/k2/python/csrc/fsa_equivalent.cc b/k2/python/host/csrc/fsa_equivalent.cc
similarity index 96%
rename from k2/python/csrc/fsa_equivalent.cc
rename to k2/python/host/csrc/fsa_equivalent.cc
index 8281a2f7b..eb0917c17 100644
--- a/k2/python/csrc/fsa_equivalent.cc
+++ b/k2/python/host/csrc/fsa_equivalent.cc
@@ -1,10 +1,10 @@
-// k2/python/csrc/fsa_equivalent.cc
+// k2/python/host/csrc/fsa_equivalent.cc
 
 // Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#include "k2/python/csrc/fsa_equivalent.h"
+#include "k2/python/host/csrc/fsa_equivalent.h"
 
 #include "k2/csrc/host/array.h"
 #include "k2/csrc/host/fsa_equivalent.h"
diff --git a/k2/python/host/csrc/fsa_equivalent.h b/k2/python/host/csrc/fsa_equivalent.h
new file mode 100644
index 000000000..c0b9ea4b9
--- /dev/null
+++ b/k2/python/host/csrc/fsa_equivalent.h
@@ -0,0 +1,14 @@
+// k2/python/host/csrc/fsa_equivalent.h
+
+// Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+
+// See ../../../../LICENSE for clarification regarding multiple authors
+
+#ifndef K2_PYTHON_HOST_CSRC_FSA_EQUIVALENT_H_
+#define K2_PYTHON_HOST_CSRC_FSA_EQUIVALENT_H_
+
+#include "k2/python/host/csrc/k2.h"
+
+void PybindFsaEquivalent(py::module &m);
+
+#endif  // K2_PYTHON_HOST_CSRC_FSA_EQUIVALENT_H_
diff --git a/k2/python/csrc/fsa_util.cc b/k2/python/host/csrc/fsa_util.cc
similarity index 77%
rename from k2/python/csrc/fsa_util.cc
rename to k2/python/host/csrc/fsa_util.cc
index 4e3161d10..b4d6bd6d9 100644
--- a/k2/python/csrc/fsa_util.cc
+++ b/k2/python/host/csrc/fsa_util.cc
@@ -1,10 +1,10 @@
-// k2/python/csrc/fsa_util.cc
+// k2/python/host/csrc/fsa_util.cc
 
 // Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#include "k2/python/csrc/fsa_util.h"
+#include "k2/python/host/csrc/fsa_util.h"
 
 #include "k2/csrc/host/fsa_util.h"
 
diff --git a/k2/python/host/csrc/fsa_util.h b/k2/python/host/csrc/fsa_util.h
new file mode 100644
index 000000000..f685f3d13
--- /dev/null
+++ b/k2/python/host/csrc/fsa_util.h
@@ -0,0 +1,14 @@
+// k2/python/host/csrc/fsa_util.h
+
+// Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
+
+// See ../../../../LICENSE for clarification regarding multiple authors
+
+#ifndef K2_PYTHON_HOST_CSRC_FSA_UTIL_H_
+#define K2_PYTHON_HOST_CSRC_FSA_UTIL_H_
+
+#include "k2/python/host/csrc/k2.h"
+
+void PybindFsaUtil(py::module &m);
+
+#endif  // K2_PYTHON_HOST_CSRC_FSA_UTIL_H_
diff --git a/k2/python/host/csrc/k2.cc b/k2/python/host/csrc/k2.cc
new file mode 100644
index 000000000..47fb04b5e
--- /dev/null
+++ b/k2/python/host/csrc/k2.cc
@@ -0,0 +1,30 @@
+// k2/python/host/csrc/k2.cc
+
+// Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
+
+// See ../../../../LICENSE for clarification regarding multiple authors
+
+#include "k2/python/host/csrc/k2.h"
+
+#include "k2/python/host/csrc/array.h"
+#include "k2/python/host/csrc/aux_labels.h"
+#include "k2/python/host/csrc/fsa.h"
+#include "k2/python/host/csrc/fsa_algo.h"
+#include "k2/python/host/csrc/fsa_equivalent.h"
+#include "k2/python/host/csrc/fsa_util.h"
+#include "k2/python/host/csrc/properties.h"
+#include "k2/python/host/csrc/weights.h"
+
+PYBIND11_MODULE(_k2host, m) {  // defines the `_k2host` extension module
+  m.doc() = "pybind11 binding of k2host";
+  PybindArc(m);  // each Pybind* call below registers bindings on `m`
+  PybindArray(m);
+  PybindArray2Size(m);
+  PybindFsa(m);
+  PybindFsaUtil(m);
+  PybindFsaAlgo(m);
+  PybindFsaEquivalent(m);
+  PybindProperties(m);
+  PybindAuxLabels(m);
+  PybindWeights(m);
+}
diff --git a/k2/python/host/csrc/k2.h b/k2/python/host/csrc/k2.h
new file mode 100644
index 000000000..40831ffff
--- /dev/null
+++ b/k2/python/host/csrc/k2.h
@@ -0,0 +1,16 @@
+// k2/python/host/csrc/k2.h
+
+// Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
+
+// See ../../../../LICENSE for clarification regarding multiple authors
+
+#ifndef K2_PYTHON_HOST_CSRC_K2_H_
+#define K2_PYTHON_HOST_CSRC_K2_H_
+
+#include "k2/csrc/log.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
+#endif  // K2_PYTHON_HOST_CSRC_K2_H_
diff --git a/k2/python/csrc/properties.cc b/k2/python/host/csrc/properties.cc
similarity index 93%
rename from k2/python/csrc/properties.cc
rename to k2/python/host/csrc/properties.cc
index 7df721bb6..e73191233 100644
--- a/k2/python/csrc/properties.cc
+++ b/k2/python/host/csrc/properties.cc
@@ -4,14 +4,14 @@
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#include "k2/python/csrc/properties.h"
+#include "k2/python/host/csrc/properties.h"
 
 #include <vector>
 
 #include "k2/csrc/host/array.h"
 #include "k2/csrc/host/fsa.h"
 #include "k2/csrc/host/properties.h"
-#include "k2/python/csrc/array.h"
+#include "k2/python/host/csrc/array.h"
 
 // We would never pass `order` parameter to k2host::IsAcyclic in Python code.
 // We can make it accept `None` with `std::optional` in pybind11, but
diff --git a/k2/python/host/csrc/properties.h b/k2/python/host/csrc/properties.h
new file mode 100644
index 000000000..0a5a057ea
--- /dev/null
+++ b/k2/python/host/csrc/properties.h
@@ -0,0 +1,14 @@
+// k2/python/host/csrc/properties.h
+
+// Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+
+// See ../../../../LICENSE for clarification regarding multiple authors
+
+#ifndef K2_PYTHON_HOST_CSRC_PROPERTIES_H_
+#define K2_PYTHON_HOST_CSRC_PROPERTIES_H_
+
+#include "k2/python/host/csrc/k2.h"
+
+void PybindProperties(py::module &m);
+
+#endif  // K2_PYTHON_HOST_CSRC_PROPERTIES_H_
diff --git a/k2/python/csrc/tensor.cc b/k2/python/host/csrc/tensor.cc
similarity index 98%
rename from k2/python/csrc/tensor.cc
rename to k2/python/host/csrc/tensor.cc
index f6a072ec8..be43424c3 100644
--- a/k2/python/csrc/tensor.cc
+++ b/k2/python/host/csrc/tensor.cc
@@ -1,11 +1,11 @@
-// k2/python/csrc/tensor.cc
+// k2/python/host/csrc/tensor.cc
 
 // Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
 //                      Xiaomi Corporation (author: Haowen Qiu)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#include "k2/python/csrc/tensor.h"
+#include "k2/python/host/csrc/tensor.h"
 
 namespace k2host {
 
diff --git a/k2/python/csrc/tensor.h b/k2/python/host/csrc/tensor.h
similarity index 91%
rename from k2/python/csrc/tensor.h
rename to k2/python/host/csrc/tensor.h
index f72014d18..7a544499b 100644
--- a/k2/python/csrc/tensor.h
+++ b/k2/python/host/csrc/tensor.h
@@ -1,14 +1,14 @@
-// k2/python/csrc/tensor.h
+// k2/python/host/csrc/tensor.h
 
 // Copyright (c)  2020  Fangjun Kuang (csukuangfj@gmail.com)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#ifndef K2_PYTHON_CSRC_TENSOR_H_
-#define K2_PYTHON_CSRC_TENSOR_H_
+#ifndef K2_PYTHON_HOST_CSRC_TENSOR_H_
+#define K2_PYTHON_HOST_CSRC_TENSOR_H_
 
-#include "k2/python/csrc/dlpack.h"
-#include "k2/python/csrc/k2.h"
+#include "k2/python/host/csrc/dlpack.h"
+#include "k2/python/host/csrc/k2.h"
 
 namespace k2host {
 
@@ -98,4 +98,4 @@ class Tensor {
 
 }  // namespace k2host
 
-#endif  // K2_PYTHON_CSRC_TENSOR_H_
+#endif  // K2_PYTHON_HOST_CSRC_TENSOR_H_
diff --git a/k2/python/csrc/weights.cc b/k2/python/host/csrc/weights.cc
similarity index 95%
rename from k2/python/csrc/weights.cc
rename to k2/python/host/csrc/weights.cc
index e1081a2c5..55d1e54a7 100644
--- a/k2/python/csrc/weights.cc
+++ b/k2/python/host/csrc/weights.cc
@@ -1,10 +1,10 @@
-// k2/python/csrc/weights.cc
+// k2/python/host/csrc/weights.cc
 
 // Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
 
 // See ../../../LICENSE for clarification regarding multiple authors
 
-#include "k2/python/csrc/weights.h"
+#include "k2/python/host/csrc/weights.h"
 
 #include <memory>
 
diff --git a/k2/python/host/csrc/weights.h b/k2/python/host/csrc/weights.h
new file mode 100644
index 000000000..fe59920ba
--- /dev/null
+++ b/k2/python/host/csrc/weights.h
@@ -0,0 +1,14 @@
+// k2/python/host/csrc/weights.h
+
+// Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+
+// See ../../../LICENSE for clarification regarding multiple authors
+
+#ifndef K2_PYTHON_HOST_CSRC_WEIGHTS_H_
+#define K2_PYTHON_HOST_CSRC_WEIGHTS_H_
+
+#include "k2/python/host/csrc/k2.h"
+
+void PybindWeights(py::module &m);
+
+#endif  // K2_PYTHON_HOST_CSRC_WEIGHTS_H_
diff --git a/k2/python/host/k2host/__init__.py b/k2/python/host/k2host/__init__.py
new file mode 100644
index 000000000..32b477121
--- /dev/null
+++ b/k2/python/host/k2host/__init__.py
@@ -0,0 +1,10 @@
+from _k2host import IntArray2Size
+from _k2host import FbWeightType
+from .array import *
+from .aux_labels import *
+from .fsa import *
+from .fsa_algo import *
+from .fsa_equivalent import *
+from .fsa_util import str_to_fsa
+from .properties import *
+from .weights import *
diff --git a/k2/python/host/k2host/array.py b/k2/python/host/k2host/array.py
new file mode 100644
index 000000000..821eba64f
--- /dev/null
+++ b/k2/python/host/k2host/array.py
@@ -0,0 +1,107 @@
+# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+
+# See ../../../../LICENSE for clarification regarding multiple authors
+
+import torch
+from torch.utils.dlpack import to_dlpack
+
+from _k2host import IntArray2Size
+from _k2host import DLPackIntArray2
+from _k2host import DLPackIntArray1
+from _k2host import DLPackStridedIntArray1
+from _k2host import DLPackFloatArray1
+from _k2host import DLPackDoubleArray1
+from _k2host import DLPackLogSumArcDerivs
+
+
+class IntArray1(DLPackIntArray1):  # int array wrapping a torch.Tensor
+
+    def __init__(self, data: torch.Tensor, check_dtype: bool = True):
+        if check_dtype:  # from_float_tensor() passes False to skip this
+            assert data.dtype == torch.int32
+        self.data = data  # presumably keeps the tensor alive; confirm
+        super().__init__(to_dlpack(self.data))  # base gets a DLPack capsule
+
+    @staticmethod
+    def from_float_tensor(data: torch.Tensor) -> 'IntArray1':
+        assert data.dtype == torch.float
+        return IntArray1(data, False)  # wrap as-is, no int32 dtype check
+
+    @staticmethod
+    def create_array_with_size(size: int) -> 'IntArray1':
+        data = torch.zeros(size, dtype=torch.int32)  # zero-filled, 1-D
+        return IntArray1(data)
+
+
+class StridedIntArray1(DLPackStridedIntArray1):  # wraps a torch.Tensor
+
+    def __init__(self, data: torch.Tensor, check_dtype: bool = True):
+        if check_dtype:  # from_float_tensor() passes False to skip this
+            assert data.dtype == torch.int32
+        self.data = data  # presumably keeps the tensor alive; confirm
+        super().__init__(to_dlpack(self.data))  # base gets a DLPack capsule
+
+    @staticmethod
+    def from_float_tensor(data: torch.Tensor) -> 'StridedIntArray1':
+        assert data.dtype == torch.float
+        return StridedIntArray1(data, False)  # no int32 dtype check
+
+
+class FloatArray1(DLPackFloatArray1):  # float32 array wrapping a tensor
+
+    def __init__(self, data: torch.Tensor):
+        assert data.dtype == torch.float
+        self.data = data  # presumably keeps the tensor alive; confirm
+        super().__init__(to_dlpack(self.data))  # base gets a DLPack capsule
+
+    @staticmethod
+    def create_array_with_size(size: int) -> 'FloatArray1':
+        data = torch.zeros(size, dtype=torch.float)  # zero-filled, 1-D
+        return FloatArray1(data)
+
+
+class DoubleArray1(DLPackDoubleArray1):  # float64 array wrapping a tensor
+
+    def __init__(self, data: torch.Tensor):
+        assert data.dtype == torch.double
+        self.data = data  # presumably keeps the tensor alive; confirm
+        super().__init__(to_dlpack(self.data))  # base gets a DLPack capsule
+
+    @staticmethod
+    def create_array_with_size(size: int) -> 'DoubleArray1':
+        data = torch.zeros(size, dtype=torch.double)  # zero-filled, 1-D
+        return DoubleArray1(data)
+
+
+class IntArray2(DLPackIntArray2):  # built from two int32 tensors
+
+    def __init__(self, indexes: torch.Tensor, data: torch.Tensor):
+        assert indexes.dtype == torch.int32
+        assert data.dtype == torch.int32
+        self.indexes = indexes  # presumably kept for lifetime; confirm
+        self.data = data
+        super().__init__(to_dlpack(self.indexes), to_dlpack(self.data))
+
+    @staticmethod
+    def create_array_with_size(array_size: IntArray2Size) -> 'IntArray2':
+        indexes = torch.zeros(array_size.size1 + 1, dtype=torch.int32)
+        data = torch.zeros(array_size.size2, dtype=torch.int32)  # size2 items
+        return IntArray2(indexes, data)  # indexes has size1 + 1 entries
+
+
+class LogSumArcDerivs(DLPackLogSumArcDerivs):  # int32 indexes + float data
+
+    def __init__(self, indexes: torch.Tensor, data: torch.Tensor):
+        assert indexes.dtype == torch.int32
+        assert data.dtype == torch.float32
+        assert data.shape[1] == 2  # data must be 2-D with two columns
+        self.indexes = indexes  # presumably kept for lifetime; confirm
+        self.data = data
+        super().__init__(to_dlpack(self.indexes), to_dlpack(self.data))
+
+    @staticmethod
+    def create_arc_derivs_with_size(array_size: IntArray2Size
+                                   ) -> 'LogSumArcDerivs':
+        indexes = torch.zeros(array_size.size1 + 1, dtype=torch.int32)
+        data = torch.zeros([array_size.size2, 2], dtype=torch.float32)
+        return LogSumArcDerivs(indexes, data)  # indexes: size1 + 1 entries
diff --git a/k2/python/k2/aux_labels.py b/k2/python/host/k2host/aux_labels.py
similarity index 91%
rename from k2/python/k2/aux_labels.py
rename to k2/python/host/k2host/aux_labels.py
index 91a8fe5bb..0b010cc1d 100644
--- a/k2/python/k2/aux_labels.py
+++ b/k2/python/host/k2host/aux_labels.py
@@ -5,10 +5,10 @@
 import torch
 from torch.utils.dlpack import to_dlpack
 
-from _k2 import IntArray2Size
-from _k2 import _AuxLabels1Mapper
-from _k2 import _AuxLabels2Mapper
-from _k2 import _FstInverter
+from _k2host import IntArray2Size
+from _k2host import _AuxLabels1Mapper
+from _k2host import _AuxLabels2Mapper
+from _k2host import _FstInverter
 
 from .fsa import Fsa
 from .array import IntArray1
diff --git a/k2/python/k2/fsa.py b/k2/python/host/k2host/fsa.py
similarity index 70%
rename from k2/python/k2/fsa.py
rename to k2/python/host/k2host/fsa.py
index 0ac10a610..a93c0409e 100644
--- a/k2/python/k2/fsa.py
+++ b/k2/python/host/k2host/fsa.py
@@ -5,24 +5,27 @@
 import torch
 from torch.utils.dlpack import to_dlpack
 
-from _k2 import IntArray2Size
-from _k2 import _Arc
-from _k2 import DLPackFsa
-from _k2 import IntArray2Size
+from _k2host import IntArray2Size
+from _k2host import _Arc
+from _k2host import DLPackFsa
+from _k2host import IntArray2Size
 
 
 class Arc(_Arc):
 
-    def __init__(self, src_state: int, dest_state: int, label: int):
-        super().__init__(src_state, dest_state, label)
+    def __init__(self, src_state: int, dest_state: int, label: int,
+                 weight: float):
+        super().__init__(src_state, dest_state, label, weight)
 
     def to_tensor(self):
-        return torch.tensor([self.src_state, self.dest_state, self.label],
-                            dtype=torch.int32)
+        # TODO(fangjun): weight will be truncated to an int.
+        return torch.tensor(
+            [self.src_state, self.dest_state, self.label, self.weight],
+            dtype=torch.int32)
 
     @staticmethod
     def from_tensor(tensor: torch.Tensor) -> 'Arc':
-        assert tensor.shape == torch.Size([3])
+        assert tensor.shape == torch.Size([4])
         assert tensor.dtype == torch.int32
         return Arc(*tensor.tolist())
 
@@ -41,7 +44,7 @@ class Fsa(DLPackFsa):
     def __init__(self, indexes: torch.Tensor, data: torch.Tensor):
         assert indexes.dtype == torch.int32
         assert data.dtype == torch.int32
-        assert data.shape[1] == 3
+        assert data.shape[1] == 4
         self.indexes = indexes
         self.data = data
         super().__init__(to_dlpack(self.indexes), to_dlpack(self.data))
@@ -49,5 +52,5 @@ def __init__(self, indexes: torch.Tensor, data: torch.Tensor):
     @staticmethod
     def create_fsa_with_size(array_size: IntArray2Size) -> 'Fsa':
         indexes = torch.zeros(array_size.size1 + 1, dtype=torch.int32)
-        data = torch.zeros([array_size.size2, 3], dtype=torch.int32)
+        data = torch.zeros([array_size.size2, 4], dtype=torch.int32)
         return Fsa(indexes, data)
diff --git a/k2/python/k2/fsa_algo.py b/k2/python/host/k2host/fsa_algo.py
similarity index 71%
rename from k2/python/k2/fsa_algo.py
rename to k2/python/host/k2host/fsa_algo.py
index f9423f97a..e286adf83 100644
--- a/k2/python/k2/fsa_algo.py
+++ b/k2/python/host/k2host/fsa_algo.py
@@ -11,16 +11,16 @@
 from .array import FloatArray1
 from .array import LogSumArcDerivs
 from .weights import WfsaWithFbWeights
-from _k2 import IntArray2Size
-from _k2 import _ArcSorter
-from _k2 import _arc_sort
-from _k2 import _TopSorter
-from _k2 import _Connection
-from _k2 import _Intersection
-from _k2 import _DeterminizerMax
-from _k2 import _DeterminizerLogSum
-from _k2 import _EpsilonsRemoverMax
-from _k2 import _EpsilonsRemoverLogSum
+from _k2host import IntArray2Size
+from _k2host import _ArcSorter
+from _k2host import _arc_sort
+from _k2host import _TopSorter
+from _k2host import _Connection
+from _k2host import _Intersection
+from _k2host import _DeterminizerMax
+from _k2host import _DeterminizerLogSum
+from _k2host import _EpsilonsRemoverMax
+from _k2host import _EpsilonsRemoverLogSum
 
 
 class ArcSorter(_ArcSorter):
@@ -97,11 +97,8 @@ def get_sizes(self, fsa_size: IntArray2Size,
                   arc_derivs_size: IntArray2Size) -> None:
         return super().get_sizes(fsa_size, arc_derivs_size)
 
-    def get_output(self, fsa_out: Fsa, arc_weights_out: FloatArray1,
-                   arc_derivs: IntArray2) -> float:
-        return super().get_output(fsa_out.get_base(),
-                                  arc_weights_out.get_base(),
-                                  arc_derivs.get_base())
+    def get_output(self, fsa_out: Fsa, arc_derivs: IntArray2) -> float:
+        return super().get_output(fsa_out.get_base(), arc_derivs.get_base())
 
 
 class DeterminizerLogSum(_DeterminizerLogSum):
@@ -113,11 +110,8 @@ def get_sizes(self, fsa_size: IntArray2Size,
                   arc_derivs_size: IntArray2Size) -> None:
         return super().get_sizes(fsa_size, arc_derivs_size)
 
-    def get_output(self, fsa_out: Fsa, arc_weights_out: FloatArray1,
-                   arc_derivs: LogSumArcDerivs) -> float:
-        return super().get_output(fsa_out.get_base(),
-                                  arc_weights_out.get_base(),
-                                  arc_derivs.get_base())
+    def get_output(self, fsa_out: Fsa, arc_derivs: LogSumArcDerivs) -> float:
+        return super().get_output(fsa_out.get_base(), arc_derivs.get_base())
 
 
 class EpsilonsRemoverMax(_EpsilonsRemoverMax):
@@ -129,11 +123,8 @@ def get_sizes(self, fsa_size: IntArray2Size,
                   arc_derivs_size: IntArray2Size) -> None:
         return super().get_sizes(fsa_size, arc_derivs_size)
 
-    def get_output(self, fsa_out: Fsa, arc_weights_out: FloatArray1,
-                   arc_derivs: IntArray2) -> None:
-        return super().get_output(fsa_out.get_base(),
-                                  arc_weights_out.get_base(),
-                                  arc_derivs.get_base())
+    def get_output(self, fsa_out: Fsa, arc_derivs: IntArray2) -> None:
+        return super().get_output(fsa_out.get_base(), arc_derivs.get_base())
 
 
 class EpsilonsRemoverLogSum(_EpsilonsRemoverLogSum):
@@ -145,8 +136,5 @@ def get_sizes(self, fsa_size: IntArray2Size,
                   arc_derivs_size: IntArray2Size) -> None:
         return super().get_sizes(fsa_size, arc_derivs_size)
 
-    def get_output(self, fsa_out: Fsa, arc_weights_out: FloatArray1,
-                   arc_derivs: LogSumArcDerivs) -> None:
-        return super().get_output(fsa_out.get_base(),
-                                  arc_weights_out.get_base(),
-                                  arc_derivs.get_base())
+    def get_output(self, fsa_out: Fsa, arc_derivs: LogSumArcDerivs) -> None:
+        return super().get_output(fsa_out.get_base(), arc_derivs.get_base())
diff --git a/k2/python/k2/fsa_equivalent.py b/k2/python/host/k2host/fsa_equivalent.py
similarity index 62%
rename from k2/python/k2/fsa_equivalent.py
rename to k2/python/host/k2host/fsa_equivalent.py
index 2a83c6056..b8608a5e2 100644
--- a/k2/python/k2/fsa_equivalent.py
+++ b/k2/python/host/k2host/fsa_equivalent.py
@@ -8,12 +8,12 @@
 from .fsa import Fsa
 from .array import IntArray1
 from .array import FloatArray1
-from _k2 import IntArray2Size
-from _k2 import _RandPath
-from _k2 import _is_rand_equivalent
-from _k2 import _is_rand_equivalent_max_weight
-from _k2 import _is_rand_equivalent_logsum_weight
-from _k2 import _is_rand_equivalent_after_rmeps_pruned_logsum
+from _k2host import IntArray2Size
+from _k2host import _RandPath
+from _k2host import _is_rand_equivalent
+from _k2host import _is_rand_equivalent_max_weight
+from _k2host import _is_rand_equivalent_logsum_weight
+from _k2host import _is_rand_equivalent_after_rmeps_pruned_logsum
 
 
 class RandPath(_RandPath):
@@ -35,42 +35,30 @@ def is_rand_equivalent(fsa_a: Fsa, fsa_b: Fsa, npath: int = 100) -> bool:
 
 
 def is_rand_equivalent_max_weight(fsa_a: Fsa,
-                                  a_weights: FloatArray1,
                                   fsa_b: Fsa,
-                                  b_weights: FloatArray1,
                                   beam: float = float('inf'),
                                   delta: float = 1e-6,
                                   top_sorted: bool = True,
                                   npath: int = 100) -> bool:
-    return _is_rand_equivalent_max_weight(fsa_a.get_base(),
-                                          a_weights.get_base(),
-                                          fsa_b.get_base(),
-                                          b_weights.get_base(), beam, delta,
-                                          top_sorted, npath)
+    return _is_rand_equivalent_max_weight(fsa_a.get_base(), fsa_b.get_base(),
+                                          beam, delta, top_sorted, npath)
 
 
 def is_rand_equivalent_logsum_weight(fsa_a: Fsa,
-                                     a_weights: FloatArray1,
                                      fsa_b: Fsa,
-                                     b_weights: FloatArray1,
                                      beam: float = float('inf'),
                                      delta: float = 1e-6,
                                      top_sorted: bool = True,
                                      npath: int = 100) -> bool:
     return _is_rand_equivalent_logsum_weight(fsa_a.get_base(),
-                                             a_weights.get_base(),
-                                             fsa_b.get_base(),
-                                             b_weights.get_base(), beam, delta,
+                                             fsa_b.get_base(), beam, delta,
                                              top_sorted, npath)
 
 
 def is_rand_equivalent_after_rmeps_pruned_logsum(fsa_a: Fsa,
-                                                 a_weights: FloatArray1,
                                                  fsa_b: Fsa,
-                                                 b_weights: FloatArray1,
                                                  beam: float,
                                                  top_sorted: bool = True,
                                                  npath: int = 100) -> bool:
     return _is_rand_equivalent_after_rmeps_pruned_logsum(
-        fsa_a.get_base(), a_weights.get_base(), fsa_b.get_base(),
-        b_weights.get_base(), beam, top_sorted, npath)
+        fsa_a.get_base(), fsa_b.get_base(), beam, top_sorted, npath)
diff --git a/k2/python/k2/fsa_util.py b/k2/python/host/k2host/fsa_util.py
similarity index 79%
rename from k2/python/k2/fsa_util.py
rename to k2/python/host/k2host/fsa_util.py
index d2cfbd595..d096dcf7b 100644
--- a/k2/python/k2/fsa_util.py
+++ b/k2/python/host/k2host/fsa_util.py
@@ -3,6 +3,7 @@
 # See ../../../LICENSE for clarification regarding multiple authors
 
 import re
+import struct
 from collections import defaultdict
 
 import torch
@@ -10,13 +11,18 @@
 from .fsa import Fsa
 
 
+def float_to_int(f):  # float32 bit pattern of `f` as a signed 32-bit int
+    f = struct.pack('<f', f)  # fixed little-endian, matching from_bytes below
+    return int.from_bytes(f, 'little', signed=True)  # negatives fit in int32
+
+
 def str_to_fsa(s: str) -> Fsa:
     '''Create an FSA from a string.
 
     The input string `s` consists of several lines; every line except the
     last line has the following format:
 
-        <src_state> <dest_state> <label>
+        <src_state> <dest_state> <label> <weight>
 
     The last line of `s` contains:
 
@@ -28,7 +34,7 @@ def str_to_fsa(s: str) -> Fsa:
         k2.Fsa
     '''
     rule_pattern = re.compile(
-        r'^[ \t]*(\d+)[ \t]+(\d+)[ \t]+([-]?\d+)[ \t]*$$')
+        r'^[ \t]*(\d+)[ \t]+(\d+)[ \t]+([-]?\d+)[ \t]+([-]?\d*[.]?\d+)[ \t]*$')
     final_state_pattern = re.compile(r'^[ \t]*(\d+)[ \t]*$')
     rules = s.strip().split('\n')
 
@@ -40,7 +46,10 @@ def str_to_fsa(s: str) -> Fsa:
             src_state = int(m.group(1))
             dest_state = int(m.group(2))
             label = int(m.group(3))
-            state_to_rules[src_state].append([src_state, dest_state, label])
+            weight = float(m.group(4))
+            weight = float_to_int(weight)
+            state_to_rules[src_state].append(
+                [src_state, dest_state, label, weight])
         else:
             m = final_state_pattern.match(r)
             assert m
diff --git a/k2/python/k2/properties.py b/k2/python/host/k2host/properties.py
similarity index 75%
rename from k2/python/k2/properties.py
rename to k2/python/host/k2host/properties.py
index 978fcc46f..6b09e9510 100644
--- a/k2/python/k2/properties.py
+++ b/k2/python/host/k2host/properties.py
@@ -6,15 +6,15 @@
 from torch.utils.dlpack import to_dlpack
 
 from .fsa import Fsa
-from _k2 import _is_valid
-from _k2 import _is_top_sorted
-from _k2 import _is_arc_sorted
-from _k2 import _has_self_loops
-from _k2 import _is_acyclic
-from _k2 import _is_deterministic
-from _k2 import _is_epsilon_free
-from _k2 import _is_connected
-from _k2 import _is_empty
+from _k2host import _is_valid
+from _k2host import _is_top_sorted
+from _k2host import _is_arc_sorted
+from _k2host import _has_self_loops
+from _k2host import _is_acyclic
+from _k2host import _is_deterministic
+from _k2host import _is_epsilon_free
+from _k2host import _is_connected
+from _k2host import _is_empty
 
 
 def is_valid(fsa: Fsa) -> bool:
diff --git a/k2/python/k2/weights.py b/k2/python/host/k2host/weights.py
similarity index 71%
rename from k2/python/k2/weights.py
rename to k2/python/host/k2host/weights.py
index 9ab1c8b46..be2832039 100644
--- a/k2/python/k2/weights.py
+++ b/k2/python/host/k2host/weights.py
@@ -5,9 +5,9 @@
 import torch
 from torch.utils.dlpack import to_dlpack
 
-from _k2 import IntArray2Size
-from _k2 import FbWeightType
-from _k2 import _WfsaWithFbWeights
+from _k2host import IntArray2Size
+from _k2host import FbWeightType
+from _k2host import _WfsaWithFbWeights
 
 from .fsa import Fsa
 from .array import IntArray1
@@ -18,9 +18,9 @@
 
 class WfsaWithFbWeights(_WfsaWithFbWeights):
 
-    def __init__(self, fsa: Fsa, arc_weights: FloatArray1, type: FbWeightType,
+    def __init__(self, fsa: Fsa, weight_type: FbWeightType,
                  forward_state_weights: DoubleArray1,
                  backward_state_weights: DoubleArray1):
-        super().__init__(fsa.get_base(), arc_weights.get_base(), type,
+        super().__init__(fsa.get_base(), weight_type,
                          forward_state_weights.get_base(),
                          backward_state_weights.get_base())
diff --git a/k2/python/host/tests/CMakeLists.txt b/k2/python/host/tests/CMakeLists.txt
new file mode 100644
index 000000000..73b6864fe
--- /dev/null
+++ b/k2/python/host/tests/CMakeLists.txt
@@ -0,0 +1,36 @@
+function(k2_add_py_test source)
+  get_filename_component(name ${source} NAME_WE)
+  set(name "host_${name}_py")
+
+  add_test(NAME ${name}
+    COMMAND
+      "${PYTHON_EXECUTABLE}"
+      "${CMAKE_CURRENT_SOURCE_DIR}/${source}"
+  )
+
+  get_filename_component(k2host_path ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
+
+  set_property(TEST ${name}
+    PROPERTY ENVIRONMENT "PYTHONPATH=$<TARGET_FILE_DIR:_k2host>:${k2host_path}:$ENV{PYTHONPATH}"
+  )
+endfunction()
+
+# please sort the files in alphabetic order
+set(py_test_files
+    arcsort_test.py
+    array_test.py
+    aux_labels_test.py
+    connect_test.py
+    determinize_test.py
+    fsa_equivalent_test.py
+    fsa_test.py
+    intersect_test.py
+    properties_test.py
+    rmepsilon_test.py
+    topsort_test.py
+    weights_test.py
+)
+
+foreach(source IN LISTS py_test_files)
+  k2_add_py_test(${source})
+endforeach()
diff --git a/k2/python/host/tests/arcsort_test.py b/k2/python/host/tests/arcsort_test.py
new file mode 100644
index 000000000..5b24cced7
--- /dev/null
+++ b/k2/python/host/tests/arcsort_test.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+#
+# See ../../../LICENSE for clarification regarding multiple authors
+
+# To run this single test, use
+#
+#  ctest --verbose -R host_arcsort_test_py
+#
+
+import unittest
+
+import torch
+
+import k2host
+from k2host.fsa_util import float_to_int
+
+
+class TestArcSort(unittest.TestCase):
+
+    def test_empty_fsa(self):
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map = k2host.IntArray1.create_array_with_size(fsa.size2)
+        k2host.arc_sort(fsa, arc_map)
+        self.assertTrue(k2host.is_empty(fsa))
+        self.assertTrue(arc_map.empty())
+
+        # test without arc_map
+        k2host.arc_sort(fsa)
+        self.assertTrue(k2host.is_empty(fsa))
+
+    def test_arc_sort(self):
+        s = r'''
+        0 1 2 1
+        0 4 0 2
+        0 2 0 3
+        1 2 1 4
+        1 3 0 5
+        2 1 0 6
+        4
+        '''
+
+        fsa = k2host.str_to_fsa(s)
+        arc_map = k2host.IntArray1.create_array_with_size(fsa.size2)
+        k2host.arc_sort(fsa, arc_map)
+        expected_arc_indexes = torch.IntTensor([0, 3, 5, 6, 6, 6])
+        expected_arcs = torch.IntTensor([[0, 2, 0, float_to_int(3)],
+                                         [0, 4, 0, float_to_int(2)],
+                                         [0, 1, 2, float_to_int(1)],
+                                         [1, 3, 0, float_to_int(5)],
+                                         [1, 2, 1, float_to_int(4)],
+                                         [2, 1, 0, float_to_int(6)]])
+        expected_arc_map = torch.IntTensor([2, 1, 0, 4, 3, 5])
+        self.assertTrue(torch.equal(fsa.indexes, expected_arc_indexes))
+        self.assertTrue(torch.equal(fsa.data, expected_arcs))
+        self.assertTrue(torch.equal(arc_map.data, expected_arc_map))
+
+
+class TestArcSorter(unittest.TestCase):
+
+    def test_empty_fsa(self):
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        sorter = k2host.ArcSorter(fsa)
+        array_size = k2host.IntArray2Size()
+        sorter.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map = k2host.IntArray1.create_array_with_size(array_size.size2)
+        sorter.get_output(fsa_out, arc_map)
+        self.assertTrue(k2host.is_empty(fsa_out))
+
+        # test without arc_map
+        sorter.get_output(fsa_out)
+        self.assertTrue(k2host.is_empty(fsa_out))
+
+    def test_arc_sort(self):
+        s = r'''
+        0 1 2 1
+        0 4 0 2
+        0 2 0 3
+        1 2 1 4
+        1 3 0 5
+        2 1 0 6
+        4
+        '''
+
+        fsa = k2host.str_to_fsa(s)
+        sorter = k2host.ArcSorter(fsa)
+        array_size = k2host.IntArray2Size()
+        sorter.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map = k2host.IntArray1.create_array_with_size(array_size.size2)
+        sorter.get_output(fsa_out, arc_map)
+        expected_arc_indexes = torch.IntTensor([0, 3, 5, 6, 6, 6])
+        expected_arcs = torch.IntTensor([[0, 2, 0, float_to_int(3)],
+                                         [0, 4, 0, float_to_int(2)],
+                                         [0, 1, 2, float_to_int(1)],
+                                         [1, 3, 0, float_to_int(5)],
+                                         [1, 2, 1, float_to_int(4)],
+                                         [2, 1, 0, float_to_int(6)]])
+        expected_arc_map = torch.IntTensor([2, 1, 0, 4, 3, 5])
+        self.assertTrue(torch.equal(fsa_out.indexes, expected_arc_indexes))
+        self.assertTrue(torch.equal(fsa_out.data, expected_arcs))
+        self.assertTrue(torch.equal(arc_map.data, expected_arc_map))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/k2/python/host/tests/array_test.py b/k2/python/host/tests/array_test.py
new file mode 100644
index 000000000..e3b9f4d59
--- /dev/null
+++ b/k2/python/host/tests/array_test.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+#
+# See ../../../LICENSE for clarification regarding multiple authors
+
+# To run this single test, use
+#
+#  ctest --verbose -R host_array_test_py
+#
+
+from struct import pack, unpack
+import unittest
+
+import torch
+
+import k2host
+
+
+class TestArray(unittest.TestCase):
+
+    def test_int_array1(self):
+        data = torch.arange(10).to(torch.int32)
+
+        array = k2host.IntArray1(data)
+        self.assertFalse(array.empty())
+        self.assertIsInstance(array, k2host.IntArray1)
+        self.assertEqual(data.numel(), array.size)
+        self.assertEqual(array.data[9], 9)
+
+        # the underlying memory is shared between k2host and torch,
+        # so changing one also changes the other
+        data[0] = 100
+        self.assertEqual(array.data[0], 100)
+        self.assertEqual(array.get_data(0), 100)
+
+        del data
+        # the k2host array is still accessible after `data` is deleted
+        self.assertEqual(array.data[0], 100)
+        self.assertEqual(array.get_data(0), 100)
+
+    def test_int_array2(self):
+        data = torch.arange(10).to(torch.int32)
+        indexes = torch.tensor([0, 2, 5, 6, 10]).to(torch.int32)
+        self.assertEqual(data.numel(), indexes[-1].item())
+
+        array = k2host.IntArray2(indexes, data)
+        self.assertFalse(array.empty())
+        self.assertIsInstance(array, k2host.IntArray2)
+
+        self.assertEqual(indexes.numel(), array.size1 + 1)
+        self.assertEqual(data.numel(), array.size2)
+        self.assertEqual(array.data[9], 9)
+
+        # the underlying memory is shared between k2host and torch,
+        # so changing one also changes the other
+        data[0] = 100
+        self.assertEqual(array.data[0], 100)
+        self.assertEqual(array.get_data(0), 100)
+        indexes[1] = 3
+        self.assertEqual(array.indexes[1], 3)
+        self.assertEqual(array.get_indexes(1), 3)
+
+        del data
+        del indexes
+        # the k2host array is still accessible after both are deleted
+        self.assertEqual(array.data[0], 100)
+        self.assertEqual(array.get_data(0), 100)
+        self.assertEqual(array.indexes[1], 3)
+        self.assertEqual(array.get_indexes(1), 3)
+
+    def test_logsum_arc_derivs(self):
+        data = torch.arange(10).reshape(5, 2).to(torch.float)
+        indexes = torch.tensor([0, 2, 3, 5]).to(torch.int32)
+        self.assertEqual(data.shape[0], indexes[-1].item())
+
+        array = k2host.LogSumArcDerivs(indexes, data)
+        self.assertFalse(array.empty())
+        self.assertIsInstance(array, k2host.LogSumArcDerivs)
+
+        self.assertEqual(indexes.numel(), array.size1 + 1)
+        self.assertEqual(data.shape[0], array.size2)
+        self.assertTrue(torch.equal(array.data[1], torch.FloatTensor([2, 3])))
+
+        # convert arc-ids in arc-derivs to IntArray
+        arc_ids = k2host.StridedIntArray1.from_float_tensor(array.data[:, 0])
+        # the underlying memory is shared between k2host and torch,
+        # so changing one also changes the other
+        data[1] = torch.FloatTensor([100, 200])
+        self.assertTrue(
+            torch.equal(array.data[1], torch.FloatTensor([100, 200])))
+        self.assertEqual(array.get_data(1)[1], 200)
+        self.assertEqual(arc_ids.data[1], 100)
+        # pack and then unpack to reinterpret the int returned for arc_id
+        # as a float. This is for test purposes only: users are expected to
+        # read or update values through `array.data` rather than through
+        # `array.get_data`.
+        arc_id = pack('i', array.get_data(1)[0])
+        self.assertEqual(unpack('f', arc_id)[0], 100)
+
+        del data
+        # the k2host array is still accessible after `data` is deleted
+        self.assertEqual(array.get_data(1)[1], 200)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/k2/python/tests/aux_labels_test.py b/k2/python/host/tests/aux_labels_test.py
similarity index 55%
rename from k2/python/tests/aux_labels_test.py
rename to k2/python/host/tests/aux_labels_test.py
index bfaba261d..2d7bd3623 100644
--- a/k2/python/tests/aux_labels_test.py
+++ b/k2/python/host/tests/aux_labels_test.py
@@ -6,14 +6,14 @@
 
 # To run this single test, use
 #
-#  ctest --verbose -R aux_labels_test_py
+#  ctest --verbose -R host_aux_labels_test_py
 #
 
 import unittest
 
 import torch
 
-import k2
+import k2host
 
 
 class TestAuxLabelsMapper(unittest.TestCase):
@@ -21,26 +21,26 @@ class TestAuxLabelsMapper(unittest.TestCase):
     def setUp(self):
         indexes = torch.IntTensor([0, 1, 3, 6, 7])
         data = torch.IntTensor([1, 2, 3, 4, 5, 6, 7])
-        self.aux_labels_in = k2.AuxLabels(indexes, data)
+        self.aux_labels_in = k2host.AuxLabels(indexes, data)
 
     def test_mapper1_case_1(self):
         # empty arc map
-        arc_map = k2.IntArray1.create_array_with_size(0)
-        mapper = k2.AuxLabels1Mapper(self.aux_labels_in, arc_map)
-        aux_size = k2.IntArray2Size()
+        arc_map = k2host.IntArray1.create_array_with_size(0)
+        mapper = k2host.AuxLabels1Mapper(self.aux_labels_in, arc_map)
+        aux_size = k2host.IntArray2Size()
         mapper.get_sizes(aux_size)
         self.assertEqual(aux_size.size1, 0)
         self.assertEqual(aux_size.size2, 0)
-        labels_out = k2.AuxLabels.create_array_with_size(aux_size)
+        labels_out = k2host.AuxLabels.create_array_with_size(aux_size)
         mapper.get_output(labels_out)
         self.assertTrue(labels_out.empty())
 
     def test_mapper1_case_2(self):
-        arc_map = k2.IntArray1(torch.IntTensor([2, 0, 3]))
-        mapper = k2.AuxLabels1Mapper(self.aux_labels_in, arc_map)
-        aux_size = k2.IntArray2Size()
+        arc_map = k2host.IntArray1(torch.IntTensor([2, 0, 3]))
+        mapper = k2host.AuxLabels1Mapper(self.aux_labels_in, arc_map)
+        aux_size = k2host.IntArray2Size()
         mapper.get_sizes(aux_size)
-        labels_out = k2.AuxLabels.create_array_with_size(aux_size)
+        labels_out = k2host.AuxLabels.create_array_with_size(aux_size)
         mapper.get_output(labels_out)
         self.assertEqual(aux_size.size1, 3)
         self.assertEqual(aux_size.size2, 5)
@@ -51,11 +51,11 @@ def test_mapper1_case_2(self):
 
     def test_mapper1_case_3(self):
         # all arcs in the input fsa remain.
-        arc_map = k2.IntArray1(torch.IntTensor([2, 0, 3, 1]))
-        mapper = k2.AuxLabels1Mapper(self.aux_labels_in, arc_map)
-        aux_size = k2.IntArray2Size()
+        arc_map = k2host.IntArray1(torch.IntTensor([2, 0, 3, 1]))
+        mapper = k2host.AuxLabels1Mapper(self.aux_labels_in, arc_map)
+        aux_size = k2host.IntArray2Size()
         mapper.get_sizes(aux_size)
-        labels_out = k2.AuxLabels.create_array_with_size(aux_size)
+        labels_out = k2host.AuxLabels.create_array_with_size(aux_size)
         mapper.get_output(labels_out)
         self.assertEqual(aux_size.size1, 4)
         self.assertEqual(aux_size.size2, 7)
@@ -66,25 +66,25 @@ def test_mapper1_case_3(self):
 
     def test_mapper2_case_1(self):
         # empty arc map
-        array_size = k2.IntArray2Size(0, 0)
-        arc_map = k2.IntArray2.create_array_with_size(array_size)
-        mapper = k2.AuxLabels2Mapper(self.aux_labels_in, arc_map)
-        aux_size = k2.IntArray2Size()
+        array_size = k2host.IntArray2Size(0, 0)
+        arc_map = k2host.IntArray2.create_array_with_size(array_size)
+        mapper = k2host.AuxLabels2Mapper(self.aux_labels_in, arc_map)
+        aux_size = k2host.IntArray2Size()
         mapper.get_sizes(aux_size)
         self.assertEqual(aux_size.size1, 0)
         self.assertEqual(aux_size.size2, 0)
-        labels_out = k2.AuxLabels.create_array_with_size(aux_size)
+        labels_out = k2host.AuxLabels.create_array_with_size(aux_size)
         mapper.get_output(labels_out)
         self.assertTrue(labels_out.empty())
 
     def test_mapper2_case_2(self):
         indexes = torch.IntTensor([0, 2, 4, 5, 6])
         data = torch.IntTensor([2, 3, 0, 1, 0, 2])
-        arc_map = k2.IntArray2(indexes, data)
-        mapper = k2.AuxLabels2Mapper(self.aux_labels_in, arc_map)
-        aux_size = k2.IntArray2Size()
+        arc_map = k2host.IntArray2(indexes, data)
+        mapper = k2host.AuxLabels2Mapper(self.aux_labels_in, arc_map)
+        aux_size = k2host.IntArray2Size()
         mapper.get_sizes(aux_size)
-        labels_out = k2.AuxLabels.create_array_with_size(aux_size)
+        labels_out = k2host.AuxLabels.create_array_with_size(aux_size)
         mapper.get_output(labels_out)
         self.assertEqual(aux_size.size1, 4)
         self.assertEqual(aux_size.size2, 11)
@@ -98,55 +98,57 @@ class TestFstInverter(unittest.TestCase):
 
     def test_case_1(self):
         # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa_in = k2.Fsa.create_fsa_with_size(array_size)
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa_in = k2host.Fsa.create_fsa_with_size(array_size)
         indexes = torch.IntTensor([0, 1, 3, 6, 7])
         data = torch.IntTensor([1, 2, 3, 4, 5, 6, 7])
-        labels_in = k2.AuxLabels(indexes, data)
-        inverter = k2.FstInverter(fsa_in, labels_in)
-        fsa_size = k2.IntArray2Size()
-        aux_size = k2.IntArray2Size()
+        labels_in = k2host.AuxLabels(indexes, data)
+        inverter = k2host.FstInverter(fsa_in, labels_in)
+        fsa_size = k2host.IntArray2Size()
+        aux_size = k2host.IntArray2Size()
         inverter.get_sizes(fsa_size, aux_size)
         self.assertEqual(aux_size.size1, 0)
         self.assertEqual(aux_size.size2, 0)
-        fsa_out = k2.Fsa.create_fsa_with_size(fsa_size)
-        labels_out = k2.AuxLabels.create_array_with_size(aux_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(fsa_size)
+        labels_out = k2host.AuxLabels.create_array_with_size(aux_size)
         inverter.get_output(fsa_out, labels_out)
-        self.assertTrue(k2.is_empty(fsa_out))
+        self.assertTrue(k2host.is_empty(fsa_out))
         self.assertTrue(labels_out.empty())
 
     def test_case_2(self):
         # top-sorted input FSA
         s = r'''
-        0 1 1
-        0 1 0
-        0 3 2
-        1 2 3
-        1 3 4
-        1 5 -1
-        2 3 0
-        2 5 -1
-        4 5 -1
+        0 1 1 0
+        0 1 0 0
+        0 3 2 0
+        1 2 3 0
+        1 3 4 0
+        1 5 -1 0
+        2 3 0 0
+        2 5 -1 0
+        4 5 -1 0
         5
         '''
 
-        fsa_in = k2.str_to_fsa(s)
+        fsa_in = k2host.str_to_fsa(s)
         indexes = torch.IntTensor([0, 2, 3, 3, 6, 6, 7, 7, 8, 9])
         data = torch.IntTensor([1, 2, 3, 5, 6, 7, -1, -1, -1])
-        labels_in = k2.AuxLabels(indexes, data)
-        inverter = k2.FstInverter(fsa_in, labels_in)
-        fsa_size = k2.IntArray2Size()
-        aux_size = k2.IntArray2Size()
+        labels_in = k2host.AuxLabels(indexes, data)
+        inverter = k2host.FstInverter(fsa_in, labels_in)
+        fsa_size = k2host.IntArray2Size()
+        aux_size = k2host.IntArray2Size()
         inverter.get_sizes(fsa_size, aux_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(fsa_size)
-        labels_out = k2.AuxLabels.create_array_with_size(aux_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(fsa_size)
+        labels_out = k2host.AuxLabels.create_array_with_size(aux_size)
         inverter.get_output(fsa_out, labels_out)
         expected_arc_indexes = torch.IntTensor(
             [0, 3, 4, 7, 8, 9, 11, 11, 12, 12])
-        expected_arcs = torch.IntTensor([[0, 1, 1], [0, 2, 3], [0, 6, 0],
-                                         [1, 2, 2], [2, 3, 5], [2, 6, 0],
-                                         [2, 8, -1], [3, 4, 6], [4, 5, 7],
-                                         [5, 6, 0], [5, 8, -1], [7, 8, -1]])
+        expected_arcs = torch.IntTensor([[0, 1, 1, 0], [0, 2, 3, 0],
+                                         [0, 6, 0, 0], [1, 2, 2, 0],
+                                         [2, 3, 5, 0], [2, 6, 0, 0],
+                                         [2, 8, -1, 0], [3, 4, 6, 0],
+                                         [4, 5, 7, 0], [5, 6, 0, 0],
+                                         [5, 8, -1, 0], [7, 8, -1, 0]])
         self.assertTrue(torch.equal(fsa_out.indexes, expected_arc_indexes))
         self.assertTrue(torch.equal(fsa_out.data, expected_arcs))
         expected_label_indexes = torch.IntTensor(
@@ -159,36 +161,38 @@ def test_case_2(self):
     def test_case_3(self):
         # non-top-sorted input FSA
         s = r'''
-        0 1 1
-        0 1 0
-        0 3 2
-        1 2 3
-        1 3 4
-        2 1 5
-        2 5 -1
-        3 1 6
-        4 5 -1
+        0 1 1 0
+        0 1 0 0
+        0 3 2 0
+        1 2 3 0
+        1 3 4 0
+        2 1 5 0
+        2 5 -1 0
+        3 1 6 0
+        4 5 -1 0
         5
         '''
 
-        fsa_in = k2.str_to_fsa(s)
+        fsa_in = k2host.str_to_fsa(s)
         indexes = torch.IntTensor([0, 2, 3, 3, 6, 6, 7, 8, 10, 11])
         data = torch.IntTensor([1, 2, 3, 5, 6, 7, 8, -1, 9, 10, -1])
-        labels_in = k2.AuxLabels(indexes, data)
-        inverter = k2.FstInverter(fsa_in, labels_in)
-        fsa_size = k2.IntArray2Size()
-        aux_size = k2.IntArray2Size()
+        labels_in = k2host.AuxLabels(indexes, data)
+        inverter = k2host.FstInverter(fsa_in, labels_in)
+        fsa_size = k2host.IntArray2Size()
+        aux_size = k2host.IntArray2Size()
         inverter.get_sizes(fsa_size, aux_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(fsa_size)
-        labels_out = k2.AuxLabels.create_array_with_size(aux_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(fsa_size)
+        labels_out = k2host.AuxLabels.create_array_with_size(aux_size)
         inverter.get_output(fsa_out, labels_out)
         expected_arc_indexes = torch.IntTensor(
             [0, 3, 4, 5, 7, 8, 9, 11, 12, 13, 13])
-        expected_arcs = torch.IntTensor([[0, 1, 1], [0, 3, 3], [0, 7, 0],
-                                         [1, 3, 2], [2, 3, 10], [3, 4, 5],
-                                         [3, 7, 0], [4, 5, 6], [5, 6, 7],
-                                         [6, 3, 8], [6, 9, -1], [7, 2, 9],
-                                         [8, 9, -1]])
+        expected_arcs = torch.IntTensor([[0, 1, 1, 0], [0, 3, 3, 0],
+                                         [0, 7, 0, 0], [1, 3, 2, 0],
+                                         [2, 3, 10, 0], [3, 4, 5, 0],
+                                         [3, 7, 0, 0], [4, 5, 6, 0],
+                                         [5, 6, 7, 0], [6, 3, 8, 0],
+                                         [6, 9, -1, 0], [7, 2, 9, 0],
+                                         [8, 9, -1, 0]])
         self.assertTrue(torch.equal(fsa_out.indexes, expected_arc_indexes))
         self.assertTrue(torch.equal(fsa_out.data, expected_arcs))
         expected_label_indexes = torch.IntTensor(
diff --git a/k2/python/host/tests/connect_test.py b/k2/python/host/tests/connect_test.py
new file mode 100644
index 000000000..42cf15db6
--- /dev/null
+++ b/k2/python/host/tests/connect_test.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+#
+# See ../../../LICENSE for clarification regarding multiple authors
+
+# To run this single test, use
+#
+#  ctest --verbose -R host_connect_test_py
+#
+
+import unittest
+
+import torch
+
+import k2host
+
+
+class TestConnection(unittest.TestCase):
+
+    def test_case_1(self):
+        # a non-connected, non-topsorted, acyclic input fsa;
+        # the output fsa is topsorted.
+        s = r'''
+        0 1 1 0
+        0 2 2 0
+        1 3 3 0
+        1 6 -1 0
+        2 4 2 0
+        2 6 -1 0
+        2 1 1 0
+        5 0 1 0
+        6
+        '''
+        fsa = k2host.str_to_fsa(s)
+        connection = k2host.Connection(fsa)
+        array_size = k2host.IntArray2Size()
+        connection.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map = k2host.IntArray1.create_array_with_size(array_size.size2)
+        status = connection.get_output(fsa_out, arc_map)
+        self.assertTrue(status)
+        expected_arc_indexes = torch.IntTensor([0, 2, 4, 5, 5])
+        expected_arcs = torch.IntTensor([[0, 2, 1, 0], [0, 1, 2, 0],
+                                         [1, 3, -1, 0], [1, 2, 1, 0],
+                                         [2, 3, -1, 0]])
+        expected_arc_map = torch.IntTensor([0, 1, 5, 6, 3])
+        self.assertTrue(torch.equal(fsa_out.indexes, expected_arc_indexes))
+        self.assertTrue(torch.equal(fsa_out.data, expected_arcs))
+        self.assertTrue(torch.equal(arc_map.data, expected_arc_map))
+
+    def test_case_2(self):
+        # no arc enters state 5, the only state with an arc to the final
+        # state 7; presumably 7 is thus unreachable from the start state,
+        # so the trimmed output fsa is expected to be empty.
+        s = r'''
+        0 1 1 0
+        0 2 2 1
+        1 3 3 -2
+        1 6 6 -3
+        2 4 2 4
+        2 6 3 5
+        2 6 -1 6
+        5 0 1 7
+        5 7 -1 8
+        7
+        '''
+        fsa = k2host.str_to_fsa(s)
+        connection = k2host.Connection(fsa)
+        array_size = k2host.IntArray2Size()
+        connection.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map = k2host.IntArray1.create_array_with_size(array_size.size2)
+        status = connection.get_output(fsa_out, arc_map)
+        self.assertTrue(status)
+        self.assertTrue(k2host.is_empty(fsa_out))
+        self.assertTrue(arc_map.empty())
+
+    def test_case_3(self):
+        # a non-topsorted input fsa with a cycle (1 -> 2 -> 1) that cannot
+        # reach the final state 6; the output fsa is expected to be topsorted.
+        s = r'''
+        0 3 3 1
+        0 5 5 2
+        1 2 2 3
+        2 1 1 4
+        3 5 5 5
+        3 2 2 -6
+        3 4 4 7
+        3 6 -1 8
+        4 5 5 9
+        4 6 -1 10
+        5 6 -1 11
+        6
+        '''
+        fsa = k2host.str_to_fsa(s)
+        connection = k2host.Connection(fsa)
+        array_size = k2host.IntArray2Size()
+        connection.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        connection.get_output(fsa_out)
+        self.assertTrue(k2host.is_top_sorted(fsa_out))
+
+    def test_case_4(self):
+        # a cyclic input fsa
+        # after trimming, the cycle remains (it is not a self-loop);
+        # so the output fsa is NOT topsorted.
+        s = r'''
+        0 3 3 1
+        0 2 2 2
+        1 0 1 3
+        2 6 -1 4
+        3 5 5 5
+        3 2 2 6
+        3 5 5 7
+        4 4 4 8
+        5 3 3 9
+        5 4 4 10
+        6
+        '''
+        fsa = k2host.str_to_fsa(s)
+        connection = k2host.Connection(fsa)
+        array_size = k2host.IntArray2Size()
+        connection.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        status = connection.get_output(fsa_out)
+        self.assertFalse(status)
+        self.assertFalse(k2host.is_top_sorted(fsa_out))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/k2/python/host/tests/determinize_test.py b/k2/python/host/tests/determinize_test.py
new file mode 100644
index 000000000..5f3cd1f8d
--- /dev/null
+++ b/k2/python/host/tests/determinize_test.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+#
+# See ../../../LICENSE for clarification regarding multiple authors
+
+# To run this single test, use
+#
+#  ctest --verbose -R host_determinize_test_py
+#
+
+import unittest
+
+import torch
+
+import k2host
+
+
+class TestDeterminize(unittest.TestCase):
+
+    def setUp(self):
+        s = r'''
+        0 4 1 1
+        0 1 1 1
+        1 2 2 2
+        1 3 3 3
+        2 7 1 4
+        3 7 1 5
+        4 6 1 2
+        4 6 1 3
+        4 5 1 3
+        4 8 -1 2
+        5 8 -1 4
+        6 8 -1 3
+        7 8 -1 5
+        8
+        '''
+        self.fsa = k2host.str_to_fsa(s)
+        self.num_states = self.fsa.num_states()
+
+    def test_max_weight(self):
+        forward_max_weights = k2host.DoubleArray1.create_array_with_size(
+            self.num_states)
+        backward_max_weights = k2host.DoubleArray1.create_array_with_size(
+            self.num_states)
+        wfsa = k2host.WfsaWithFbWeights(self.fsa,
+                                        k2host.FbWeightType.kMaxWeight,
+                                        forward_max_weights,
+                                        backward_max_weights)
+        beam = 10.0
+        determinizer = k2host.DeterminizerMax(wfsa, beam, 100)
+        fsa_size = k2host.IntArray2Size()
+        arc_derivs_size = k2host.IntArray2Size()
+        determinizer.get_sizes(fsa_size, arc_derivs_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(fsa_size)
+        arc_derivs = k2host.IntArray2.create_array_with_size(arc_derivs_size)
+        # arc weights are stored in the arcs of the fsa itself (see the
+        # 4-column input in setUp), so no separate weight array is needed
+        determinizer.get_output(fsa_out, arc_derivs)
+        self.assertTrue(k2host.is_deterministic(fsa_out))
+        self.assertEqual(fsa_out.size1, 7)
+        self.assertEqual(fsa_out.size2, 9)
+        self.assertEqual(arc_derivs.size1, 9)
+        self.assertEqual(arc_derivs.size2, 12)
+        self.assertTrue(
+            k2host.is_rand_equivalent_max_weight(self.fsa, fsa_out, beam))
+
+    def test_logsum_weight(self):
+        forward_logsum_weights = k2host.DoubleArray1.create_array_with_size(
+            self.num_states)
+        backward_logsum_weights = k2host.DoubleArray1.create_array_with_size(
+            self.num_states)
+        wfsa = k2host.WfsaWithFbWeights(self.fsa,
+                                        k2host.FbWeightType.kLogSumWeight,
+                                        forward_logsum_weights,
+                                        backward_logsum_weights)
+        beam = 10.0
+        determinizer = k2host.DeterminizerLogSum(wfsa, beam, 100)
+        fsa_size = k2host.IntArray2Size()
+        arc_derivs_size = k2host.IntArray2Size()
+        determinizer.get_sizes(fsa_size, arc_derivs_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(fsa_size)
+        arc_derivs = k2host.LogSumArcDerivs.create_arc_derivs_with_size(
+            arc_derivs_size)
+        # arc weights are stored in the arcs of the fsa itself (see the
+        # 4-column input in setUp), so no separate weight array is needed
+        determinizer.get_output(fsa_out, arc_derivs)
+        self.assertTrue(k2host.is_deterministic(fsa_out))
+        self.assertEqual(fsa_out.size1, 7)
+        self.assertEqual(fsa_out.size2, 9)
+        self.assertEqual(arc_derivs.size1, 9)
+        self.assertEqual(arc_derivs.size2, 15)
+        self.assertTrue(
+            k2host.is_rand_equivalent_logsum_weight(self.fsa, fsa_out, beam))
+        # cast float to int
+        arc_ids = k2host.StridedIntArray1.from_float_tensor(
+            arc_derivs.data[:, 0])
+        # we may get different value of `arc_ids.get_data(1)`
+        # with different STL implementations as we use
+        # `std::unordered_map` in implementation of determinize
+        # self.assertEqual(arc_ids.get_data(1), 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/k2/python/tests/fsa_equivalent_test.py b/k2/python/host/tests/fsa_equivalent_test.py
similarity index 53%
rename from k2/python/tests/fsa_equivalent_test.py
rename to k2/python/host/tests/fsa_equivalent_test.py
index 61fcc86b7..4f0682b0c 100644
--- a/k2/python/tests/fsa_equivalent_test.py
+++ b/k2/python/host/tests/fsa_equivalent_test.py
@@ -6,14 +6,14 @@
 
 # To run this single test, use
 #
-#  ctest --verbose -R fsa_equivalent_test_py
+#  ctest --verbose -R host_fsa_equivalent_test_py
 #
 
 import unittest
 
 import torch
 
-import k2
+import k2host
 
 
 class TestIsFsaEquivalent(unittest.TestCase):
@@ -27,15 +27,15 @@ def test_bad_case_1(self):
         2 3 5
         3
         '''
-        fsa_a = k2.str_to_fsa(s_a)
+        fsa_a = k2host.str_to_fsa(s_a)
         s_b = r'''
         0 1 1
         0 2 2
         1 2 3
         3
         '''
-        fsa_b = k2.str_to_fsa(s_b)
-        self.assertFalse(k2.is_rand_equivalent(fsa_a, fsa_b))
+        fsa_b = k2host.str_to_fsa(s_b)
+        self.assertFalse(k2host.is_rand_equivalent(fsa_a, fsa_b))
 
     def test_bad_case_2(self):
         s_a = r'''
@@ -46,7 +46,7 @@ def test_bad_case_2(self):
         2 3 5
         3
         '''
-        fsa_a = k2.str_to_fsa(s_a)
+        fsa_a = k2host.str_to_fsa(s_a)
         s_b = r'''
         0 1 1
         0 2 2
@@ -55,8 +55,8 @@ def test_bad_case_2(self):
         2 3 6
         3
         '''
-        fsa_b = k2.str_to_fsa(s_b)
-        self.assertFalse(k2.is_rand_equivalent(fsa_a, fsa_b, 100))
+        fsa_b = k2host.str_to_fsa(s_b)
+        self.assertFalse(k2host.is_rand_equivalent(fsa_a, fsa_b, 100))
 
     def test_good_case_1(self):
         # both fsas will be empty after triming
@@ -66,14 +66,14 @@ def test_good_case_1(self):
         1 2 3
         3
         '''
-        fsa_a = k2.str_to_fsa(s_a)
+        fsa_a = k2host.str_to_fsa(s_a)
         s_b = r'''
         0 1 1
         0 2 2
         3
         '''
-        fsa_b = k2.str_to_fsa(s_b)
-        self.assertTrue(k2.is_rand_equivalent(fsa_a, fsa_b))
+        fsa_b = k2host.str_to_fsa(s_b)
+        self.assertTrue(k2host.is_rand_equivalent(fsa_a, fsa_b))
 
     def test_good_case_2(self):
         # same fsas
@@ -85,8 +85,8 @@ def test_good_case_2(self):
         2 3 5
         3
         '''
-        fsa_a = k2.str_to_fsa(s_a)
-        self.assertTrue(k2.is_rand_equivalent(fsa_a, fsa_a))
+        fsa_a = k2host.str_to_fsa(s_a)
+        self.assertTrue(k2host.is_rand_equivalent(fsa_a, fsa_a))
 
     def test_bad_case_2(self):
         s_a = r'''
@@ -97,7 +97,7 @@ def test_bad_case_2(self):
         2 4 5
         4
         '''
-        fsa_a = k2.str_to_fsa(s_a)
+        fsa_a = k2host.str_to_fsa(s_a)
         s_b = r'''
         0 2 1
         0 1 2
@@ -106,8 +106,8 @@ def test_bad_case_2(self):
         2 4 4
         4
         '''
-        fsa_b = k2.str_to_fsa(s_b)
-        self.assertTrue(k2.is_rand_equivalent(fsa_a, fsa_b))
+        fsa_b = k2host.str_to_fsa(s_b)
+        self.assertTrue(k2host.is_rand_equivalent(fsa_a, fsa_b))
 
 
 class TestIsWfsaRandEquivalent(unittest.TestCase):
@@ -127,9 +127,9 @@ def setUp(self):
         4 5 -1
         5
         '''
-        self.fsa_a = k2.str_to_fsa(s_a)
+        self.fsa_a = k2host.str_to_fsa(s_a)
         weights_a = torch.FloatTensor([2, 2, 3, 3, 1, 3, 2, 5, 4, 1, 3])
-        self.weights_a = k2.FloatArray1(weights_a)
+        self.weights_a = k2host.FloatArray1(weights_a)
         s_b = r'''
         0 1 1
         0 1 2
@@ -141,9 +141,9 @@ def setUp(self):
         2 3 -1
         3
         '''
-        self.fsa_b = k2.str_to_fsa(s_b)
+        self.fsa_b = k2host.str_to_fsa(s_b)
         weights_b = torch.FloatTensor([5, 5, 6, 10, 8, 1, 0, 0])
-        self.weights_b = k2.FloatArray1(weights_b)
+        self.weights_b = k2host.FloatArray1(weights_b)
         s_c = r'''
         0 1 1
         0 1 2
@@ -155,49 +155,53 @@ def setUp(self):
         2 3 -1
         3
         '''
-        self.fsa_c = k2.str_to_fsa(s_c)
+        self.fsa_c = k2host.str_to_fsa(s_c)
         weights_c = torch.FloatTensor([5, 5, 6, 10, 9, 1, 0, 0])
-        self.weights_c = k2.FloatArray1(weights_c)
+        self.weights_c = k2host.FloatArray1(weights_c)
 
     def test_max_weight(self):
         self.assertTrue(
-            k2.is_rand_equivalent_max_weight(self.fsa_a, self.weights_a,
-                                             self.fsa_b, self.weights_b))
+            k2host.is_rand_equivalent_max_weight(self.fsa_a, self.weights_a,
+                                                 self.fsa_b, self.weights_b))
         self.assertFalse(
-            k2.is_rand_equivalent_max_weight(self.fsa_a, self.weights_a,
-                                             self.fsa_c, self.weights_c))
+            k2host.is_rand_equivalent_max_weight(self.fsa_a, self.weights_a,
+                                                 self.fsa_c, self.weights_c))
 
     def test_logsum_weight(self):
         self.assertTrue(
-            k2.is_rand_equivalent_logsum_weight(self.fsa_a, self.weights_a,
-                                                self.fsa_b, self.weights_b))
+            k2host.is_rand_equivalent_logsum_weight(self.fsa_a, self.weights_a,
+                                                    self.fsa_b,
+                                                    self.weights_b))
         self.assertFalse(
-            k2.is_rand_equivalent_logsum_weight(self.fsa_a, self.weights_a,
-                                                self.fsa_c, self.weights_c))
+            k2host.is_rand_equivalent_logsum_weight(self.fsa_a, self.weights_a,
+                                                    self.fsa_c,
+                                                    self.weights_c))
 
     def test_with_beam(self):
         self.assertTrue(
-            k2.is_rand_equivalent_max_weight(self.fsa_a, self.weights_a,
-                                             self.fsa_b, self.weights_b, 4.0))
+            k2host.is_rand_equivalent_max_weight(self.fsa_a, self.weights_a,
+                                                 self.fsa_b, self.weights_b,
+                                                 4.0))
         self.assertFalse(
-            k2.is_rand_equivalent_max_weight(self.fsa_a, self.weights_a,
-                                             self.fsa_c, self.weights_c, 6.0))
+            k2host.is_rand_equivalent_max_weight(self.fsa_a, self.weights_a,
+                                                 self.fsa_c, self.weights_c,
+                                                 6.0))
 
 
 class TestRandPath(unittest.TestCase):
 
     def test_bad_case_1(self):
         # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        rand_path = k2.RandPath(fsa, False)
-        array_size = k2.IntArray2Size()
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        rand_path = k2host.RandPath(fsa, False)
+        array_size = k2host.IntArray2Size()
         rand_path.get_sizes(array_size)
-        path = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map = k2.IntArray1.create_array_with_size(array_size.size2)
+        path = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map = k2host.IntArray1.create_array_with_size(array_size.size2)
         status = rand_path.get_output(path, arc_map)
         self.assertFalse(status)
-        self.assertTrue(k2.is_empty(path))
+        self.assertTrue(k2host.is_empty(path))
         self.assertTrue(arc_map.empty())
 
     def test_bad_case_2(self):
@@ -208,15 +212,15 @@ def test_bad_case_2(self):
         1 3 4
         3
         '''
-        fsa = k2.str_to_fsa(s_a)
-        rand_path = k2.RandPath(fsa, False)
-        array_size = k2.IntArray2Size()
+        fsa = k2host.str_to_fsa(s_a)
+        rand_path = k2host.RandPath(fsa, False)
+        array_size = k2host.IntArray2Size()
         rand_path.get_sizes(array_size)
-        path = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map = k2.IntArray1.create_array_with_size(array_size.size2)
+        path = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map = k2host.IntArray1.create_array_with_size(array_size.size2)
         status = rand_path.get_output(path, arc_map)
         self.assertFalse(status)
-        self.assertTrue(k2.is_empty(path))
+        self.assertTrue(k2host.is_empty(path))
         self.assertTrue(arc_map.empty())
 
     def test_good_case_1(self):
@@ -230,14 +234,14 @@ def test_good_case_1(self):
         4 5 9
         5
         '''
-        fsa = k2.str_to_fsa(s_a)
-        rand_path = k2.RandPath(fsa, False)
-        array_size = k2.IntArray2Size()
+        fsa = k2host.str_to_fsa(s_a)
+        rand_path = k2host.RandPath(fsa, False)
+        array_size = k2host.IntArray2Size()
         rand_path.get_sizes(array_size)
-        path = k2.Fsa.create_fsa_with_size(array_size)
+        path = k2host.Fsa.create_fsa_with_size(array_size)
         status = rand_path.get_output(path)
         self.assertTrue(status)
-        self.assertFalse(k2.is_empty(path))
+        self.assertFalse(k2host.is_empty(path))
 
     def test_good_case_2(self):
         s_a = r'''
@@ -246,15 +250,15 @@ def test_good_case_2(self):
         2 3 4
         3
         '''
-        fsa = k2.str_to_fsa(s_a)
-        rand_path = k2.RandPath(fsa, False)
-        array_size = k2.IntArray2Size()
+        fsa = k2host.str_to_fsa(s_a)
+        rand_path = k2host.RandPath(fsa, False)
+        array_size = k2host.IntArray2Size()
         rand_path.get_sizes(array_size)
-        path = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map = k2.IntArray1.create_array_with_size(array_size.size2)
+        path = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map = k2host.IntArray1.create_array_with_size(array_size.size2)
         status = rand_path.get_output(path, arc_map)
         self.assertTrue(status)
-        self.assertFalse(k2.is_empty(path))
+        self.assertFalse(k2host.is_empty(path))
         self.assertFalse(arc_map.empty())
         expected_arc_indexes = torch.IntTensor([0, 1, 2, 3, 3])
         expected_arcs = torch.IntTensor([[0, 1, 1], [1, 2, 3], [2, 3, 4]])
@@ -274,15 +278,15 @@ def test_eps_arc_1(self):
         4 5 9
         5
         '''
-        fsa = k2.str_to_fsa(s_a)
-        rand_path = k2.RandPath(fsa, True)
-        array_size = k2.IntArray2Size()
+        fsa = k2host.str_to_fsa(s_a)
+        rand_path = k2host.RandPath(fsa, True)
+        array_size = k2host.IntArray2Size()
         rand_path.get_sizes(array_size)
-        path = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map = k2.IntArray1.create_array_with_size(array_size.size2)
+        path = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map = k2host.IntArray1.create_array_with_size(array_size.size2)
         status = rand_path.get_output(path, arc_map)
         self.assertTrue(status)
-        self.assertFalse(k2.is_empty(path))
+        self.assertFalse(k2host.is_empty(path))
         self.assertFalse(arc_map.empty())
 
     def test_eps_arc_2(self):
@@ -297,15 +301,15 @@ def test_eps_arc_2(self):
         4 5 9
         5
         '''
-        fsa = k2.str_to_fsa(s_a)
-        rand_path = k2.RandPath(fsa, True)
-        array_size = k2.IntArray2Size()
+        fsa = k2host.str_to_fsa(s_a)
+        rand_path = k2host.RandPath(fsa, True)
+        array_size = k2host.IntArray2Size()
         rand_path.get_sizes(array_size)
-        path = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map = k2.IntArray1.create_array_with_size(array_size.size2)
+        path = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map = k2host.IntArray1.create_array_with_size(array_size.size2)
         status = rand_path.get_output(path, arc_map)
         self.assertFalse(status)
-        self.assertTrue(k2.is_empty(path))
+        self.assertTrue(k2host.is_empty(path))
         self.assertTrue(arc_map.empty())
 
 
diff --git a/k2/python/tests/fsa_test.py b/k2/python/host/tests/fsa_test.py
similarity index 55%
rename from k2/python/tests/fsa_test.py
rename to k2/python/host/tests/fsa_test.py
index 5f98dd173..44b87c5f1 100644
--- a/k2/python/tests/fsa_test.py
+++ b/k2/python/host/tests/fsa_test.py
@@ -6,59 +6,67 @@
 
 # To run this single test, use
 #
-#  ctest --verbose -R fsa_test_py
+#  ctest --verbose -R host_fsa_test_py
 #
 
 import unittest
 
 import torch
 
-import k2
+import k2host
+from k2host.fsa_util import float_to_int
 
 
 class TestFsa(unittest.TestCase):
 
     def test_arc(self):
         # construct arc
-        arc = k2.Arc(1, 2, 3)
+        arc = k2host.Arc(1, 2, 3, 1.5)
         self.assertEqual(arc.src_state, 1)
         self.assertEqual(arc.dest_state, 2)
         self.assertEqual(arc.label, 3)
+        self.assertEqual(arc.weight, 1.5)
 
         # test from_tensor
-        arc_tensor = torch.tensor([1, 2, 3], dtype=torch.int32)
-        arc = k2.Arc.from_tensor(arc_tensor)
+        arc_tensor = torch.tensor([1, 2, 3, 0], dtype=torch.int32)
+        arc = k2host.Arc.from_tensor(arc_tensor)
         self.assertEqual(arc.src_state, 1)
         self.assertEqual(arc.dest_state, 2)
         self.assertEqual(arc.label, 3)
+        self.assertEqual(arc.weight, 0)
 
         # test to_tensor
         arc.src_state = 2
         arc_tensor = arc.to_tensor()
-        arc_tensor_target = torch.tensor([2, 2, 3], dtype=torch.int32)
+        arc_tensor_target = torch.tensor([2, 2, 3, 0], dtype=torch.int32)
         self.assertTrue(torch.equal(arc_tensor, arc_tensor_target))
 
     def test_fsa(self):
         s = r'''
-        0 1 1
-        0 2 2
-        1 3 3
-        2 3 3
-        3 4 -1
+        0 1 1 1.25
+        0 2 2 1.5
+        1 3 3 1.75
+        2 3 3 2.25
+        3 4 -1 2.5
         4
         '''
 
-        fsa = k2.str_to_fsa(s)
+        fsa = k2host.str_to_fsa(s)
         self.assertEqual(fsa.num_states(), 5)
         self.assertEqual(fsa.final_state(), 4)
         self.assertFalse(fsa.empty())
-        self.assertIsInstance(fsa, k2.Fsa)
+        self.assertIsInstance(fsa, k2host.Fsa)
         # test get_data
         self.assertEqual(fsa.get_data(0).src_state, 0)
         self.assertEqual(fsa.get_data(0).dest_state, 1)
         self.assertEqual(fsa.get_data(0).label, 1)
-        # fsa.data and the corresponding k2::Fsa object are sharing memory
-        fsa.data[0] = torch.IntTensor([5, 1, 6])
+        self.assertEqual(fsa.get_data(0).weight, 1.25)
+        self.assertEqual(fsa.get_data(1).weight, 1.5)
+        self.assertEqual(fsa.get_data(2).weight, 1.75)
+        self.assertEqual(fsa.get_data(3).weight, 2.25)
+        self.assertEqual(fsa.get_data(4).weight, 2.5)
+        # fsa.data and the corresponding k2host::Fsa object share memory
+        fsa.data[0] = torch.IntTensor([5, 1, 6, 1])
         self.assertEqual(fsa.get_data(0).src_state, 5)
 
 
diff --git a/k2/python/host/tests/intersect_test.py b/k2/python/host/tests/intersect_test.py
new file mode 100644
index 000000000..192e16721
--- /dev/null
+++ b/k2/python/host/tests/intersect_test.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+#
+# See ../../../LICENSE for clarification regarding multiple authors
+
+# To run this single test, use
+#
+#  ctest --verbose -R host_intersect_test_py
+#
+
+import unittest
+
+import torch
+
+import k2host
+
+
+class TestIntersection(unittest.TestCase):
+
+    def test_case_1(self):
+        # empty fsa
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa_a = k2host.Fsa.create_fsa_with_size(array_size)
+        fsa_b = k2host.Fsa.create_fsa_with_size(array_size)
+        intersection = k2host.Intersection(fsa_a, fsa_b)
+        array_size = k2host.IntArray2Size()
+        intersection.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map_a = k2host.IntArray1.create_array_with_size(array_size.size2)
+        arc_map_b = k2host.IntArray1.create_array_with_size(array_size.size2)
+        status = intersection.get_output(fsa_out, arc_map_a, arc_map_b)
+        self.assertTrue(status)
+        self.assertTrue(k2host.is_empty(fsa_out))
+        self.assertTrue(arc_map_a.empty())
+        self.assertTrue(arc_map_b.empty())
+
+        # test without arc_map
+        status = intersection.get_output(fsa_out)
+        self.assertTrue(status)
+        self.assertTrue(k2host.is_empty(fsa_out))
+
+    def test_case_2(self):
+        s_a = r'''
+        0 1 1 0
+        1 2 0 0
+        1 3 1 0
+        1 4 2 0
+        2 2 1 0
+        2 3 1 0
+        2 3 2 0
+        3 3 0 0
+        3 4 1 0
+        4
+        '''
+
+        fsa_a = k2host.str_to_fsa(s_a)
+
+        s_b = r'''
+        0 1 1 0
+        1 3 1 0
+        1 2 2 0
+        2 3 1 0
+        3
+        '''
+
+        fsa_b = k2host.str_to_fsa(s_b)
+        intersection = k2host.Intersection(fsa_a, fsa_b)
+        array_size = k2host.IntArray2Size()
+        intersection.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        arc_map_a = k2host.IntArray1.create_array_with_size(array_size.size2)
+        arc_map_b = k2host.IntArray1.create_array_with_size(array_size.size2)
+        status = intersection.get_output(fsa_out, arc_map_a, arc_map_b)
+        self.assertTrue(status)
+        expected_arc_indexes = torch.IntTensor([0, 1, 4, 7, 8, 8, 8, 10, 10])
+        expected_arcs = torch.IntTensor([[0, 1, 1, 0], [1, 2, 0, 0],
+                                         [1, 3, 1, 0], [1, 4, 2, 0],
+                                         [2, 5, 1, 0], [2, 3, 1, 0],
+                                         [2, 6, 2, 0], [3, 3, 0, 0],
+                                         [6, 6, 0, 0], [6, 7, 1, 0]])
+        expected_arc_map_a = torch.IntTensor([0, 1, 2, 3, 4, 5, 6, 7, 7, 8])
+        expected_arc_map_b = torch.IntTensor([0, -1, 1, 2, 1, 1, 2, -1, -1, 3])
+        self.assertTrue(torch.equal(fsa_out.indexes, expected_arc_indexes))
+        self.assertTrue(torch.equal(fsa_out.data, expected_arcs))
+        self.assertTrue(torch.equal(arc_map_a.data, expected_arc_map_a))
+        self.assertTrue(torch.equal(arc_map_b.data, expected_arc_map_b))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/k2/python/host/tests/properties_test.py b/k2/python/host/tests/properties_test.py
new file mode 100644
index 000000000..2533abab9
--- /dev/null
+++ b/k2/python/host/tests/properties_test.py
@@ -0,0 +1,330 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+#
+# See ../../../LICENSE for clarification regarding multiple authors
+
+# To run this single test, use
+#
+#  ctest --verbose -R host_properties_test_py
+#
+
+import unittest
+
+import torch
+
+import k2host
+
+
+class TestIsValid(unittest.TestCase):
+
+    def test_bad_case1(self):
+        # fsa should contain at least two states
+        array_size = k2host.IntArray2Size(1, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        self.assertFalse(k2host.is_valid(fsa))
+
+    def test_bad_case2(self):
+        # invalid: arcs entering the final state must have label kFinalSymbol
+        s = r'''
+        0 1 0 0
+        0 2 1 0
+        1 2 0 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.is_valid(fsa))
+
+    def test_bad_case3(self):
+        # `arc_indexes` and `arcs` in this state are not consistent
+        arc_indexes = torch.IntTensor([0, 2, 2, 2])
+        arcs = torch.IntTensor([[0, 1, 0, 0], [0, 2, 1, 0], [1, 2, 0, 0]])
+        fsa = k2host.Fsa(arc_indexes, arcs)
+        self.assertFalse(k2host.is_valid(fsa))
+
+    def test_good_cases1(self):
+        # empty fsa is valid
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        self.assertTrue(k2host.is_valid(fsa))
+
+    def test_good_case2(self):
+        s = r'''
+        0 1 0 0
+        0 2 0 0
+        2 3 -1 0
+        3
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertTrue(k2host.is_valid(fsa))
+
+    def test_good_case3(self):
+        s = r'''
+        0 1 0 0
+        0 2 -1 0
+        1 2 -1 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertTrue(k2host.is_valid(fsa))
+
+
+class TestIsTopSorted(unittest.TestCase):
+
+    def test_bad_cases1(self):
+        s = r'''
+        0 1 0 0
+        0 2 0 0
+        2 1 0 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.is_top_sorted(fsa))
+
+    def test_good_cases1(self):
+        # empty fsa
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        self.assertTrue(k2host.is_top_sorted(fsa))
+
+    def test_good_case2(self):
+        s = r'''
+        0 1 0 0
+        0 2 0 0
+        1 2 0 0
+        3
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertTrue(k2host.is_top_sorted(fsa))
+
+
+class TestIsArcSorted(unittest.TestCase):
+
+    def test_bad_cases1(self):
+        s = r'''
+        0 1 1 0
+        0 2 2 0
+        1 2 2 0
+        1 3 1 0
+        3
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.is_arc_sorted(fsa))
+
+    def test_bad_cases2(self):
+        # same label on two arcs
+        s = r'''
+        0 2 0 0
+        0 1 0 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.is_arc_sorted(fsa))
+
+    def test_good_cases1(self):
+        # empty fsa
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        self.assertTrue(k2host.is_arc_sorted(fsa))
+
+    def test_good_case2(self):
+        s = r'''
+        0 1 0 0
+        0 2 0 0
+        1 2 1 0
+        1 3 2 0
+        3
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertTrue(k2host.is_arc_sorted(fsa))
+
+
+class TestHasSelfLoops(unittest.TestCase):
+
+    def test_bad_cases1(self):
+        s = r'''
+        0 1 0 0
+        0 2 0 0
+        1 2 0 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.has_self_loops(fsa))
+
+    def test_bad_cases2(self):
+        # empty fsa
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        self.assertFalse(k2host.has_self_loops(fsa))
+
+    def test_good_case2(self):
+        s = r'''
+        0 1 0 0
+        1 2 0 0
+        1 1 0 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertTrue(k2host.has_self_loops(fsa))
+
+
+class TestIsDeterministic(unittest.TestCase):
+
+    def test_bad_cases1(self):
+        s = r'''
+        0 1 2 0
+        1 2 0 0
+        1 3 0 0
+        3
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.is_deterministic(fsa))
+
+    def test_good_cases1(self):
+        # empty fsa
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        self.assertTrue(k2host.is_deterministic(fsa))
+
+    def test_good_case2(self):
+        s = r'''
+        0 1 2 0
+        1 2 0 0
+        1 3 2 0
+        3
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertTrue(k2host.is_deterministic(fsa))
+
+
+class TestIsEpsilonFree(unittest.TestCase):
+
+    def test_bad_cases1(self):
+        s = r'''
+        0 1 2 0
+        0 2 0 0
+        1 2 1 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.is_epsilon_free(fsa))
+
+    def test_good_cases1(self):
+        # empty fsa
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        self.assertTrue(k2host.is_epsilon_free(fsa))
+
+    def test_good_case2(self):
+        s = r'''
+        0 1 2 0
+        0 2 1 0
+        1 2 1 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertTrue(k2host.is_epsilon_free(fsa))
+
+
+class TestIsConnected(unittest.TestCase):
+
+    def test_bad_cases1(self):
+        s = r'''
+        0 2 0 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.is_connected(fsa))
+
+    def test_bad_cases2(self):
+        s = r'''
+        0 1 0 0
+        0 2 0 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.is_connected(fsa))
+
+    def test_good_cases1(self):
+        # empty fsa
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        self.assertTrue(k2host.is_connected(fsa))
+
+    def test_good_case2(self):
+        s = r'''
+        0 1 0 0
+        0 3 0 0
+        1 2 0 0
+        2 3 0 0
+        3
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertTrue(k2host.is_connected(fsa))
+
+    def test_good_case3(self):
+        s = r'''
+        0 3 0 0
+        1 2 0 0
+        2 3 0 0
+        2 3 0 0
+        2 4 0 0
+        3 1 0 0
+        4
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertTrue(k2host.is_connected(fsa))
+
+
+class TestIsAcyclic(unittest.TestCase):
+
+    def test_bad_cases1(self):
+        s = r'''
+        0 1 2 0
+        0 4 0 0
+        0 2 0 0
+        1 2 1 0
+        1 3 0 0
+        2 1 0 0
+        3
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.is_acyclic(fsa))
+
+    def test_good_cases1(self):
+        # empty fsa
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        self.assertTrue(k2host.is_acyclic(fsa))
+
+    def test_good_case2(self):
+        s = r'''
+        0 1 2 0
+        0 2 1 0
+        1 2 0 0
+        1 3 5 0
+        2 3 6 0
+        3
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertTrue(k2host.is_acyclic(fsa))
+
+
+class TestIsEmpty(unittest.TestCase):
+
+    def test_good_cases1(self):
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        self.assertTrue(k2host.is_empty(fsa))
+
+    def test_bad_case1(self):
+        s = r'''
+        0 1 2 0
+        1
+        '''
+        fsa = k2host.str_to_fsa(s)
+        self.assertFalse(k2host.is_empty(fsa))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/k2/python/host/tests/rmepsilon_test.py b/k2/python/host/tests/rmepsilon_test.py
new file mode 100644
index 000000000..c2a90442c
--- /dev/null
+++ b/k2/python/host/tests/rmepsilon_test.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+#
+# See ../../../LICENSE for clarification regarding multiple authors
+
+# To run this single test, use
+#
+#  ctest --verbose -R host_rmepsilon_test_py
+#
+
+from struct import pack, unpack
+import unittest
+
+import torch
+
+import k2host
+
+
+class TestRmEpsilon(unittest.TestCase):
+
+    def setUp(self):
+        s = r'''
+        0 4 1 1
+        0 1 1 1
+        1 2 0 2
+        1 3 0 3
+        1 4 0 2
+        2 7 0 4
+        3 7 0 5
+        4 6 1 2
+        4 6 0 3
+        4 8 1 3
+        4 9 -1 2
+        5 9 -1 4
+        6 9 -1 3
+        7 9 -1 5
+        8 9 -1 6
+        9
+        '''
+        self.fsa = k2host.str_to_fsa(s)
+        self.num_states = self.fsa.num_states()
+
+    def test_max_weight(self):
+        forward_max_weights = k2host.DoubleArray1.create_array_with_size(
+            self.num_states)
+        backward_max_weights = k2host.DoubleArray1.create_array_with_size(
+            self.num_states)
+        wfsa = k2host.WfsaWithFbWeights(self.fsa,
+                                        k2host.FbWeightType.kMaxWeight,
+                                        forward_max_weights,
+                                        backward_max_weights)
+        beam = 8.0
+        remover = k2host.EpsilonsRemoverMax(wfsa, beam)
+        fsa_size = k2host.IntArray2Size()
+        arc_derivs_size = k2host.IntArray2Size()
+        remover.get_sizes(fsa_size, arc_derivs_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(fsa_size)
+        arc_derivs = k2host.IntArray2.create_array_with_size(arc_derivs_size)
+        arc_weights_out = k2host.FloatArray1.create_array_with_size(
+            fsa_size.size2)
+        remover.get_output(fsa_out, arc_derivs)
+        self.assertTrue(k2host.is_epsilon_free(fsa_out))
+        self.assertEqual(fsa_out.size1, 6)
+        self.assertEqual(fsa_out.size2, 11)  # TODO: fix this
+        self.assertEqual(arc_derivs.size1, 11)  # TODO: fix this
+        self.assertEqual(arc_derivs.size2, 18)  # TODO: fix this
+        self.assertTrue(
+            k2host.is_rand_equivalent_max_weight(self.fsa, fsa_out, beam))
+
+    def test_logsum_weight(self):
+        forward_logsum_weights = k2host.DoubleArray1.create_array_with_size(
+            self.num_states)
+        backward_logsum_weights = k2host.DoubleArray1.create_array_with_size(
+            self.num_states)
+        wfsa = k2host.WfsaWithFbWeights(self.fsa,
+                                        k2host.FbWeightType.kLogSumWeight,
+                                        forward_logsum_weights,
+                                        backward_logsum_weights)
+        beam = 8.0
+        remover = k2host.EpsilonsRemoverLogSum(wfsa, beam)
+        fsa_size = k2host.IntArray2Size()
+        arc_derivs_size = k2host.IntArray2Size()
+        remover.get_sizes(fsa_size, arc_derivs_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(fsa_size)
+        arc_derivs = k2host.LogSumArcDerivs.create_arc_derivs_with_size(
+            arc_derivs_size)
+        arc_weights_out = k2host.FloatArray1.create_array_with_size(
+            fsa_size.size2)
+        remover.get_output(fsa_out, arc_derivs)
+        self.assertTrue(k2host.is_epsilon_free(fsa_out))
+        self.assertEqual(fsa_out.size1, 6)
+        self.assertEqual(fsa_out.size2, 11)  # TODO: fix this
+        self.assertEqual(arc_derivs.size1, 11)  # TODO: fix this
+        self.assertEqual(arc_derivs.size2, 20)  # TODO: fix this
+        self.assertTrue(
+            k2host.is_rand_equivalent_after_rmeps_pruned_logsum(
+                self.fsa, fsa_out, beam))
+        # cast float to int
+        arc_ids = k2host.StridedIntArray1.from_float_tensor(
+            arc_derivs.data[:, 0])
+        # We may get a different value of `arc_ids.get_data(1)`
+        # with different STL implementations, because rmepsilon
+        # is implemented with `std::unordered_map`; thus the
+        # assertion below may fail on some platforms.
+        self.assertEqual(arc_ids.get_data(1), 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/k2/python/host/tests/topsort_test.py b/k2/python/host/tests/topsort_test.py
new file mode 100644
index 000000000..ecadac7a9
--- /dev/null
+++ b/k2/python/host/tests/topsort_test.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+#
+# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+#
+# See ../../../LICENSE for clarification regarding multiple authors
+
+# To run this single test, use
+#
+#  ctest --verbose -R host_topsort_test_py
+#
+
+import unittest
+
+import torch
+
+import k2host
+
+
+class TestTopSorter(unittest.TestCase):
+
+    def test_case_1(self):
+        # empty fsa
+        array_size = k2host.IntArray2Size(0, 0)
+        fsa = k2host.Fsa.create_fsa_with_size(array_size)
+        sorter = k2host.TopSorter(fsa)
+        array_size = k2host.IntArray2Size()
+        sorter.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        state_map = k2host.IntArray1.create_array_with_size(array_size.size1)
+        status = sorter.get_output(fsa_out, state_map)
+        self.assertTrue(status)
+        self.assertTrue(k2host.is_empty(fsa_out))
+        self.assertTrue(state_map.empty())
+
+        # test without state_map
+        sorter.get_output(fsa_out)
+        self.assertTrue(k2host.is_empty(fsa_out))
+
+    def test_case_2(self):
+        # non-connected fsa (state 1 is not accessible from state 0)
+        s = r'''
+        0 2 -1 0
+        1 2 -1 0
+        1 2 0 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        sorter = k2host.TopSorter(fsa)
+        array_size = k2host.IntArray2Size()
+        sorter.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        state_map = k2host.IntArray1.create_array_with_size(array_size.size1)
+        status = sorter.get_output(fsa_out, state_map)
+        self.assertFalse(status)
+        self.assertTrue(k2host.is_empty(fsa_out))
+        self.assertTrue(state_map.empty())
+
+    def test_case_3(self):
+        # non-connected fsa (not accessible)
+        s = r'''
+        0 2 -1 0
+        1 0 1 0
+        1 2 0 0
+        2
+        '''
+        fsa = k2host.str_to_fsa(s)
+        sorter = k2host.TopSorter(fsa)
+        array_size = k2host.IntArray2Size()
+        sorter.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        state_map = k2host.IntArray1.create_array_with_size(array_size.size1)
+        status = sorter.get_output(fsa_out, state_map)
+        self.assertFalse(status)
+        self.assertTrue(k2host.is_empty(fsa_out))
+        self.assertTrue(state_map.empty())
+
+    def test_case_4(self):
+        # connected fsa
+        s = r'''
+        0 4 40 0
+        0 2 20 0
+        1 6 -1 0
+        2 3 30 0
+        3 6 -1 0
+        3 1 10 0
+        4 5 50 0
+        5 2 8 0
+        6
+        '''
+        fsa = k2host.str_to_fsa(s)
+        sorter = k2host.TopSorter(fsa)
+        array_size = k2host.IntArray2Size()
+        sorter.get_sizes(array_size)
+        fsa_out = k2host.Fsa.create_fsa_with_size(array_size)
+        state_map = k2host.IntArray1.create_array_with_size(array_size.size1)
+        status = sorter.get_output(fsa_out, state_map)
+        self.assertTrue(status)
+        expected_arc_indexes = torch.IntTensor([0, 2, 3, 4, 5, 7, 8, 8])
+        expected_arcs = torch.IntTensor([[0, 1, 40, 0], [0, 3, 20, 0],
+                                         [1, 2, 50, 0], [2, 3, 8, 0],
+                                         [3, 4, 30, 0], [4, 6, -1, 0],
+                                         [4, 5, 10, 0], [5, 6, -1, 0]])
+        expected_state_map = torch.IntTensor([0, 4, 5, 2, 3, 1, 6])
+        self.assertTrue(torch.equal(fsa_out.indexes, expected_arc_indexes))
+        self.assertTrue(torch.equal(fsa_out.data, expected_arcs))
+        self.assertTrue(torch.equal(state_map.data, expected_state_map))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/k2/python/tests/weights_test.py b/k2/python/host/tests/weights_test.py
similarity index 67%
rename from k2/python/tests/weights_test.py
rename to k2/python/host/tests/weights_test.py
index b66e273a5..4eef9d06f 100644
--- a/k2/python/tests/weights_test.py
+++ b/k2/python/host/tests/weights_test.py
@@ -6,14 +6,14 @@
 
 # To run this single test, use
 #
-#  ctest --verbose -R weights_test_py
+#  ctest --verbose -R host_weights_test_py
 #
 
 import unittest
 
 import torch
 
-import k2
+import k2host
 
 
 class TestWfsa(unittest.TestCase):
@@ -34,19 +34,20 @@ def setUp(self):
         8 9 -1
         9
         '''
-        self.fsa = k2.str_to_fsa(s)
+        self.fsa = k2host.str_to_fsa(s)
         self.num_states = self.fsa.num_states()
         weights = torch.FloatTensor([1, 1, 2, 3, 4, 5, 2, 3, 4, 3, 5, 6])
-        self.weights = k2.FloatArray1(weights)
+        self.weights = k2host.FloatArray1(weights)
 
     def test_max_weight(self):
-        forward_max_weights = k2.DoubleArray1.create_array_with_size(
+        forward_max_weights = k2host.DoubleArray1.create_array_with_size(
             self.num_states)
-        backward_max_weights = k2.DoubleArray1.create_array_with_size(
+        backward_max_weights = k2host.DoubleArray1.create_array_with_size(
             self.num_states)
-        wfsa = k2.WfsaWithFbWeights(self.fsa, self.weights,
-                                    k2.FbWeightType.kMaxWeight,
-                                    forward_max_weights, backward_max_weights)
+        wfsa = k2host.WfsaWithFbWeights(self.fsa, self.weights,
+                                        k2host.FbWeightType.kMaxWeight,
+                                        forward_max_weights,
+                                        backward_max_weights)
         expected_forward_max_weights = torch.DoubleTensor(
             [0, 1, 3, 4, 1, float('-inf'), 3, 9, 4, 14])
         expected_backward_max_weights = torch.DoubleTensor(
@@ -62,14 +63,14 @@ def test_max_weight(self):
                            expected_backward_max_weights))
 
     def test_logsum_weight(self):
-        forward_logsum_weights = k2.DoubleArray1.create_array_with_size(
+        forward_logsum_weights = k2host.DoubleArray1.create_array_with_size(
             self.num_states)
-        backward_logsum_weights = k2.DoubleArray1.create_array_with_size(
+        backward_logsum_weights = k2host.DoubleArray1.create_array_with_size(
             self.num_states)
-        wfsa = k2.WfsaWithFbWeights(self.fsa, self.weights,
-                                    k2.FbWeightType.kLogSumWeight,
-                                    forward_logsum_weights,
-                                    backward_logsum_weights)
+        wfsa = k2host.WfsaWithFbWeights(self.fsa, self.weights,
+                                        k2host.FbWeightType.kLogSumWeight,
+                                        forward_logsum_weights,
+                                        backward_logsum_weights)
         expected_forward_logsum_weights = torch.DoubleTensor(
             [0, 1, 3, 4, 1,
              float('-inf'), 3, 9.126928, 4, 14.143222])
diff --git a/k2/python/k2/__init__.py b/k2/python/k2/__init__.py
index 78fca0185..f08a26825 100644
--- a/k2/python/k2/__init__.py
+++ b/k2/python/k2/__init__.py
@@ -1,10 +1,3 @@
-from _k2 import IntArray2Size
-from _k2 import FbWeightType
-from .array import *
-from .aux_labels import *
-from .fsa import *
-from .fsa_algo import *
-from .fsa_equivalent import *
-from .fsa_util import str_to_fsa
-from .properties import *
-from .weights import *
+from .array import Array
+
+__all__ = ['Array']
diff --git a/k2/python/k2/array.py b/k2/python/k2/array.py
index 9a5c5f495..a03f1dfda 100644
--- a/k2/python/k2/array.py
+++ b/k2/python/k2/array.py
@@ -1,107 +1,59 @@
-# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-
+# Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
+#
 # See ../../../LICENSE for clarification regarding multiple authors
 
-import torch
-from torch.utils.dlpack import to_dlpack
-
-from _k2 import IntArray2Size
-from _k2 import DLPackIntArray2
-from _k2 import DLPackIntArray1
-from _k2 import DLPackStridedIntArray1
-from _k2 import DLPackFloatArray1
-from _k2 import DLPackDoubleArray1
-from _k2 import DLPackLogSumArcDerivs
-
-
-class IntArray1(DLPackIntArray1):
-
-    def __init__(self, data: torch.Tensor, check_dtype: bool = True):
-        if check_dtype:
-            assert data.dtype == torch.int32
-        self.data = data
-        super().__init__(to_dlpack(self.data))
-
-    @staticmethod
-    def from_float_tensor(data: torch.Tensor) -> 'IntArray1':
-        assert data.dtype == torch.float
-        return IntArray1(data, False)
-
-    @staticmethod
-    def create_array_with_size(size: int) -> 'IntArray1':
-        data = torch.zeros(size, dtype=torch.int32)
-        return IntArray1(data)
-
-
-class StridedIntArray1(DLPackStridedIntArray1):
-
-    def __init__(self, data: torch.Tensor, check_dtype: bool = True):
-        if check_dtype:
-            assert data.dtype == torch.int32
-        self.data = data
-        super().__init__(to_dlpack(self.data))
-
-    @staticmethod
-    def from_float_tensor(data: torch.Tensor) -> 'StridedIntArray1':
-        assert data.dtype == torch.float
-        return StridedIntArray1(data, False)
-
-
-class FloatArray1(DLPackFloatArray1):
+from typing import Union
 
-    def __init__(self, data: torch.Tensor):
-        assert data.dtype == torch.float
-        self.data = data
-        super().__init__(to_dlpack(self.data))
-
-    @staticmethod
-    def create_array_with_size(size: int) -> 'FloatArray1':
-        data = torch.zeros(size, dtype=torch.float)
-        return FloatArray1(data)
-
-
-class DoubleArray1(DLPackDoubleArray1):
-
-    def __init__(self, data: torch.Tensor):
-        assert data.dtype == torch.double
-        self.data = data
-        super().__init__(to_dlpack(self.data))
-
-    @staticmethod
-    def create_array_with_size(size: int) -> 'DoubleArray1':
-        data = torch.zeros(size, dtype=torch.double)
-        return DoubleArray1(data)
-
-
-class IntArray2(DLPackIntArray2):
-
-    def __init__(self, indexes: torch.Tensor, data: torch.Tensor):
-        assert indexes.dtype == torch.int32
-        assert data.dtype == torch.int32
-        self.indexes = indexes
-        self.data = data
-        super().__init__(to_dlpack(self.indexes), to_dlpack(self.data))
-
-    @staticmethod
-    def create_array_with_size(array_size: IntArray2Size) -> 'IntArray2':
-        indexes = torch.zeros(array_size.size1 + 1, dtype=torch.int32)
-        data = torch.zeros(array_size.size2, dtype=torch.int32)
-        return IntArray2(indexes, data)
-
-
-class LogSumArcDerivs(DLPackLogSumArcDerivs):
-
-    def __init__(self, indexes: torch.Tensor, data: torch.Tensor):
-        assert indexes.dtype == torch.int32
-        assert data.dtype == torch.float32
-        assert data.shape[1] == 2
-        self.indexes = indexes
-        self.data = data
-        super().__init__(to_dlpack(self.indexes), to_dlpack(self.data))
+import torch
 
-    @staticmethod
-    def create_arc_derivs_with_size(
-            array_size: IntArray2Size) -> 'LogSumArcDerivs':
-        indexes = torch.zeros(array_size.size1 + 1, dtype=torch.int32)
-        data = torch.zeros([array_size.size2, 2], dtype=torch.float32)
-        return LogSumArcDerivs(indexes, data)
+from _k2 import _FloatArray1
+from _k2 import _Int32Array1
+
+
+def _to_float_array1(tensor: torch.Tensor) -> _FloatArray1:
+    return _FloatArray1.from_tensor(tensor)  # delegate to the `_k2` class
+
+
+def _to_int32_array1(tensor: torch.Tensor) -> _Int32Array1:
+    return _Int32Array1.from_tensor(tensor)  # delegate to the `_k2` class
+
+
+def _from_tensor(tensor: torch.Tensor) -> Union[_FloatArray1, _Int32Array1]:
+    '''Wrap a 1-D int32/float tensor in a `_k2` array sharing its memory.
+
+    Raise `ValueError` for any other number of dimensions or dtype.
+    '''
+    if tensor.ndim != 1:
+        # TODO(fangjun): support Array2
+        raise ValueError(f'Unsupported dimension {tensor.ndim}')
+
+    if tensor.dtype == torch.int32:
+        return _to_int32_array1(tensor)
+    if tensor.dtype == torch.float:
+        return _to_float_array1(tensor)
+
+    # TODO(fangjun): support other data types
+    raise ValueError(f'Unsupported dtype {tensor.dtype}')
+
+
+class Array(object):
+    '''Python wrapper around k2::Array1<T> from C++ (float and int32 only).
+
+    Its only method, `tensor()`, exposes the data as a `torch.Tensor`.
+    '''
+
+    def __init__(self, data: Union[torch.Tensor, _FloatArray1, _Int32Array1]):
+        '''Accept a `torch.Tensor` (converted by `_from_tensor`) or a
+        `_FloatArray1`/`_Int32Array1` (stored as is); else raise `ValueError`.
+        '''
+        if isinstance(data, (_FloatArray1, _Int32Array1)):
+            self.data = data
+        elif isinstance(data, torch.Tensor):
+            self.data = _from_tensor(data)
+        else:
+            raise ValueError(f'Unsupported type {type(data)}')
+
+    def tensor(self) -> torch.Tensor:
+        '''Return a `torch.Tensor` sharing memory with the wrapped array.
+        '''
+        return self.data.tensor()
diff --git a/k2/python/tests/CMakeLists.txt b/k2/python/tests/CMakeLists.txt
index ad26ebedb..722f9eeab 100644
--- a/k2/python/tests/CMakeLists.txt
+++ b/k2/python/tests/CMakeLists.txt
@@ -17,18 +17,7 @@ endfunction()
 
 # please sort the files in alphabetic order
 set(py_test_files
-    arcsort_test.py
-    array_test.py
-    aux_labels_test.py
-    connect_test.py
-    determinize_test.py
-    fsa_equivalent_test.py
-    fsa_test.py
-    intersect_test.py
-    properties_test.py
-    rmepsilon_test.py
-    topsort_test.py
-    weights_test.py
+  array_test.py
 )
 
 foreach(source IN LISTS py_test_files)
diff --git a/k2/python/tests/arcsort_test.py b/k2/python/tests/arcsort_test.py
deleted file mode 100644
index 9269fee73..000000000
--- a/k2/python/tests/arcsort_test.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-#
-# See ../../../LICENSE for clarification regarding multiple authors
-
-# To run this single test, use
-#
-#  ctest --verbose -R arcsort_test_py
-#
-
-import unittest
-
-import torch
-
-import k2
-
-
-class TestArcSort(unittest.TestCase):
-
-    def test_empty_fsa(self):
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map = k2.IntArray1.create_array_with_size(fsa.size2)
-        k2.arc_sort(fsa, arc_map)
-        self.assertTrue(k2.is_empty(fsa))
-        self.assertTrue(arc_map.empty())
-
-        # test without arc_map
-        k2.arc_sort(fsa)
-        self.assertTrue(k2.is_empty(fsa))
-
-    def test_arc_sort(self):
-        s = r'''
-        0 1 2
-        0 4 0
-        0 2 0
-        1 2 1
-        1 3 0
-        2 1 0
-        4
-        '''
-
-        fsa = k2.str_to_fsa(s)
-        arc_map = k2.IntArray1.create_array_with_size(fsa.size2)
-        k2.arc_sort(fsa, arc_map)
-        expected_arc_indexes = torch.IntTensor([0, 3, 5, 6, 6, 6])
-        expected_arcs = torch.IntTensor([[0, 2, 0], [0, 4, 0], [0, 1, 2],
-                                         [1, 3, 0], [1, 2, 1], [2, 1, 0]])
-        expected_arc_map = torch.IntTensor([2, 1, 0, 4, 3, 5])
-        self.assertTrue(torch.equal(fsa.indexes, expected_arc_indexes))
-        self.assertTrue(torch.equal(fsa.data, expected_arcs))
-        self.assertTrue(torch.equal(arc_map.data, expected_arc_map))
-
-
-class TestArcSorter(unittest.TestCase):
-
-    def test_empty_fsa(self):
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        sorter = k2.ArcSorter(fsa)
-        array_size = k2.IntArray2Size()
-        sorter.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map = k2.IntArray1.create_array_with_size(array_size.size2)
-        sorter.get_output(fsa_out, arc_map)
-        self.assertTrue(k2.is_empty(fsa))
-
-        # test without arc_map
-        sorter.get_output(fsa_out)
-        self.assertTrue(k2.is_empty(fsa_out))
-
-    def test_arc_sort(self):
-        s = r'''
-        0 1 2
-        0 4 0
-        0 2 0
-        1 2 1
-        1 3 0
-        2 1 0
-        4
-        '''
-
-        fsa = k2.str_to_fsa(s)
-        sorter = k2.ArcSorter(fsa)
-        array_size = k2.IntArray2Size()
-        sorter.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map = k2.IntArray1.create_array_with_size(array_size.size2)
-        sorter.get_output(fsa_out, arc_map)
-        expected_arc_indexes = torch.IntTensor([0, 3, 5, 6, 6, 6])
-        expected_arcs = torch.IntTensor([[0, 2, 0], [0, 4, 0], [0, 1, 2],
-                                         [1, 3, 0], [1, 2, 1], [2, 1, 0]])
-        expected_arc_map = torch.IntTensor([2, 1, 0, 4, 3, 5])
-        self.assertTrue(torch.equal(fsa_out.indexes, expected_arc_indexes))
-        self.assertTrue(torch.equal(fsa_out.data, expected_arcs))
-        self.assertTrue(torch.equal(arc_map.data, expected_arc_map))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/k2/python/tests/array_test.py b/k2/python/tests/array_test.py
index dd7f600c5..d4fac6f49 100644
--- a/k2/python/tests/array_test.py
+++ b/k2/python/tests/array_test.py
@@ -1,106 +1,120 @@
 #!/usr/bin/env python3
 #
-# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
+# Copyright (c)  2020  Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
 #
 # See ../../../LICENSE for clarification regarding multiple authors
 
 # To run this single test, use
 #
 #  ctest --verbose -R array_test_py
-#
 
-from struct import pack, unpack
 import unittest
 
+import k2
 import torch
 
-import k2
+import _k2  # for test only, users should not import it.
 
 
 class TestArray(unittest.TestCase):
 
-    def test_int_array1(self):
-        data = torch.arange(10).to(torch.int32)
-
-        array = k2.IntArray1(data)
-        self.assertFalse(array.empty())
-        self.assertIsInstance(array, k2.IntArray1)
-        self.assertEqual(data.numel(), array.size)
-        self.assertEqual(array.data[9], 9)
-
-        # the underlying memory is shared between k2 and torch;
-        # so change one will change another
-        data[0] = 100
-        self.assertEqual(array.data[0], 100)
-        self.assertEqual(array.get_data(0), 100)
-
-        del data
-        # the array in k2 is still accessible
-        self.assertEqual(array.data[0], 100)
-        self.assertEqual(array.get_data(0), 100)
-
-    def test_int_array2(self):
-        data = torch.arange(10).to(torch.int32)
-        indexes = torch.tensor([0, 2, 5, 6, 10]).to(torch.int32)
-        self.assertEqual(data.numel(), indexes[-1].item())
-
-        array = k2.IntArray2(indexes, data)
-        self.assertFalse(array.empty())
-        self.assertIsInstance(array, k2.IntArray2)
-
-        self.assertEqual(indexes.numel(), array.size1 + 1)
-        self.assertEqual(data.numel(), array.size2)
-        self.assertEqual(array.data[9], 9)
-
-        # the underlying memory is shared between k2 and torch;
-        # so change one will change another
-        data[0] = 100
-        self.assertEqual(array.data[0], 100)
-        self.assertEqual(array.get_data(0), 100)
-        indexes[1] = 3
-        self.assertEqual(array.indexes[1], 3)
-        self.assertEqual(array.get_indexes(1), 3)
-
-        del data
-        del indexes
-        # the array in k2 is still accessible
-        self.assertEqual(array.data[0], 100)
-        self.assertEqual(array.get_data(0), 100)
-        self.assertEqual(array.indexes[1], 3)
-        self.assertEqual(array.get_indexes(1), 3)
-
-    def test_logsum_arc_derivs(self):
-        data = torch.arange(10).reshape(5, 2).to(torch.float)
-        indexes = torch.tensor([0, 2, 3, 5]).to(torch.int32)
-        self.assertEqual(data.shape[0], indexes[-1].item())
-
-        array = k2.LogSumArcDerivs(indexes, data)
-        self.assertFalse(array.empty())
-        self.assertIsInstance(array, k2.LogSumArcDerivs)
-
-        self.assertEqual(indexes.numel(), array.size1 + 1)
-        self.assertEqual(data.shape[0], array.size2)
-        self.assertTrue(torch.equal(array.data[1], torch.FloatTensor([2, 3])))
-
-        # convert arc-ids in arc-derivs to IntArray
-        arc_ids = k2.StridedIntArray1.from_float_tensor(array.data[:, 0])
-        # the underlying memory is shared between k2 and torch;
-        # so change one will change another
-        data[1] = torch.FloatTensor([100, 200])
-        self.assertTrue(
-            torch.equal(array.data[1], torch.FloatTensor([100, 200])))
-        self.assertEqual(array.get_data(1)[1], 200)
-        self.assertEqual(arc_ids.data[1], 100)
-        # we need pack and then unpack here to interpret arc_id (int) as a float,
-        # this is only for test purpose as users would usually never call
-        # `array.get_data` to retrieve data. Instead, it is supposed to call
-        # `array.data` to retrieve or update data in the array object.
-        arc_id = pack('i', array.get_data(1)[0])
-        self.assertEqual(unpack('f', arc_id)[0], 100)
-
-        del data
-        # the array in k2 is still accessible
-        self.assertEqual(array.get_data(1)[1], 200)
+    def test_cpu_int_array1_to_tensor(self):
+        # a CPU int32 `_k2` array and its tensor must share memory
+        _arr = _k2.get_cpu_int_array1()
+        arr = k2.Array(_arr)
+
+        tensor = arr.tensor()
+        assert tensor.ndim == 1
+        assert tensor.dtype == torch.int32
+        assert tensor.device.type == 'cpu'
+        assert tensor[0] == _arr.get(0)
+
+        # now we change the tensor, `_arr` should also be changed
+        # since they share the underlying memory
+        tensor[0] += 100
+        assert tensor[0] == _arr.get(0)
+
+        # copy the value out; `tensor[0]` alone is a view of the storage
+        val = tensor[0].item()
+        del _arr, arr
+        assert tensor[0] == val, 'tensor should still be accessible'
+        del tensor
+
+    def test_cpu_float_array1_from_tensor(self):
+        # round trip on CPU: torch.Tensor -> k2.Array -> torch.Tensor
+        gt_tensor = torch.tensor([1, 2, 3], dtype=torch.float)
+        array = k2.Array(gt_tensor)
+        actual_tensor = array.tensor()
+
+        assert actual_tensor.dtype == gt_tensor.dtype
+        assert actual_tensor.device == gt_tensor.device
+        assert actual_tensor.ndim == gt_tensor.ndim
+        assert torch.allclose(gt_tensor, actual_tensor)
+
+        gt_tensor += 100
+        assert torch.allclose(gt_tensor, actual_tensor), \
+                'actual_tensor should share the same memory with gt_tensor'
+
+        # copy the value out; `gt_tensor[0]` alone is a view of the storage
+        val = gt_tensor[0].item()
+        del gt_tensor, array
+        actual_tensor[0] += 1
+        val += 1
+        assert val == actual_tensor[0], \
+                'actual_tensor[0] should still be accessible'
+        del actual_tensor
+
+    def test_cuda_float_array1_to_tensor(self):
+        # a CUDA float `_k2` array and its tensor must share memory
+        device_id = 0
+        _arr = _k2.get_cuda_float_array1(device_id)
+        arr = k2.Array(_arr)
+
+        tensor = arr.tensor()
+        assert tensor.ndim == 1
+        assert tensor.dtype == torch.float
+        assert tensor.device.type == 'cuda'
+        assert tensor.device.index == device_id
+        assert tensor[0] == _arr.get(0)
+
+        # now we change the tensor, `_arr` should also be changed
+        # since they share the underlying memory
+        tensor[0] += 100
+        assert tensor[0] == _arr.get(0)
+
+        # copy the value out; `tensor[0]` alone is a view of the storage
+        val = tensor[0].item()
+        del _arr, arr
+        tensor[0] += 1
+        val += 1
+        assert tensor[0] == val, 'tensor should still be accessible'
+        del tensor
+
+    def test_cuda_int_array1_from_tensor(self):
+        # round trip on CUDA: torch.Tensor -> k2.Array -> torch.Tensor
+        device_id = 0
+        device = torch.device('cuda', device_id)
+        gt_tensor = torch.tensor([1, 2, 3], dtype=torch.int32).to(device)
+        array = k2.Array(gt_tensor)
+        actual_tensor = array.tensor()
+
+        assert actual_tensor.dtype == gt_tensor.dtype
+        assert actual_tensor.device == gt_tensor.device
+        assert actual_tensor.ndim == gt_tensor.ndim
+        assert torch.allclose(gt_tensor, actual_tensor)
+
+        gt_tensor += 100
+        assert torch.allclose(gt_tensor, actual_tensor), \
+                'actual_tensor should share the same memory with gt_tensor'
+
+        # copy the value out; `gt_tensor[0]` alone is a view of the storage
+        val = gt_tensor[0].item()
+        del gt_tensor, array
+        actual_tensor[0] += 1
+        val += 1
+        assert val == actual_tensor[0], \
+                'actual_tensor[0] should still be accessible'
+        del actual_tensor
 
 
 if __name__ == '__main__':
diff --git a/k2/python/tests/connect_test.py b/k2/python/tests/connect_test.py
deleted file mode 100644
index f1e5f200d..000000000
--- a/k2/python/tests/connect_test.py
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-#
-# See ../../../LICENSE for clarification regarding multiple authors
-
-# To run this single test, use
-#
-#  ctest --verbose -R connect_test_py
-#
-
-import unittest
-
-import torch
-
-import k2
-
-
-class TestConnection(unittest.TestCase):
-
-    def test_case_1(self):
-        # a non-connected, non-topsorted, acyclic input fsa;
-        # the output fsa is topsorted.
-        s = r'''
-        0 1 1
-        0 2 2
-        1 3 3
-        1 6 -1
-        2 4 2
-        2 6 -1
-        2 1 1
-        5 0 1
-        6
-        '''
-        fsa = k2.str_to_fsa(s)
-        connection = k2.Connection(fsa)
-        array_size = k2.IntArray2Size()
-        connection.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map = k2.IntArray1.create_array_with_size(array_size.size2)
-        status = connection.get_output(fsa_out, arc_map)
-        self.assertTrue(status)
-        expected_arc_indexes = torch.IntTensor([0, 2, 4, 5, 5])
-        expected_arcs = torch.IntTensor([[0, 2, 1], [0, 1, 2], [1, 3, -1],
-                                         [1, 2, 1], [2, 3, -1]])
-        expected_arc_map = torch.IntTensor([0, 1, 5, 6, 3])
-        self.assertTrue(torch.equal(fsa_out.indexes, expected_arc_indexes))
-        self.assertTrue(torch.equal(fsa_out.data, expected_arcs))
-        self.assertTrue(torch.equal(arc_map.data, expected_arc_map))
-
-    def test_case_2(self):
-        # a cyclic input fsa
-        # after trimming, the cycle is removed;
-        # so the output fsa should be topsorted.
-        s = r'''
-        0 1 1
-        0 2 2
-        1 3 3
-        1 6 6
-        2 4 2
-        2 6 3
-        2 6 -1
-        5 0 1
-        5 7 -1
-        7
-        '''
-        fsa = k2.str_to_fsa(s)
-        connection = k2.Connection(fsa)
-        array_size = k2.IntArray2Size()
-        connection.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map = k2.IntArray1.create_array_with_size(array_size.size2)
-        status = connection.get_output(fsa_out, arc_map)
-        self.assertTrue(status)
-        self.assertTrue(k2.is_empty(fsa_out))
-        self.assertTrue(arc_map.empty())
-
-    def test_case_3(self):
-        # a non-connected, non-topsorted, acyclic input fsa;
-        # the output fsa is topsorted.
-        s = r'''
-        0 3 3
-        0 5 5
-        1 2 2
-        2 1 1
-        3 5 5
-        3 2 2
-        3 4 4
-        3 6 -1
-        4 5 5
-        4 6 -1
-        5 6 -1
-        6
-        '''
-        fsa = k2.str_to_fsa(s)
-        connection = k2.Connection(fsa)
-        array_size = k2.IntArray2Size()
-        connection.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        connection.get_output(fsa_out)
-        self.assertTrue(k2.is_top_sorted(fsa_out))
-
-    def test_case_4(self):
-        # a cyclic input fsa
-        # after trimming, the cycle remains (it is not a self-loop);
-        # so the output fsa is NOT topsorted.
-        s = r'''
-        0 3 3
-        0 2 2
-        1 0 1
-        2 6 -1
-        3 5 5
-        3 2 2
-        3 5 5
-        4 4 4
-        5 3 3
-        5 4 4
-        6
-        '''
-        fsa = k2.str_to_fsa(s)
-        connection = k2.Connection(fsa)
-        array_size = k2.IntArray2Size()
-        connection.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        status = connection.get_output(fsa_out)
-        self.assertFalse(status)
-        self.assertFalse(k2.is_top_sorted(fsa_out))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/k2/python/tests/determinize_test.py b/k2/python/tests/determinize_test.py
deleted file mode 100644
index 5f48b4e59..000000000
--- a/k2/python/tests/determinize_test.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-#
-# See ../../../LICENSE for clarification regarding multiple authors
-
-# To run this single test, use
-#
-#  ctest --verbose -R determinize_test_py
-#
-
-import unittest
-
-import torch
-
-import k2
-
-
-class TestDeterminize(unittest.TestCase):
-
-    def setUp(self):
-        s = r'''
-        0 4 1
-        0 1 1
-        1 2 2
-        1 3 3
-        2 7 1
-        3 7 1
-        4 6 1
-        4 6 1
-        4 5 1
-        4 8 -1
-        5 8 -1
-        6 8 -1
-        7 8 -1
-        8
-        '''
-        self.fsa = k2.str_to_fsa(s)
-        self.num_states = self.fsa.num_states()
-        weights = torch.FloatTensor([1, 1, 2, 3, 4, 5, 2, 3, 3, 2, 4, 3, 5])
-        self.weights = k2.FloatArray1(weights)
-
-    def test_max_weight(self):
-        forward_max_weights = k2.DoubleArray1.create_array_with_size(
-            self.num_states)
-        backward_max_weights = k2.DoubleArray1.create_array_with_size(
-            self.num_states)
-        wfsa = k2.WfsaWithFbWeights(self.fsa, self.weights,
-                                    k2.FbWeightType.kMaxWeight,
-                                    forward_max_weights, backward_max_weights)
-        beam = 10.0
-        determinizer = k2.DeterminizerMax(wfsa, beam, 100)
-        fsa_size = k2.IntArray2Size()
-        arc_derivs_size = k2.IntArray2Size()
-        determinizer.get_sizes(fsa_size, arc_derivs_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(fsa_size)
-        arc_derivs = k2.IntArray2.create_array_with_size(arc_derivs_size)
-        arc_weights_out = k2.FloatArray1.create_array_with_size(fsa_size.size2)
-        determinizer.get_output(fsa_out, arc_weights_out, arc_derivs)
-        self.assertTrue(k2.is_deterministic(fsa_out))
-        self.assertEqual(fsa_out.size1, 7)
-        self.assertEqual(fsa_out.size2, 9)
-        self.assertEqual(arc_derivs.size1, 9)
-        self.assertEqual(arc_derivs.size2, 12)
-        self.assertTrue(
-            k2.is_rand_equivalent_max_weight(self.fsa, self.weights, fsa_out,
-                                             arc_weights_out, beam))
-
-    def test_logsum_weight(self):
-        forward_logsum_weights = k2.DoubleArray1.create_array_with_size(
-            self.num_states)
-        backward_logsum_weights = k2.DoubleArray1.create_array_with_size(
-            self.num_states)
-        wfsa = k2.WfsaWithFbWeights(self.fsa, self.weights,
-                                    k2.FbWeightType.kLogSumWeight,
-                                    forward_logsum_weights,
-                                    backward_logsum_weights)
-        beam = 10.0
-        determinizer = k2.DeterminizerLogSum(wfsa, beam, 100)
-        fsa_size = k2.IntArray2Size()
-        arc_derivs_size = k2.IntArray2Size()
-        determinizer.get_sizes(fsa_size, arc_derivs_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(fsa_size)
-        arc_derivs = k2.LogSumArcDerivs.create_arc_derivs_with_size(
-            arc_derivs_size)
-        arc_weights_out = k2.FloatArray1.create_array_with_size(fsa_size.size2)
-        determinizer.get_output(fsa_out, arc_weights_out, arc_derivs)
-        self.assertTrue(k2.is_deterministic(fsa_out))
-        self.assertEqual(fsa_out.size1, 7)
-        self.assertEqual(fsa_out.size2, 9)
-        self.assertEqual(arc_derivs.size1, 9)
-        self.assertEqual(arc_derivs.size2, 15)
-        self.assertTrue(
-            k2.is_rand_equivalent_logsum_weight(self.fsa, self.weights,
-                                                fsa_out, arc_weights_out,
-                                                beam))
-        # cast float to int
-        arc_ids = k2.StridedIntArray1.from_float_tensor(arc_derivs.data[:, 0])
-        # we may get different value of `arc_ids.get_data(1)`
-        # with different STL implementations as we use
-        # `std::unordered_map` in implementation of determinize
-        # self.assertEqual(arc_ids.get_data(1), 9)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/k2/python/tests/intersect_test.py b/k2/python/tests/intersect_test.py
deleted file mode 100644
index aef91bcda..000000000
--- a/k2/python/tests/intersect_test.py
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-#
-# See ../../../LICENSE for clarification regarding multiple authors
-
-# To run this single test, use
-#
-#  ctest --verbose -R intersect_test_py
-#
-
-import unittest
-
-import torch
-
-import k2
-
-
-class TestIntersection(unittest.TestCase):
-
-    def test_case_1(self):
-        # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa_a = k2.Fsa.create_fsa_with_size(array_size)
-        fsa_b = k2.Fsa.create_fsa_with_size(array_size)
-        intersection = k2.Intersection(fsa_a, fsa_b)
-        array_size = k2.IntArray2Size()
-        intersection.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map_a = k2.IntArray1.create_array_with_size(array_size.size2)
-        arc_map_b = k2.IntArray1.create_array_with_size(array_size.size2)
-        status = intersection.get_output(fsa_out, arc_map_a, arc_map_b)
-        self.assertTrue(status)
-        self.assertTrue(k2.is_empty(fsa_out))
-        self.assertTrue(arc_map_a.empty())
-        self.assertTrue(arc_map_b.empty())
-
-        # test without arc_map
-        status = intersection.get_output(fsa_out)
-        self.assertTrue(status)
-        self.assertTrue(k2.is_empty(fsa_out))
-
-    def test_case_2(self):
-        s_a = r'''
-        0 1 1
-        1 2 0
-        1 3 1
-        1 4 2
-        2 2 1
-        2 3 1
-        2 3 2
-        3 3 0
-        3 4 1
-        4
-        '''
-
-        fsa_a = k2.str_to_fsa(s_a)
-
-        s_b = r'''
-        0 1 1
-        1 3 1
-        1 2 2
-        2 3 1
-        3
-        '''
-
-        fsa_b = k2.str_to_fsa(s_b)
-        intersection = k2.Intersection(fsa_a, fsa_b)
-        array_size = k2.IntArray2Size()
-        intersection.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        arc_map_a = k2.IntArray1.create_array_with_size(array_size.size2)
-        arc_map_b = k2.IntArray1.create_array_with_size(array_size.size2)
-        status = intersection.get_output(fsa_out, arc_map_a, arc_map_b)
-        self.assertTrue(status)
-        expected_arc_indexes = torch.IntTensor([0, 1, 4, 7, 8, 8, 8, 10, 10])
-        expected_arcs = torch.IntTensor([[0, 1, 1], [1, 2, 0], [1, 3, 1],
-                                         [1, 4, 2], [2, 5, 1], [2, 3, 1],
-                                         [2, 6, 2], [3, 3, 0], [6, 6, 0],
-                                         [6, 7, 1]])
-        expected_arc_map_a = torch.IntTensor([0, 1, 2, 3, 4, 5, 6, 7, 7, 8])
-        expected_arc_map_b = torch.IntTensor([0, -1, 1, 2, 1, 1, 2, -1, -1, 3])
-        self.assertTrue(torch.equal(fsa_out.indexes, expected_arc_indexes))
-        self.assertTrue(torch.equal(fsa_out.data, expected_arcs))
-        self.assertTrue(torch.equal(arc_map_a.data, expected_arc_map_a))
-        self.assertTrue(torch.equal(arc_map_b.data, expected_arc_map_b))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/k2/python/tests/properties_test.py b/k2/python/tests/properties_test.py
deleted file mode 100644
index f6edff5ca..000000000
--- a/k2/python/tests/properties_test.py
+++ /dev/null
@@ -1,330 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-#
-# See ../../../LICENSE for clarification regarding multiple authors
-
-# To run this single test, use
-#
-#  ctest --verbose -R properties_test_py
-#
-
-import unittest
-
-import torch
-
-import k2
-
-
-class TestIsValid(unittest.TestCase):
-
-    def test_bad_case1(self):
-        # fsa should contain at least two states
-        array_size = k2.IntArray2Size(1, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        self.assertFalse(k2.is_valid(fsa))
-
-    def test_bad_case2(self):
-        # only kFinalSymbol arcs enter the final state
-        s = r'''
-        0 1 0
-        0 2 1
-        1 2 0
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.is_valid(fsa))
-
-    def test_bad_case3(self):
-        # `arc_indexes` and `arcs` in this state are not consistent
-        arc_indexes = torch.IntTensor([0, 2, 2, 2])
-        arcs = torch.IntTensor([[0, 1, 0], [0, 2, 1], [1, 2, 0]])
-        fsa = k2.Fsa(arc_indexes, arcs)
-        self.assertFalse(k2.is_valid(fsa))
-
-    def test_good_cases1(self):
-        # empty fsa is valid
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        self.assertTrue(k2.is_valid(fsa))
-
-    def test_good_case2(self):
-        s = r'''
-        0 1 0
-        0 2 0
-        2 3 -1
-        3
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertTrue(k2.is_valid(fsa))
-
-    def test_good_case3(self):
-        s = r'''
-        0 1 0
-        0 2 -1
-        1 2 -1
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertTrue(k2.is_valid(fsa))
-
-
-class TestIsTopSorted(unittest.TestCase):
-
-    def test_bad_cases1(self):
-        s = r'''
-        0 1 0
-        0 2 0
-        2 1 0
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.is_top_sorted(fsa))
-
-    def test_good_cases1(self):
-        # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        self.assertTrue(k2.is_top_sorted(fsa))
-
-    def test_good_case2(self):
-        s = r'''
-        0 1 0
-        0 2 0
-        1 2 0
-        3
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertTrue(k2.is_top_sorted(fsa))
-
-
-class TestIsArcSorted(unittest.TestCase):
-
-    def test_bad_cases1(self):
-        s = r'''
-        0 1 1
-        0 2 2
-        1 2 2
-        1 3 1
-        3
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.is_arc_sorted(fsa))
-
-    def test_bad_cases2(self):
-        # same label on two arcs
-        s = r'''
-        0 2 0
-        0 1 0
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.is_arc_sorted(fsa))
-
-    def test_good_cases1(self):
-        # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        self.assertTrue(k2.is_arc_sorted(fsa))
-
-    def test_good_case2(self):
-        s = r'''
-        0 1 0
-        0 2 0
-        1 2 1
-        1 3 2
-        3
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertTrue(k2.is_arc_sorted(fsa))
-
-
-class TestHasSelfLoops(unittest.TestCase):
-
-    def test_bad_cases1(self):
-        s = r'''
-        0 1 0
-        0 2 0
-        1 2 0
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.has_self_loops(fsa))
-
-    def test_bad_cases2(self):
-        # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        self.assertFalse(k2.has_self_loops(fsa))
-
-    def test_good_case2(self):
-        s = r'''
-        0 1 0
-        1 2 0
-        1 1 0
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertTrue(k2.has_self_loops(fsa))
-
-
-class TestIsDeterministic(unittest.TestCase):
-
-    def test_bad_cases1(self):
-        s = r'''
-        0 1 2
-        1 2 0
-        1 3 0
-        3
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.is_deterministic(fsa))
-
-    def test_good_cases1(self):
-        # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        self.assertTrue(k2.is_deterministic(fsa))
-
-    def test_good_case2(self):
-        s = r'''
-        0 1 2
-        1 2 0
-        1 3 2
-        3
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertTrue(k2.is_deterministic(fsa))
-
-
-class TestIsEpsilonFree(unittest.TestCase):
-
-    def test_bad_cases1(self):
-        s = r'''
-        0 1 2
-        0 2 0
-        1 2 1
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.is_epsilon_free(fsa))
-
-    def test_good_cases1(self):
-        # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        self.assertTrue(k2.is_epsilon_free(fsa))
-
-    def test_good_case2(self):
-        s = r'''
-        0 1 2
-        0 2 1
-        1 2 1
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertTrue(k2.is_epsilon_free(fsa))
-
-
-class TestIsConnected(unittest.TestCase):
-
-    def test_bad_cases1(self):
-        s = r'''
-        0 2 0
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.is_connected(fsa))
-
-    def test_bad_cases2(self):
-        s = r'''
-        0 1 0
-        0 2 0
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.is_connected(fsa))
-
-    def test_good_cases1(self):
-        # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        self.assertTrue(k2.is_connected(fsa))
-
-    def test_good_case2(self):
-        s = r'''
-        0 1 0
-        0 3 0
-        1 2 0
-        2 3 0
-        3
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertTrue(k2.is_connected(fsa))
-
-    def test_good_case3(self):
-        s = r'''
-        0 3 0
-        1 2 0
-        2 3 0
-        2 3 0
-        2 4 0
-        3 1 0
-        4
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertTrue(k2.is_connected(fsa))
-
-
-class TestIsAcyclic(unittest.TestCase):
-
-    def test_bad_cases1(self):
-        s = r'''
-        0 1 2
-        0 4 0
-        0 2 0
-        1 2 1
-        1 3 0
-        2 1 0
-        3
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.is_acyclic(fsa))
-
-    def test_good_cases1(self):
-        # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        self.assertTrue(k2.is_acyclic(fsa))
-
-    def test_good_case2(self):
-        s = r'''
-        0 1 2
-        0 2 1
-        1 2 0
-        1 3 5
-        2 3 6
-        3
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertTrue(k2.is_acyclic(fsa))
-
-
-class TestIsEmpty(unittest.TestCase):
-
-    def test_good_cases1(self):
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        self.assertTrue(k2.is_empty(fsa))
-
-    def test_bad_case1(self):
-        s = r'''
-        0 1 2
-        1
-        '''
-        fsa = k2.str_to_fsa(s)
-        self.assertFalse(k2.is_empty(fsa))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/k2/python/tests/rmepsilon_test.py b/k2/python/tests/rmepsilon_test.py
deleted file mode 100644
index f5b05fde8..000000000
--- a/k2/python/tests/rmepsilon_test.py
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-#
-# See ../../../LICENSE for clarification regarding multiple authors
-
-# To run this single test, use
-#
-#  ctest --verbose -R rmepsilon_test_py
-#
-
-from struct import pack, unpack
-import unittest
-
-import torch
-
-import k2
-
-
-class TestRmEpsilon(unittest.TestCase):
-
-    def setUp(self):
-        s = r'''
-        0 4 1
-        0 1 1
-        1 2 0
-        1 3 0
-        1 4 0
-        2 7 0
-        3 7 0
-        4 6 1
-        4 6 0
-        4 8 1
-        4 9 -1
-        5 9 -1
-        6 9 -1
-        7 9 -1
-        8 9 -1
-        9
-        '''
-        self.fsa = k2.str_to_fsa(s)
-        self.num_states = self.fsa.num_states()
-        weights = torch.FloatTensor(
-            [1, 1, 2, 3, 2, 4, 5, 2, 3, 3, 2, 4, 3, 5, 6])
-        self.weights = k2.FloatArray1(weights)
-
-    def test_max_weight(self):
-        forward_max_weights = k2.DoubleArray1.create_array_with_size(
-            self.num_states)
-        backward_max_weights = k2.DoubleArray1.create_array_with_size(
-            self.num_states)
-        wfsa = k2.WfsaWithFbWeights(self.fsa, self.weights,
-                                    k2.FbWeightType.kMaxWeight,
-                                    forward_max_weights, backward_max_weights)
-        beam = 8.0
-        remover = k2.EpsilonsRemoverMax(wfsa, beam)
-        fsa_size = k2.IntArray2Size()
-        arc_derivs_size = k2.IntArray2Size()
-        remover.get_sizes(fsa_size, arc_derivs_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(fsa_size)
-        arc_derivs = k2.IntArray2.create_array_with_size(arc_derivs_size)
-        arc_weights_out = k2.FloatArray1.create_array_with_size(fsa_size.size2)
-        remover.get_output(fsa_out, arc_weights_out, arc_derivs)
-        self.assertTrue(k2.is_epsilon_free(fsa_out))
-        self.assertEqual(fsa_out.size1, 6)
-        self.assertEqual(fsa_out.size2, 11)
-        self.assertEqual(arc_derivs.size1, 11)
-        self.assertEqual(arc_derivs.size2, 18)
-        self.assertTrue(
-            k2.is_rand_equivalent_max_weight(self.fsa, self.weights, fsa_out,
-                                             arc_weights_out, beam))
-
-    def test_logsum_weight(self):
-        forward_logsum_weights = k2.DoubleArray1.create_array_with_size(
-            self.num_states)
-        backward_logsum_weights = k2.DoubleArray1.create_array_with_size(
-            self.num_states)
-        wfsa = k2.WfsaWithFbWeights(self.fsa, self.weights,
-                                    k2.FbWeightType.kLogSumWeight,
-                                    forward_logsum_weights,
-                                    backward_logsum_weights)
-        beam = 8.0
-        remover = k2.EpsilonsRemoverLogSum(wfsa, beam)
-        fsa_size = k2.IntArray2Size()
-        arc_derivs_size = k2.IntArray2Size()
-        remover.get_sizes(fsa_size, arc_derivs_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(fsa_size)
-        arc_derivs = k2.LogSumArcDerivs.create_arc_derivs_with_size(
-            arc_derivs_size)
-        arc_weights_out = k2.FloatArray1.create_array_with_size(fsa_size.size2)
-        remover.get_output(fsa_out, arc_weights_out, arc_derivs)
-        self.assertTrue(k2.is_epsilon_free(fsa_out))
-        self.assertEqual(fsa_out.size1, 6)
-        self.assertEqual(fsa_out.size2, 11)
-        self.assertEqual(arc_derivs.size1, 11)
-        self.assertEqual(arc_derivs.size2, 20)
-        self.assertTrue(
-            k2.is_rand_equivalent_after_rmeps_pruned_logsum(
-                self.fsa, self.weights, fsa_out, arc_weights_out, beam))
-        # cast float to int
-        arc_ids = k2.StridedIntArray1.from_float_tensor(arc_derivs.data[:, 0])
-        # we may get different value of `arc_ids.get_data(1)`
-        # with different STL implementations as we use
-        # `std::unordered_map` in implementation of rmepsilon,
-        # thus below assertion may fail on some platforms.
-        self.assertEqual(arc_ids.get_data(1), 1)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/k2/python/tests/topsort_test.py b/k2/python/tests/topsort_test.py
deleted file mode 100644
index 69d9a06e3..000000000
--- a/k2/python/tests/topsort_test.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright (c)  2020  Xiaomi Corporation (author: Haowen Qiu)
-#
-# See ../../../LICENSE for clarification regarding multiple authors
-
-# To run this single test, use
-#
-#  ctest --verbose -R topsort_test_py
-#
-
-import unittest
-
-import torch
-
-import k2
-
-
-class TestTopSorter(unittest.TestCase):
-
-    def test_case_1(self):
-        # empty fsa
-        array_size = k2.IntArray2Size(0, 0)
-        fsa = k2.Fsa.create_fsa_with_size(array_size)
-        sorter = k2.TopSorter(fsa)
-        array_size = k2.IntArray2Size()
-        sorter.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        state_map = k2.IntArray1.create_array_with_size(array_size.size1)
-        status = sorter.get_output(fsa_out, state_map)
-        self.assertTrue(status)
-        self.assertTrue(k2.is_empty(fsa_out))
-        self.assertTrue(state_map.empty())
-
-        # test without arc_map
-        sorter.get_output(fsa_out)
-        self.assertTrue(k2.is_empty(fsa_out))
-
-    def test_case_2(self):
-        # non-connected fsa (not co-accessible)
-        s = r'''
-        0 2 -1
-        1 2 -1
-        1 2 0
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        sorter = k2.TopSorter(fsa)
-        array_size = k2.IntArray2Size()
-        sorter.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        state_map = k2.IntArray1.create_array_with_size(array_size.size1)
-        status = sorter.get_output(fsa_out, state_map)
-        self.assertFalse(status)
-        self.assertTrue(k2.is_empty(fsa_out))
-        self.assertTrue(state_map.empty())
-
-    def test_case_3(self):
-        # non-connected fsa (not accessible)
-        s = r'''
-        0 2 -1
-        1 0 1
-        1 2 0
-        2
-        '''
-        fsa = k2.str_to_fsa(s)
-        sorter = k2.TopSorter(fsa)
-        array_size = k2.IntArray2Size()
-        sorter.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        state_map = k2.IntArray1.create_array_with_size(array_size.size1)
-        status = sorter.get_output(fsa_out, state_map)
-        self.assertFalse(status)
-        self.assertTrue(k2.is_empty(fsa_out))
-        self.assertTrue(state_map.empty())
-
-    def test_case_4(self):
-        # connected fsa
-        s = r'''
-        0 4 40
-        0 2 20
-        1 6 -1
-        2 3 30
-        3 6 -1
-        3 1 10
-        4 5 50
-        5 2 8
-        6
-        '''
-        fsa = k2.str_to_fsa(s)
-        sorter = k2.TopSorter(fsa)
-        array_size = k2.IntArray2Size()
-        sorter.get_sizes(array_size)
-        fsa_out = k2.Fsa.create_fsa_with_size(array_size)
-        state_map = k2.IntArray1.create_array_with_size(array_size.size1)
-        status = sorter.get_output(fsa_out, state_map)
-        self.assertTrue(status)
-        expected_arc_indexes = torch.IntTensor([0, 2, 3, 4, 5, 7, 8, 8])
-        expected_arcs = torch.IntTensor([[0, 1, 40], [0, 3, 20], [1, 2, 50],
-                                         [2, 3, 8], [3, 4, 30], [4, 6, -1],
-                                         [4, 5, 10], [5, 6, -1]])
-        expected_state_map = torch.IntTensor([0, 4, 5, 2, 3, 1, 6])
-        self.assertTrue(torch.equal(fsa_out.indexes, expected_arc_indexes))
-        self.assertTrue(torch.equal(fsa_out.data, expected_arcs))
-        self.assertTrue(torch.equal(state_map.data, expected_state_map))
-
-
-if __name__ == '__main__':
-    unittest.main()