KTH-RPL · Kin-Zhang · Aug 11, 2025 · Aug 11, 2025 · Aug 11, 2025 · Aug 16, 2025
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@ It is also an official implementation of the following papers (sorted by the tim
 - **DeltaFlow: An Efficient Multi-frame Scene Flow Estimation Method**   
 *Qingwen Zhang, Xiaomeng Zhu, Yushan Zhang, Yixi Cai, Olov Andersson, Patric Jensfelt*  
 Conference on Neural Information Processing Systems (**NeurIPS**) 2025 - Spotlight   
-[ Backbone ] [ Supervised ] - [ [arXiv](https://arxiv.org/abs/2508.17054) ] [ [Project](https://github.com/Kin-Zhang/DeltaFlow) ]
+[ Backbone ] [ Supervised ] - [ [arXiv](https://arxiv.org/abs/2508.17054) ] [ [Project](https://github.com/Kin-Zhang/DeltaFlow) ] &rarr; [here](#deltaflow)
 
 - **HiMo: High-Speed Objects Motion Compensation in Point Clouds** (SeFlow++)   
 *Qingwen Zhang, Ajinkya Khoche, Yi Yang, Li Ling, Sina Sharif Mansouri, Olov Andersson, Patric Jensfelt*  
@@ -132,6 +132,18 @@ conda activate opensf
 
 ### Supervised Training
 
+#### DeltaFlow
+
+Train DeltaFlow with the leaderboard submit config. [Runtime: Around 18 hours in 10x RTX 3080 GPUs.]
+
+```bash
+# The total batch size is 10 x 2 = 20 under the training setup above.
+python train.py model=deltaFlow optimizer.lr=2e-3 epochs=20 batch_size=2 num_frames=5 loss_fn=deflowLoss "voxel_size=[0.15, 0.15, 0.15]" "point_cloud_range=[-38.4, -38.4, -3.2, 38.4, 38.4, 3.2]" +optimizer.scheduler.name=WarmupCosLR +optimizer.scheduler.max_lr=2e-3 +optimizer.scheduler.total_steps=20000
+
+# Pretrained weight can be downloaded through:
+wget https://huggingface.co/kin-zhang/OpenSceneFlow/resolve/main/flow4d_best.ckpt
+```
+
 #### Flow4D
 
 Train Flow4D with the leaderboard submit config. [Runtime: Around 18 hours in 4x RTX 3090 GPUs.]
@@ -234,7 +246,7 @@ Since in training, we save all hyper-parameters and model checkpoints, the only
 python eval.py checkpoint=/home/kin/seflow_best.ckpt data_mode=val
 
 # (optimization-based): it might need take really long time, maybe tmux for run it.
-python eval.py model=nsfp
+python eval.py model=nsfp +master_port=12344 # use a different port for each runner if you want to run several at once.
 
 # it will output the av2_submit.zip or av2_submit_v2.zip for you to submit to leaderboard
 python eval.py checkpoint=/home/kin/seflow_best.ckpt data_mode=test leaderboard_version=1

diff --git a/assets/cuda/README.md b/assets/cuda/README.md
@@ -5,6 +5,8 @@ Faster our code in CUDA.
 
 - chamfer3D: 3D chamfer distance within two point cloud, by Qingwen Zhang involved when she was working on SeFlow.
 - mmcv: directly from mmcv, not our code.
+- mmdet: Python-only files; no compilation needed.
+- histlib: from Yancong's [ICP-Flow](https://github.com/yanconglin/ICP-Flow) project.
 
 ---
 

diff --git a/assets/cuda/histlib/__init__.py b/assets/cuda/histlib/__init__.py
@@ -0,0 +1,71 @@
+from torch import nn
+from torch.autograd import Function
+import torch
+import importlib
+
+import os, time
+import hist
+
+def histf(X, Y, min_x, min_y, min_z, max_x, max_y, max_z, len_x, len_y, len_z, mini_batch=8):
+    # print('hist cuda params: ', X.shape, Y.shape,
+    #       min_x, min_y, min_z,
+    #       max_x, max_y, max_z,
+    #       len_x, len_y, len_z,
+    #       )
+    histogram = hist.hist(X.contiguous(), Y.contiguous(), 
+                          min_x, min_y, min_z,
+                          max_x, max_y, max_z, 
+                          len_x, len_y, len_z, 
+                          mini_batch
+                          )
+    return histogram
+
+
+torch.manual_seed(2022)  # NOTE(review): runs at import time, so importing this module reseeds torch's global RNG - confirm that is intended outside run_test
+
+########################
+def run_test():
+    pts = torch.randn(3, 1000, 3)
+    indicators = torch.randint(0, 2, size=(3, 1000, 1))
+    pts1 = torch.cat([pts, indicators], dim=-1)
+    pts2  = pts1.clone()
+    pts2[:, :,0] += 5.
+    pts2[:, :,1] += -3.
+    pts2[:, :,2] += -0.2
+
+    range_x = 10.
+    range_y = 10.
+    range_z = 0.5
+    thres =0.1
+    # bins_x = torch.linspace(-range_x, range_x, int(2*range_x/thres)+1)
+    # bins_y = torch.linspace(-range_y, range_y, int(2*range_y/thres)+1)
+    # bins_z = torch.linspace(-range_z, range_z, int(2*range_z/thres)+1)
+    bins_x = torch.arange(-range_x, range_x+thres, thres)
+    bins_y = torch.arange(-range_y, range_y+thres, thres)
+    bins_z = torch.arange(-range_z, range_z+thres, thres)
+    print('bins_x: ', bins_x)
+    print('bins_z: ', bins_z)
+    pts1 = pts1.cuda()
+    pts2 = pts2.cuda()
+    bins_x = bins_x.cuda()
+    bins_y = bins_y.cuda()
+    bins_z = bins_z.cuda()
+
+    t_hists = histf(pts1, pts2, 
+               -range_x, -range_y, -range_z,
+               range_x, range_y, range_z,
+               len(bins_x), len(bins_y), len(bins_z),
+               )
+    print('output shape: ', t_hists.shape)
+    b, h, w, d = t_hists.shape
+    for t_hist in t_hists:
+        t_argmax = torch.argmax(t_hist)
+        print(f't_argmax: {t_argmax}, {t_hist.max()} {h}, {w}, {d}, {t_argmax//d//w%h}, {t_argmax//d%w}, {t_argmax%d}')
+        print('t_argmax', t_argmax//d//w%h, t_argmax//d%w, t_argmax%d, bins_x[t_argmax//d//w%h], bins_y[t_argmax//d%w], bins_z[t_argmax%d])
+
+if __name__ == '__main__':  # script entry point: report the environment, then run the smoke test
+
+    print("Pytorch version: ", torch.__version__)
+    print("GPU version: ", torch.cuda.get_device_name())  # prints the device name of the current GPU
+
+    run_test()
diff --git a/assets/cuda/histlib/hist.cu b/assets/cuda/histlib/hist.cu
@@ -0,0 +1,90 @@
+#include <vector>
+#include "hist_cuda_core.cuh"
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// #include <THC/THC.h>
+// #include <THC/THCAtomics.cuh>
+// #include <THC/THCDeviceUtils.cuh>
+
+// extern THCState *state;
+
+// author: Charles Shang
+// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
+
+
+// Host entry point of the CUDA histogram.
+//
+// Validates the inputs, allocates a zero-initialised histogram of shape
+// (batch, len_x, len_y, len_z) with the same options as X, and fills it by
+// calling hist_cuda_core on chunks of at most `mini_batch` batch elements.
+//
+//   X, Y        contiguous CUDA tensors of shape (batch, num_X, 4) and
+//               (batch, num_Y, 4); channels 0-2 are (x, y, z), channel 3 is
+//               the "padded or not" indicator.
+//   min_*/max_* histogram range per axis.
+//   len_*       number of bins per axis (must be positive).
+//   mini_batch  number of batch elements handled per kernel launch (> 0).
+//
+// Raises a c10::Error (RuntimeError in Python) when a precondition fails.
+at::Tensor
+hist_cuda(const at::Tensor &X, const at::Tensor &Y,
+        const float min_x, const float min_y, const float min_z,
+        const float max_x, const float max_y, const float max_z,
+        const int len_x, const int len_y, const int len_z,
+        const int mini_batch
+            )
+{
+    // TORCH_CHECK concatenates its message arguments; the previous printf-style
+    // "%d" placeholders were never substituted.
+    TORCH_CHECK(X.is_contiguous() && Y.is_contiguous(), "input tensor has to be contiguous");
+    TORCH_CHECK(X.is_cuda() && Y.is_cuda(), "input must be a CUDA tensor");
+    TORCH_CHECK(X.dim() == 3 && Y.dim() == 3,
+                "inputs must be 3-D, got X.dim() = ", X.dim(), ", Y.dim() = ", Y.dim());
+    TORCH_CHECK(X.scalar_type() == Y.scalar_type(), "X and Y must have the same dtype");
+
+    const int batch = X.size(0);
+    const int num_X = X.size(1);
+    const int dim = X.size(2);
+    const int num_Y = Y.size(1);
+
+    TORCH_CHECK(X.size(0) == Y.size(0), "batch_X (", X.size(0), ") != batch_Y (", Y.size(0), ").");
+    TORCH_CHECK(X.size(2) == Y.size(2), "dim_X (", X.size(2), ") != dim_Y (", Y.size(2), ").");
+    TORCH_CHECK(dim == 4, "dim (", dim, ") != 4; 3 for (x, y, z); 1 for indicator,padded or not.");
+    // An empty histogram would make every in-range pair write out of bounds.
+    TORCH_CHECK(len_x > 0 && len_y > 0 && len_z > 0,
+                "bin counts must be positive, got (", len_x, ", ", len_y, ", ", len_z, ").");
+    // mini_batch is used as the chunk size below; zero or negative is meaningless.
+    TORCH_CHECK(mini_batch > 0, "mini_batch must be positive, got ", mini_batch, ".");
+
+    auto bins = at::zeros({batch, len_x, len_y, len_z}, X.options());
+
+    // Per-batch-element strides in elements, kept in 64 bits so that the
+    // pointer offsets below cannot overflow for large inputs.
+    const int64_t stride_X = static_cast<int64_t>(num_X) * dim;
+    const int64_t stride_Y = static_cast<int64_t>(num_Y) * dim;
+    const int64_t stride_bins = static_cast<int64_t>(len_x) * len_y * len_z;
+
+    for (int64_t start = 0; start < batch; start += mini_batch)
+    {
+        // The last chunk may hold fewer than mini_batch elements.
+        const int64_t remaining = batch - start;
+        const int chunk = static_cast<int>(remaining < mini_batch ? remaining : mini_batch);
+
+        AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "hist_cuda_core", ([&] {
+            hist_cuda_core(at::cuda::getCurrentCUDAStream(),
+                           X.data_ptr<scalar_t>() + start * stride_X,
+                           Y.data_ptr<scalar_t>() + start * stride_Y,
+                           chunk, dim, num_X, num_Y,
+                           min_x, min_y, min_z,
+                           max_x, max_y, max_z,
+                           len_x, len_y, len_z,
+                           bins.data_ptr<scalar_t>() + start * stride_bins);
+            }));
+    }
+
+    return bins;
+}
diff --git a/assets/cuda/histlib/hist.h b/assets/cuda/histlib/hist.h
@@ -0,0 +1,13 @@
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor  // returns the histogram tensor; defined in hist_cuda.cpp
+hist(const at::Tensor &X, const at::Tensor &Y,  // the two input tensors
+    const float min_x, const float min_y, const float min_z,  // lower bound per axis
+    const float max_x, const float max_y, const float max_z,  // upper bound per axis
+    const int len_x, const int len_y, const int len_z,  // bin count per axis
+    const int mini_batch  // forwarded unchanged to hist_cuda
+    );
+
+
+
diff --git a/assets/cuda/histlib/hist_cuda.cpp b/assets/cuda/histlib/hist_cuda.cpp
@@ -0,0 +1,27 @@
+#include "hist.h"
+#include "hist_cuda.h"
+
+// Device dispatcher bound to Python: forwards to hist_cuda when both inputs
+// are CUDA tensors and raises otherwise, because no CPU path is implemented.
+at::Tensor
+hist(const at::Tensor &X, const at::Tensor &Y,
+        const float min_x, const float min_y, const float min_z,
+        const float max_x, const float max_y, const float max_z,
+        const int len_x, const int len_y, const int len_z,
+        const int mini_batch
+        )
+{
+    const bool both_on_gpu = X.type().is_cuda() && Y.type().is_cuda();
+    if (!both_on_gpu)
+    {
+        AT_ERROR("Not implemented on the CPU");
+    }
+
+    return hist_cuda(X, Y,
+                     min_x, min_y, min_z,
+                     max_x, max_y, max_z,
+                     len_x, len_y, len_z,
+                     mini_batch);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings of the extension module
+    m.def("hist", &hist, "hist");  // exposes the hist() dispatcher above under the name "hist"
+}
diff --git a/assets/cuda/histlib/hist_cuda.h b/assets/cuda/histlib/hist_cuda.h
@@ -0,0 +1,10 @@
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor  // returns the histogram tensor; defined in hist.cu
+hist_cuda(const at::Tensor &X, const at::Tensor &Y,  // the two input tensors (checked to be CUDA and contiguous)
+        const float min_x, const float min_y, const float min_z,  // lower bound per axis
+        const float max_x, const float max_y, const float max_z,  // upper bound per axis
+        const int len_x, const int len_y, const int len_z,  // bin count per axis
+        const int mini_batch  // batch elements processed per kernel launch
+        );
diff --git a/assets/cuda/histlib/hist_cuda_core.cuh b/assets/cuda/histlib/hist_cuda_core.cuh
@@ -0,0 +1,100 @@
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+// #include <THC/THC.h>
+#include <THC/THCAtomics.cuh>
+// #include <THC/THCDeviceUtils.cuh>
+
+#define CUDA_KERNEL_LOOP(i, n)                          \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
+      i < (n);                                          \
+      i += blockDim.x * gridDim.x)  // grid-stride loop over [0, n) using a 32-bit index
+
+const int CUDA_NUM_THREADS = 1024;  // threads per block used by the launch in hist_cuda_core
+inline int GET_BLOCKS(const int N)  // number of blocks covering N work items (ceil division)
+{
+  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+}
+
+// Histogram kernel: one work item per (batch b, point i of X, point j of Y).
+//
+// For every pair whose indicator channel (index 3) is positive in both points,
+// the offset X[b, i, :3] - Y[b, j, :3] is binned into
+// bins[b, p_x, p_y, p_z] when it lies inside [min, max) on all three axes.
+//
+//   n            total number of work items (batch * num_X * num_Y).
+//   X, Y         row-major (batch, num_X, dim) and (batch, num_Y, dim) buffers.
+//   min_*/max_*  histogram range per axis; left included, right excluded.
+//   len_*        number of bins per axis.
+//   bins         row-major (batch, len_x, len_y, len_z) output, incremented
+//                atomically.
+template <typename scalar_t>
+__global__ void hist_cuda_kernel(const int n,
+                                              const scalar_t* X,
+                                              const scalar_t* Y,
+                                              const int batch, const int dim,
+                                              const int num_X, const int num_Y,
+                                              const float min_x, const float min_y, const float min_z,
+                                              const float max_x, const float max_y, const float max_z,
+                                              const int len_x, const int len_y, const int len_z,
+                                              scalar_t* bins
+                                              )
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Decompose the flat work-item index into (b, i, j); j varies fastest.
+    const int b = index / num_X / num_Y % batch;
+    const int i = index / num_Y % num_X;
+    const int j = index % num_Y;
+
+    const scalar_t* px = X + b*num_X*dim + i*dim;
+    const scalar_t* py = Y + b*num_Y*dim + j*dim;
+
+    // Channel 3 is the indicator; pairs with a non-positive flag are skipped.
+    if (px[3] > 0.0 && py[3] > 0.0)
+    {
+      const scalar_t val_x = px[0] - py[0];
+      const scalar_t val_y = px[1] - py[1];
+      const scalar_t val_z = px[2] - py[2];
+      if (val_x >= min_x && val_x < max_x && val_y >= min_y && val_y < max_y && val_z >= min_z && val_z < max_z)
+      {
+        // [): left included; right excluded.
+        int p_x = __float2int_rd( (val_x-min_x) / (max_x-min_x) * __int2float_rd(len_x));
+        int p_y = __float2int_rd( (val_y-min_y) / (max_y-min_y) * __int2float_rd(len_y));
+        int p_z = __float2int_rd( (val_z-min_z) / (max_z-min_z) * __int2float_rd(len_z));
+
+        // For a value just below max, floating-point rounding can make the
+        // normalised coordinate exactly 1.0, which yields an index equal to
+        // len. Clamp so the write below always stays inside this batch
+        // element's histogram.
+        if (p_x >= len_x) { p_x = len_x - 1; }
+        if (p_y >= len_y) { p_y = len_y - 1; }
+        if (p_z >= len_z) { p_z = len_z - 1; }
+
+        const int bin_id = b*len_x*len_y*len_z + p_x*len_y*len_z + p_y*len_z + p_z;
+        atomicAdd(bins + bin_id, static_cast<scalar_t>(1));
+      }
+    }
+  }
+}
+
+// Launches hist_cuda_kernel on `stream` with one work item per
+// (batch, point of X, point of Y) triple.
+//
+//   X, Y   device buffers of batch*num_X*dim and batch*num_Y*dim elements.
+//   bins   device buffer of batch*len_x*len_y*len_z elements, updated in place.
+//
+// Returns immediately when there is no work. Raises a c10::Error when the
+// work-item count does not fit the kernel's 32-bit index or when the launch
+// fails, instead of only printing and handing back a wrong histogram.
+template <typename scalar_t>
+void hist_cuda_core(cudaStream_t stream,
+                              const scalar_t* X, const scalar_t* Y,
+                              const int batch, const int dim,
+                              const int num_X, const int num_Y,
+                              const float min_x, const float min_y, const float min_z,
+                              const float max_x, const float max_y, const float max_z,
+                              const int len_x, const int len_y, const int len_z,
+                              scalar_t* bins
+                              )
+{
+  // Count the work items in 64 bits so the product itself cannot overflow.
+  const int64_t total = static_cast<int64_t>(batch) * num_X * num_Y;
+  if (total <= 0)
+  {
+    // Nothing to bin; a zero-block launch would be an invalid configuration.
+    return;
+  }
+
+  // CUDA_KERNEL_LOOP advances a 32-bit index by the grid size (< total + 1024)
+  // before comparing it with the bound, so 2 * total + 1024 must fit in an int.
+  const int64_t max_total = (static_cast<int64_t>(1) << 30) - CUDA_NUM_THREADS;
+  TORCH_CHECK(total <= max_total,
+              "hist_cuda_core: batch * num_X * num_Y = ", total,
+              " exceeds the supported limit of ", max_total,
+              "; use a smaller mini_batch.");
+  const int num_kernels = static_cast<int>(total);
+
+  hist_cuda_kernel<scalar_t>
+      <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
+      num_kernels,
+      X, Y,
+      batch, dim,
+      num_X, num_Y,
+      min_x, min_y, min_z,
+      max_x, max_y, max_z,
+      len_x, len_y, len_z,
+      bins
+      );
+
+  // Surface launch failures to the caller.
+  const cudaError_t err = cudaGetLastError();
+  TORCH_CHECK(err == cudaSuccess, "error in hist_cuda_core: ", cudaGetErrorString(err));
+}
+
diff --git a/assets/cuda/histlib/setup.py b/assets/cuda/histlib/setup.py
@@ -0,0 +1,15 @@
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+setup(
+    name='hist',
+    ext_modules=[
+        CUDAExtension('hist', [
+            "/".join(__file__.split('/')[:-1] + ['hist_cuda.cpp']), # must named as xxx_cuda.cpp
+            "/".join(__file__.split('/')[:-1] + ['hist.cu']),
+        ]),
+    ],
+    cmdclass={
+        'build_ext': BuildExtension
+    },
+    version='1.0.1')
diff --git a/conf/config.yaml b/conf/config.yaml
@@ -28,8 +28,8 @@ gradient_clip_val: 5.0
 # optimizer ==> Adam
 optimizer:
   name: Adam # [Adam, AdamW]
-  lr: 1e-4
-loss_fn: seflowLoss # choices: [ff3dLoss, zeroflowLoss, deflowLoss, seflowLoss]
+  lr: 2e-4
+loss_fn: deflowLoss # choices: [ff3dLoss, zeroflowLoss, deflowLoss, seflowLoss]
 # add_seloss: {chamfer_dis: 1.0, static_flow_loss: 1.0, dynamic_chamfer_dis: 1.0, cluster_based_pc0pc1: 1.0}
 # ssl_label: