Skip to content

Commit

Permalink
Improve speed of transformPointCloud/WithNormals() functions by SSE2/…
Browse files Browse the repository at this point in the history
…AVX intrinsics
  • Loading branch information
taketwo committed Mar 19, 2018
1 parent fc88d45 commit 04ae840
Showing 1 changed file with 194 additions and 76 deletions.
270 changes: 194 additions & 76 deletions common/include/pcl/common/impl/transforms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,179 @@
*
*/

#if defined(__SSE2__)
#include <xmmintrin.h>
#endif

#if defined(__AVX__)
#include <immintrin.h>
#endif

namespace pcl
{

namespace detail
{

/** A helper struct to apply an SO3 or SE3 transform to a 3D point.
* Supports single and double precision transform matrices. */
template<typename Scalar>
struct Transformer
{
const Eigen::Matrix<Scalar, 4, 4>& tf;

/** Construct a transformer object.
* The transform matrix is captured by const reference. Make sure that it does not go out of scope before this
* object does. */
Transformer (const Eigen::Matrix<Scalar, 4, 4>& transform) : tf (transform) { };

/** Apply SO3 transform (top-left corner of the transform matrix).
* \param[in] src input 3D point (pointer to 3 floats)
* \param[out] tgt output 3D point (pointer to 4 floats), can be the same as input. The fourth element is set to 0. */
void so3 (const float* src, float* tgt) const
{
const Scalar p[3] = { src[0], src[1], src[2] }; // need this when src == tgt
tgt[0] = static_cast<float> (tf (0, 0) * p[0] + tf (0, 1) * p[1] + tf (0, 2) * p[2]);
tgt[1] = static_cast<float> (tf (1, 0) * p[0] + tf (1, 1) * p[1] + tf (1, 2) * p[2]);
tgt[2] = static_cast<float> (tf (2, 0) * p[0] + tf (2, 1) * p[1] + tf (2, 2) * p[2]);
tgt[3] = 0;
}

/** Apply SE3 transform.
* \param[in] src input 3D point (pointer to 3 floats)
* \param[out] tgt output 3D point (pointer to 4 floats), can be the same as input. The fourth element is set to 1. */
void se3 (const float* src, float* tgt) const
{
const Scalar p[3] = { src[0], src[1], src[2] }; // need this when src == tgt
tgt[0] = static_cast<float> (tf (0, 0) * p[0] + tf (0, 1) * p[1] + tf (0, 2) * p[2] + tf (0, 3));
tgt[1] = static_cast<float> (tf (1, 0) * p[0] + tf (1, 1) * p[1] + tf (1, 2) * p[2] + tf (1, 3));
tgt[2] = static_cast<float> (tf (2, 0) * p[0] + tf (2, 1) * p[1] + tf (2, 2) * p[2] + tf (2, 3));
tgt[3] = 1;
}
};

#if defined(__SSE2__)

/** Optimized version for single-precision transforms using SSE2 intrinsics. */
template<>
struct Transformer<float>
{
/// Columns of the transform matrix stored in XMM registers.
__m128 c[4];

Transformer(const Eigen::Matrix4f& tf)
{
for (size_t i = 0; i < 4; ++i)
c[i] = _mm_load_ps (tf.col (i).data ());
}

void so3 (const float* src, float* tgt) const
{
__m128 p0 = _mm_mul_ps (_mm_load_ps1 (&src[0]), c[0]);
__m128 p1 = _mm_mul_ps (_mm_load_ps1 (&src[1]), c[1]);
__m128 p2 = _mm_mul_ps (_mm_load_ps1 (&src[2]), c[2]);
_mm_store_ps (tgt, _mm_add_ps(p0, _mm_add_ps(p1, p2)));
}

void se3 (const float* src, float* tgt) const
{
__m128 p0 = _mm_mul_ps (_mm_load_ps1 (&src[0]), c[0]);
__m128 p1 = _mm_mul_ps (_mm_load_ps1 (&src[1]), c[1]);
__m128 p2 = _mm_mul_ps (_mm_load_ps1 (&src[2]), c[2]);
_mm_store_ps (tgt, _mm_add_ps(p0, _mm_add_ps(p1, _mm_add_ps(p2, c[3]))));
}
};

#if not defined(__AVX__)

/** Optimized version for double-precision transform using SSE2 intrinsics. */
template<>
struct Transformer<double>
{
/// Columns of the transform matrix stored in XMM registers.
__m128d c[4][2];

Transformer(const Eigen::Matrix4d& tf)
{
for (size_t i = 0; i < 4; ++i)
{
c[i][0] = _mm_load_pd (tf.col (i).data () + 0);
c[i][1] = _mm_load_pd (tf.col (i).data () + 2);
}
}

void so3 (const float* src, float* tgt) const
{
__m128d xx = _mm_cvtps_pd (_mm_load_ps1 (&src[0]));
__m128d p0 = _mm_mul_pd (xx, c[0][0]);
__m128d p1 = _mm_mul_pd (xx, c[0][1]);

for (size_t i = 1; i < 3; ++i)
{
__m128d vv = _mm_cvtps_pd (_mm_load_ps1 (&src[i]));
p0 = _mm_add_pd (_mm_mul_pd (vv, c[i][0]), p0);
p1 = _mm_add_pd (_mm_mul_pd (vv, c[i][1]), p1);
}

_mm_store_ps (tgt, _mm_movelh_ps (_mm_cvtpd_ps (p0), _mm_cvtpd_ps (p1)));
}

void se3 (const float* src, float* tgt) const
{
__m128d p0 = c[3][0];
__m128d p1 = c[3][1];

for (size_t i = 0; i < 3; ++i)
{
__m128d vv = _mm_cvtps_pd (_mm_load_ps1 (&src[i]));
p0 = _mm_add_pd (_mm_mul_pd (vv, c[i][0]), p0);
p1 = _mm_add_pd (_mm_mul_pd (vv, c[i][1]), p1);
}

_mm_store_ps (tgt, _mm_movelh_ps (_mm_cvtpd_ps (p0), _mm_cvtpd_ps (p1)));
}

};

#else

/** Optimized version for double-precision transform using AVX intrinsics. */
template<>
struct Transformer<double>
{
__m256d c[4];

Transformer(const Eigen::Matrix4d& tf)
{
for (size_t i = 0; i < 4; ++i)
c[i] = _mm256_load_pd (tf.col (i).data ());
}

void so3 (const float* src, float* tgt) const
{
__m256d p0 = _mm256_mul_pd (_mm256_cvtps_pd (_mm_load_ps1 (&src[0])), c[0]);
__m256d p1 = _mm256_mul_pd (_mm256_cvtps_pd (_mm_load_ps1 (&src[1])), c[1]);
__m256d p2 = _mm256_mul_pd (_mm256_cvtps_pd (_mm_load_ps1 (&src[2])), c[2]);
_mm_store_ps (tgt, _mm256_cvtpd_ps (_mm256_add_pd(p0, _mm256_add_pd(p1, p2))));
}

void se3 (const float* src, float* tgt) const
{
__m256d p0 = _mm256_mul_pd (_mm256_cvtps_pd (_mm_load_ps1 (&src[0])), c[0]);
__m256d p1 = _mm256_mul_pd (_mm256_cvtps_pd (_mm_load_ps1 (&src[1])), c[1]);
__m256d p2 = _mm256_mul_pd (_mm256_cvtps_pd (_mm_load_ps1 (&src[2])), c[2]);
_mm_store_ps (tgt, _mm256_cvtpd_ps (_mm256_add_pd(p0, _mm256_add_pd(p1, _mm256_add_pd(p2, c[3])))));
}

};

#endif
#endif

}

}

///////////////////////////////////////////////////////////////////////////////////////////
template <typename PointT, typename Scalar> void
pcl::transformPointCloud (const pcl::PointCloud<PointT> &cloud_in,
Expand All @@ -59,17 +232,12 @@ pcl::transformPointCloud (const pcl::PointCloud<PointT> &cloud_in,
cloud_out.sensor_origin_ = cloud_in.sensor_origin_;
}

pcl::detail::Transformer<Scalar> tf (transform.matrix ());
if (cloud_in.is_dense)
{
// If the dataset is dense, simply transform it!
for (size_t i = 0; i < cloud_out.points.size (); ++i)
{
//cloud_out.points[i].getVector3fMap () = transform * cloud_in.points[i].getVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> pt (cloud_in[i].x, cloud_in[i].y, cloud_in[i].z);
cloud_out[i].x = static_cast<float> (transform (0, 0) * pt.coeffRef (0) + transform (0, 1) * pt.coeffRef (1) + transform (0, 2) * pt.coeffRef (2) + transform (0, 3));
cloud_out[i].y = static_cast<float> (transform (1, 0) * pt.coeffRef (0) + transform (1, 1) * pt.coeffRef (1) + transform (1, 2) * pt.coeffRef (2) + transform (1, 3));
cloud_out[i].z = static_cast<float> (transform (2, 0) * pt.coeffRef (0) + transform (2, 1) * pt.coeffRef (1) + transform (2, 2) * pt.coeffRef (2) + transform (2, 3));
}
tf.se3 (cloud_in[i].data, cloud_out[i].data);
}
else
{
Expand All @@ -81,11 +249,7 @@ pcl::transformPointCloud (const pcl::PointCloud<PointT> &cloud_in,
!pcl_isfinite (cloud_in.points[i].y) ||
!pcl_isfinite (cloud_in.points[i].z))
continue;
//cloud_out.points[i].getVector3fMap () = transform * cloud_in.points[i].getVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> pt (cloud_in[i].x, cloud_in[i].y, cloud_in[i].z);
cloud_out[i].x = static_cast<float> (transform (0, 0) * pt.coeffRef (0) + transform (0, 1) * pt.coeffRef (1) + transform (0, 2) * pt.coeffRef (2) + transform (0, 3));
cloud_out[i].y = static_cast<float> (transform (1, 0) * pt.coeffRef (0) + transform (1, 1) * pt.coeffRef (1) + transform (1, 2) * pt.coeffRef (2) + transform (1, 3));
cloud_out[i].z = static_cast<float> (transform (2, 0) * pt.coeffRef (0) + transform (2, 1) * pt.coeffRef (1) + transform (2, 2) * pt.coeffRef (2) + transform (2, 3));
tf.se3 (cloud_in[i].data, cloud_out[i].data);
}
}
}
Expand All @@ -108,6 +272,7 @@ pcl::transformPointCloud (const pcl::PointCloud<PointT> &cloud_in,
cloud_out.sensor_orientation_ = cloud_in.sensor_orientation_;
cloud_out.sensor_origin_ = cloud_in.sensor_origin_;

pcl::detail::Transformer<Scalar> tf (transform.matrix ());
if (cloud_in.is_dense)
{
// If the dataset is dense, simply transform it!
Expand All @@ -116,11 +281,7 @@ pcl::transformPointCloud (const pcl::PointCloud<PointT> &cloud_in,
// Copy fields first, then transform xyz data
if (copy_all_fields)
cloud_out.points[i] = cloud_in.points[indices[i]];
//cloud_out.points[i].getVector3fMap () = transform*cloud_out.points[i].getVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> pt (cloud_in[indices[i]].x, cloud_in[indices[i]].y, cloud_in[indices[i]].z);
cloud_out[i].x = static_cast<float> (transform (0, 0) * pt.coeffRef (0) + transform (0, 1) * pt.coeffRef (1) + transform (0, 2) * pt.coeffRef (2) + transform (0, 3));
cloud_out[i].y = static_cast<float> (transform (1, 0) * pt.coeffRef (0) + transform (1, 1) * pt.coeffRef (1) + transform (1, 2) * pt.coeffRef (2) + transform (1, 3));
cloud_out[i].z = static_cast<float> (transform (2, 0) * pt.coeffRef (0) + transform (2, 1) * pt.coeffRef (1) + transform (2, 2) * pt.coeffRef (2) + transform (2, 3));
tf.se3 (cloud_in[indices[i]].data, cloud_out[i].data);
}
}
else
Expand All @@ -135,11 +296,7 @@ pcl::transformPointCloud (const pcl::PointCloud<PointT> &cloud_in,
!pcl_isfinite (cloud_in.points[indices[i]].y) ||
!pcl_isfinite (cloud_in.points[indices[i]].z))
continue;
//cloud_out.points[i].getVector3fMap () = transform*cloud_out.points[i].getVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> pt (cloud_in[indices[i]].x, cloud_in[indices[i]].y, cloud_in[indices[i]].z);
cloud_out[i].x = static_cast<float> (transform (0, 0) * pt.coeffRef (0) + transform (0, 1) * pt.coeffRef (1) + transform (0, 2) * pt.coeffRef (2) + transform (0, 3));
cloud_out[i].y = static_cast<float> (transform (1, 0) * pt.coeffRef (0) + transform (1, 1) * pt.coeffRef (1) + transform (1, 2) * pt.coeffRef (2) + transform (1, 3));
cloud_out[i].z = static_cast<float> (transform (2, 0) * pt.coeffRef (0) + transform (2, 1) * pt.coeffRef (1) + transform (2, 2) * pt.coeffRef (2) + transform (2, 3));
tf.se3 (cloud_in[indices[i]].data, cloud_out[i].data);
}
}
}
Expand Down Expand Up @@ -167,23 +324,14 @@ pcl::transformPointCloudWithNormals (const pcl::PointCloud<PointT> &cloud_in,
cloud_out.sensor_origin_ = cloud_in.sensor_origin_;
}

pcl::detail::Transformer<Scalar> tf (transform.matrix ());
// If the data is dense, we don't need to check for NaN
if (cloud_in.is_dense)
{
for (size_t i = 0; i < cloud_out.points.size (); ++i)
{
//cloud_out.points[i].getVector3fMap() = transform * cloud_in.points[i].getVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> pt (cloud_in[i].x, cloud_in[i].y, cloud_in[i].z);
cloud_out[i].x = static_cast<float> (transform (0, 0) * pt.coeffRef (0) + transform (0, 1) * pt.coeffRef (1) + transform (0, 2) * pt.coeffRef (2) + transform (0, 3));
cloud_out[i].y = static_cast<float> (transform (1, 0) * pt.coeffRef (0) + transform (1, 1) * pt.coeffRef (1) + transform (1, 2) * pt.coeffRef (2) + transform (1, 3));
cloud_out[i].z = static_cast<float> (transform (2, 0) * pt.coeffRef (0) + transform (2, 1) * pt.coeffRef (1) + transform (2, 2) * pt.coeffRef (2) + transform (2, 3));

// Rotate normals (WARNING: transform.rotation () uses SVD internally!)
//cloud_out.points[i].getNormalVector3fMap() = transform.rotation () * cloud_in.points[i].getNormalVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> nt (cloud_in[i].normal_x, cloud_in[i].normal_y, cloud_in[i].normal_z);
cloud_out[i].normal_x = static_cast<float> (transform (0, 0) * nt.coeffRef (0) + transform (0, 1) * nt.coeffRef (1) + transform (0, 2) * nt.coeffRef (2));
cloud_out[i].normal_y = static_cast<float> (transform (1, 0) * nt.coeffRef (0) + transform (1, 1) * nt.coeffRef (1) + transform (1, 2) * nt.coeffRef (2));
cloud_out[i].normal_z = static_cast<float> (transform (2, 0) * nt.coeffRef (0) + transform (2, 1) * nt.coeffRef (1) + transform (2, 2) * nt.coeffRef (2));
tf.se3 (cloud_in[i].data, cloud_out[i].data);
tf.so3 (cloud_in[i].data_n, cloud_out[i].data_n);
}
}
// Dataset might contain NaNs and Infs, so check for them first.
Expand All @@ -195,19 +343,8 @@ pcl::transformPointCloudWithNormals (const pcl::PointCloud<PointT> &cloud_in,
!pcl_isfinite (cloud_in.points[i].y) ||
!pcl_isfinite (cloud_in.points[i].z))
continue;

//cloud_out.points[i].getVector3fMap() = transform * cloud_in.points[i].getVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> pt (cloud_in[i].x, cloud_in[i].y, cloud_in[i].z);
cloud_out[i].x = static_cast<float> (transform (0, 0) * pt.coeffRef (0) + transform (0, 1) * pt.coeffRef (1) + transform (0, 2) * pt.coeffRef (2) + transform (0, 3));
cloud_out[i].y = static_cast<float> (transform (1, 0) * pt.coeffRef (0) + transform (1, 1) * pt.coeffRef (1) + transform (1, 2) * pt.coeffRef (2) + transform (1, 3));
cloud_out[i].z = static_cast<float> (transform (2, 0) * pt.coeffRef (0) + transform (2, 1) * pt.coeffRef (1) + transform (2, 2) * pt.coeffRef (2) + transform (2, 3));

// Rotate normals
//cloud_out.points[i].getNormalVector3fMap() = transform.rotation () * cloud_in.points[i].getNormalVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> nt (cloud_in[i].normal_x, cloud_in[i].normal_y, cloud_in[i].normal_z);
cloud_out[i].normal_x = static_cast<float> (transform (0, 0) * nt.coeffRef (0) + transform (0, 1) * nt.coeffRef (1) + transform (0, 2) * nt.coeffRef (2));
cloud_out[i].normal_y = static_cast<float> (transform (1, 0) * nt.coeffRef (0) + transform (1, 1) * nt.coeffRef (1) + transform (1, 2) * nt.coeffRef (2));
cloud_out[i].normal_z = static_cast<float> (transform (2, 0) * nt.coeffRef (0) + transform (2, 1) * nt.coeffRef (1) + transform (2, 2) * nt.coeffRef (2));
tf.se3 (cloud_in[i].data, cloud_out[i].data);
tf.so3 (cloud_in[i].data_n, cloud_out[i].data_n);
}
}
}
Expand All @@ -230,6 +367,7 @@ pcl::transformPointCloudWithNormals (const pcl::PointCloud<PointT> &cloud_in,
cloud_out.sensor_orientation_ = cloud_in.sensor_orientation_;
cloud_out.sensor_origin_ = cloud_in.sensor_origin_;

pcl::detail::Transformer<Scalar> tf (transform.matrix ());
// If the data is dense, we don't need to check for NaN
if (cloud_in.is_dense)
{
Expand All @@ -238,18 +376,8 @@ pcl::transformPointCloudWithNormals (const pcl::PointCloud<PointT> &cloud_in,
// Copy fields first, then transform
if (copy_all_fields)
cloud_out.points[i] = cloud_in.points[indices[i]];
//cloud_out.points[i].getVector3fMap() = transform * cloud_in.points[i].getVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> pt (cloud_in[indices[i]].x, cloud_in[indices[i]].y, cloud_in[indices[i]].z);
cloud_out[i].x = static_cast<float> (transform (0, 0) * pt.coeffRef (0) + transform (0, 1) * pt.coeffRef (1) + transform (0, 2) * pt.coeffRef (2) + transform (0, 3));
cloud_out[i].y = static_cast<float> (transform (1, 0) * pt.coeffRef (0) + transform (1, 1) * pt.coeffRef (1) + transform (1, 2) * pt.coeffRef (2) + transform (1, 3));
cloud_out[i].z = static_cast<float> (transform (2, 0) * pt.coeffRef (0) + transform (2, 1) * pt.coeffRef (1) + transform (2, 2) * pt.coeffRef (2) + transform (2, 3));

// Rotate normals
//cloud_out.points[i].getNormalVector3fMap() = transform.rotation () * cloud_in.points[i].getNormalVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> nt (cloud_in[indices[i]].normal_x, cloud_in[indices[i]].normal_y, cloud_in[indices[i]].normal_z);
cloud_out[i].normal_x = static_cast<float> (transform (0, 0) * nt.coeffRef (0) + transform (0, 1) * nt.coeffRef (1) + transform (0, 2) * nt.coeffRef (2));
cloud_out[i].normal_y = static_cast<float> (transform (1, 0) * nt.coeffRef (0) + transform (1, 1) * nt.coeffRef (1) + transform (1, 2) * nt.coeffRef (2));
cloud_out[i].normal_z = static_cast<float> (transform (2, 0) * nt.coeffRef (0) + transform (2, 1) * nt.coeffRef (1) + transform (2, 2) * nt.coeffRef (2));
tf.se3 (cloud_in[indices[i]].data, cloud_out[i].data);
tf.so3 (cloud_in[indices[i]].data_n, cloud_out[i].data_n);
}
}
// Dataset might contain NaNs and Infs, so check for them first.
Expand All @@ -266,18 +394,8 @@ pcl::transformPointCloudWithNormals (const pcl::PointCloud<PointT> &cloud_in,
!pcl_isfinite (cloud_in.points[indices[i]].z))
continue;

//cloud_out.points[i].getVector3fMap() = transform * cloud_in.points[i].getVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> pt (cloud_in[indices[i]].x, cloud_in[indices[i]].y, cloud_in[indices[i]].z);
cloud_out[i].x = static_cast<float> (transform (0, 0) * pt.coeffRef (0) + transform (0, 1) * pt.coeffRef (1) + transform (0, 2) * pt.coeffRef (2) + transform (0, 3));
cloud_out[i].y = static_cast<float> (transform (1, 0) * pt.coeffRef (0) + transform (1, 1) * pt.coeffRef (1) + transform (1, 2) * pt.coeffRef (2) + transform (1, 3));
cloud_out[i].z = static_cast<float> (transform (2, 0) * pt.coeffRef (0) + transform (2, 1) * pt.coeffRef (1) + transform (2, 2) * pt.coeffRef (2) + transform (2, 3));

// Rotate normals
//cloud_out.points[i].getNormalVector3fMap() = transform.rotation () * cloud_in.points[i].getNormalVector3fMap ();
Eigen::Matrix<Scalar, 3, 1> nt (cloud_in[indices[i]].normal_x, cloud_in[indices[i]].normal_y, cloud_in[indices[i]].normal_z);
cloud_out[i].normal_x = static_cast<float> (transform (0, 0) * nt.coeffRef (0) + transform (0, 1) * nt.coeffRef (1) + transform (0, 2) * nt.coeffRef (2));
cloud_out[i].normal_y = static_cast<float> (transform (1, 0) * nt.coeffRef (0) + transform (1, 1) * nt.coeffRef (1) + transform (1, 2) * nt.coeffRef (2));
cloud_out[i].normal_z = static_cast<float> (transform (2, 0) * nt.coeffRef (0) + transform (2, 1) * nt.coeffRef (1) + transform (2, 2) * nt.coeffRef (2));
tf.se3 (cloud_in[indices[i]].data, cloud_out[i].data);
tf.so3 (cloud_in[indices[i]].data_n, cloud_out[i].data_n);
}
}
}
Expand Down Expand Up @@ -316,8 +434,8 @@ pcl::transformPoint (const PointT &point,
const Eigen::Transform<Scalar, 3, Eigen::Affine> &transform)
{
PointT ret = point;
ret.getVector3fMap () = transform * point.getVector3fMap ();

pcl::detail::Transformer<Scalar> tf (transform.matrix ());
tf.se3 (point.data, ret.data);
return (ret);
}

Expand All @@ -327,9 +445,9 @@ pcl::transformPointWithNormal (const PointT &point,
const Eigen::Transform<Scalar, 3, Eigen::Affine> &transform)
{
PointT ret = point;
ret.getVector3fMap () = transform * point.getVector3fMap ();
ret.getNormalVector3fMap () = transform.rotation () * point.getNormalVector3fMap ();

pcl::detail::Transformer<Scalar> tf (transform.matrix ());
tf.se3 (point.data, ret.data);
tf.so3 (point.data_n, ret.data_n);
return (ret);
}

Expand Down

0 comments on commit 04ae840

Please sign in to comment.