From 52a829dad24f5e7431c2245c2fa0daa73d774904 Mon Sep 17 00:00:00 2001 From: Samuel Audet Date: Thu, 31 Oct 2024 16:58:36 +0900 Subject: [PATCH] * Upgrade presets for MKL 2025.0, LLVM 19.1.3, nvCOMP 4.1.0.6, PyTorch 2.5.1, Triton Inference Server 2.51.0 --- .github/actions/deploy-ubuntu/action.yml | 14 +- .github/actions/deploy-windows/action.yml | 18 +- .github/workflows/tritonserver.yml | 2 +- CHANGELOG.md | 2 +- README.md | 6 +- cuda/README.md | 2 +- .../java/org/bytedeco/cuda/global/cublas.java | 2 +- .../org/bytedeco/cuda/global/cusolver.java | 4 +- .../java/org/bytedeco/cuda/global/nvcomp.java | 755 ++++++++++++++---- .../bytedeco/cuda/nvcomp/BitcompManager.java | 16 +- .../cuda/nvcomp/DeflateFormatSpecHeader.java | 1 + .../nvcomp/nvcompAlignmentRequirements_t.java | 45 ++ ...s.java => nvcompBatchedBitcompOpts_t.java} | 24 +- .../nvcomp/nvcompBatchedCascadedOpts_t.java | 3 +- .../nvcomp/nvcompBatchedDeflateOpts_t.java | 1 + .../cuda/nvcomp/nvcompBatchedGzipOpts_t.java | 38 + .../cuda/nvcomp/nvcompBatchedLZ4Opts_t.java | 5 + .../cuda/nvcomp/nvcompCascadedFormatOpts.java | 52 -- .../cuda/nvcomp/nvcompLZ4FormatOpts.java | 49 -- .../org/bytedeco/ffmpeg/global/postproc.java | 2 +- gsl/README.md | 2 +- gsl/samples/pom.xml | 2 +- leptonica/cppbuild.sh | 2 +- llvm/README.md | 4 +- llvm/cppbuild.sh | 2 +- llvm/platform/pom.xml | 2 +- llvm/pom.xml | 3 +- llvm/samples/clang/pom.xml | 2 +- llvm/samples/llvm/pom.xml | 2 +- llvm/samples/polly/pom.xml | 6 +- .../llvm/LLVM/LLVMOpaqueDbgRecord.java | 23 - mkl/README.md | 6 +- mkl/platform/pom.xml | 2 +- mkl/platform/redist/pom.xml | 2 +- mkl/pom.xml | 2 +- mkl/samples/pom.xml | 4 +- .../java/org/bytedeco/mkl/global/mkl_rt.java | 217 ++++- numpy/README.md | 2 +- numpy/samples/pom.xml | 2 +- opencv/README.md | 2 +- opencv/samples/pom.xml | 2 +- platform/pom.xml | 8 +- pytorch/README.md | 8 +- pytorch/cppbuild.sh | 2 +- pytorch/platform/gpu/pom.xml | 2 +- pytorch/platform/pom.xml | 2 +- pytorch/pom.xml | 2 +- pytorch/samples/pom.xml | 6 +- scipy/README.md | 2 +- scipy/samples/pom.xml | 2 +- tritonserver/README.md | 10 +- tritonserver/cppbuild.sh | 4 +- tritonserver/platform/pom.xml | 2 +- tritonserver/platform/redist/pom.xml | 2 +- tritonserver/pom.xml | 2 +- tritonserver/samples/simple/pom.xml | 2 +- tritonserver/samples/simplecpp/pom.xml | 2 +- tritonserver/samples/unsupported/pom.xml | 2 +- tvm/README.md | 2 +- tvm/cppbuild.sh | 2 +- tvm/platform/gpu/pom.xml | 2 +- tvm/platform/pom.xml | 2 +- tvm/pom.xml | 4 +- tvm/samples/pom.xml | 2 +- 64 files changed, 992 insertions(+), 414 deletions(-) create mode 100644 cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompAlignmentRequirements_t.java rename cuda/src/gen/java/org/bytedeco/cuda/nvcomp/{nvcompBatchedBitcompFormatOpts.java => nvcompBatchedBitcompOpts_t.java} (64%) create mode 100644 cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedGzipOpts_t.java delete mode 100644 cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompCascadedFormatOpts.java delete mode 100644 cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompLZ4FormatOpts.java delete mode 100644 llvm/src/gen/java/org/bytedeco/llvm/LLVM/LLVMOpaqueDbgRecord.java diff --git a/.github/actions/deploy-ubuntu/action.yml b/.github/actions/deploy-ubuntu/action.yml index 14b944e8f23..58334141cb3 100644 --- a/.github/actions/deploy-ubuntu/action.yml +++ b/.github/actions/deploy-ubuntu/action.yml @@ -44,7 +44,7 @@ runs: export CUDA=cuda-repo-rhel8-12-6-local-12.6.2_560.35.03-1.aarch64.rpm export CUDNN=cuda-12-9.5.1.17-1.aarch64 export 
NCCL=2.23.4-1+cuda12.6.aarch64 - export NVCOMP=nvcomp-linux-sbsa-4.0.1-cuda12.x + export NVCOMP=nvcomp-linux-sbsa-4.1.0.6_cuda12-archive export USERLAND_BUILDME="buildme --aarch64" elif [[ "$CI_DEPLOY_PLATFORM" == "linux-ppc64le" ]]; then export ARCH=ppc64el @@ -66,7 +66,7 @@ runs: export CUDA=cuda-repo-rhel8-12-6-local-12.6.2_560.35.03-1.x86_64.rpm export CUDNN=cuda-12-9.5.1.17-1.x86_64 export NCCL=2.23.4-1+cuda12.6.x86_64 - export NVCOMP=nvcomp-linux-x86_64-4.0.1-cuda12.x + export NVCOMP=nvcomp-linux-x86_64-4.1.0.6_cuda12-archive fi echo "ARCH=$ARCH" >> $GITHUB_ENV echo "PREFIX=$PREFIX" >> $GITHUB_ENV @@ -183,9 +183,9 @@ runs: for f in /usr/local/cuda/lib64/libcudnn*so.9.*; do $SUDO ln -sf $f ${f:0:${#f}-4}; $SUDO ln -sf $f ${f:0:${#f}-6}; done if [[ -n ${NVCOMP:-} ]]; then - curl -LO https://developer.download.nvidia.com/compute/nvcomp/4.0.1/local_installers/$NVCOMP.tar.gz - $SUDO tar -xvf $NVCOMP.tar.gz -C /usr/local/cuda/lib64/ --strip-components=1 lib/ || $SUDO tar -xvf $NVCOMP.tar.gz -C /usr/local/cuda/lib64/ --strip-components=2 nvcomp/lib/ - $SUDO tar -xvf $NVCOMP.tar.gz -C /usr/local/cuda/include/ --strip-components=1 include/ || $SUDO tar -xvf $NVCOMP.tar.gz -C /usr/local/cuda/include/ --strip-components=2 nvcomp/include/ + curl -LO https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/linux-$ARCH_CUDA/$NVCOMP.tar.xz + $SUDO tar -xvf $NVCOMP.tar.xz -C /usr/local/cuda/lib64/ --strip-components=2 */lib/ + $SUDO tar -xvf $NVCOMP.tar.xz -C /usr/local/cuda/include/ --strip-components=2 */include/ rm -f $NVCOMP.tar.gz fi @@ -228,8 +228,8 @@ runs: if [[ "$CI_DEPLOY_MODULE" == "mkl" ]]; then echo Installing MKL - curl -LO https://registrationcenter-download.intel.com/akdlm/IRC_NAS/89a381f6-f85d-4dda-ae62-30d51470f53c/l_onemkl_p_2024.2.2.17_offline.sh - $SUDO bash l_onemkl_p_2024.2.2.17_offline.sh -s -a -s --eula accept + curl -LO https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh + $SUDO bash intel-onemkl-2025.0.0.940_offline.sh -s -a -s --eula accept export MAVEN_OPTIONS="-Djavacpp.platform.compiler=clang++" fi diff --git a/.github/actions/deploy-windows/action.yml b/.github/actions/deploy-windows/action.yml index 1869894efa2..f795b0b0175 100644 --- a/.github/actions/deploy-windows/action.yml +++ b/.github/actions/deploy-windows/action.yml @@ -102,7 +102,7 @@ runs: curl -LO https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.94_windows.exe curl -LO https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.5.1.17_cuda12-archive.zip curl -LO http://www.winimage.com/zLibDll/zlib123dllx64.zip - curl -LO https://developer.download.nvidia.com/compute/nvcomp/4.0.1/local_installers/nvcomp-windows-x86_64-4.0.1-cuda12.x.zip + curl -LO https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/windows-x86_64/nvcomp-windows-x86_64-4.1.0.6_cuda12-archive.zip cuda_11.8.0_522.06_windows.exe -s bash -c "rm -Rf 'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8'" bash -c "mv 'C:/Program Files/NVIDIA Corporation/NvToolsExt' 'C:/Program Files/NVIDIA Corporation/NvToolsExt_old'" @@ -111,17 +111,15 @@ runs: bash -c "ls 'C:/Program Files/NVIDIA Corporation/NvToolsExt'" unzip cudnn-windows-x86_64-9.5.1.17_cuda12-archive.zip unzip zlib123dllx64.zip - unzip nvcomp-windows-x86_64-4.0.1-cuda12.x.zip + unzip nvcomp-windows-x86_64-4.1.0.6_cuda12-archive.zip move cudnn-windows-x86_64-9.5.1.17_cuda12-archive\bin\*.dll 
"%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin" move cudnn-windows-x86_64-9.5.1.17_cuda12-archive\include\*.h "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include" move cudnn-windows-x86_64-9.5.1.17_cuda12-archive\lib\x64\*.lib "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\lib\x64" move dll_x64\zlibwapi.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin" - move nvcomp\include\* "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include" - move nvcomp\include\device "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include" - move nvcomp\include\native "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include" - move nvcomp\include\nvcomp "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include" - move nvcomp\lib\nvcomp*.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin" - move nvcomp\lib\nvcomp*.lib "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\lib\x64" + move nvcomp-windows-x86_64-4.1.0.6_cuda12-archive\include\* "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include" + move nvcomp-windows-x86_64-4.1.0.6_cuda12-archive\include\nvcomp "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\include" + move nvcomp-windows-x86_64-4.1.0.6_cuda12-archive\bin\nvcomp*.dll "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin" + move nvcomp-windows-x86_64-4.1.0.6_cuda12-archive\bin\nvcomp*.lib "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.6\lib\x64" rem echo Applying hotfix to Visual Studio 2019 for CUDA rem curl -LO https://raw.githubusercontent.com/microsoft/STL/main/stl/inc/cmath @@ -151,8 +149,8 @@ runs: if "%CI_DEPLOY_MODULE%"=="mkl" ( echo Installing MKL - curl -LO https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9fe96489-78fe-4fea-8cc2-2ddf7de0246a/w_onemkl_p_2024.2.2.16_offline.exe - w_onemkl_p_2024.2.2.16_offline.exe -s -a -s --eula accept + curl -LO https://registrationcenter-download.intel.com/akdlm/IRC_NAS/e0a45889-f395-47d6-811d-0f3d8caae4a0/intel-onemkl-2025.0.0.929_offline.exe + intel-onemkl-2025.0.0.929_offline.exe -s -a -s --eula accept ) if "%CI_DEPLOY_PLATFORM%"=="windows-x86" if "%CI_DEPLOY_MODULE%"=="flycapture" ( diff --git a/.github/workflows/tritonserver.yml b/.github/workflows/tritonserver.yml index 9c1cfa0c286..6dbe2418a02 100644 --- a/.github/workflows/tritonserver.yml +++ b/.github/workflows/tritonserver.yml @@ -19,6 +19,6 @@ env: jobs: linux-x86_64: runs-on: ubuntu-20.04 - container: nvcr.io/nvidia/tritonserver:24.09-tf2-python-py3 + container: nvcr.io/nvidia/tritonserver:24.10-tf2-python-py3 steps: - uses: bytedeco/javacpp-presets/.github/actions/deploy-ubuntu@actions diff --git a/CHANGELOG.md b/CHANGELOG.md index 78f7f723aa6..e70e3dc4004 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ * Build FFmpeg with zimg to enable zscale filter ([pull #1481](https://github.com/bytedeco/javacpp-presets/pull/1481)) * Enable PulseAudio support for FFmpeg on Linux ([pull #1472](https://github.com/bytedeco/javacpp-presets/pull/1472)) * Virtualize `btCollisionWorld`, `btOverlapFilterCallback`, `btOverlapCallback` from Bullet Physics SDK ([pull #1475](https://github.com/bytedeco/javacpp-presets/pull/1475)) - * Upgrade presets for OpenCV 4.10.0, FFmpeg 7.1, Spinnaker 4.0.0.116 ([pull #1524](https://github.com/bytedeco/javacpp-presets/pull/1524)), MKL 2024.2, DNNL 3.6, OpenBLAS 0.3.28, CMINPACK 1.3.11, GSL 2.8, CPython 3.13.0, NumPy 2.1.2, SciPy 1.14.1, LLVM 19.1.2, LibRaw 0.21.2 ([pull 
#1520](https://github.com/bytedeco/javacpp-presets/pull/1520)), Leptonica 1.85.0, Tesseract 5.4.1, libffi 3.4.6, CUDA 12.6.2, cuDNN 9.5.1, NCCL 2.23.4, nvCOMP 4.0.1, OpenCL 3.0.16, NVIDIA Video Codec SDK 12.2.72, PyTorch 2.5.0 ([pull #1466](https://github.com/bytedeco/javacpp-presets/pull/1466)), SentencePiece 0.2.0, TensorFlow Lite 2.18.0, TensorRT 10.5.0.18, Triton Inference Server 2.50.0, ONNX 1.17.0, ONNX Runtime 1.19.2, TVM 0.18.0, and their dependencies + * Upgrade presets for OpenCV 4.10.0, FFmpeg 7.1, Spinnaker 4.0.0.116 ([pull #1524](https://github.com/bytedeco/javacpp-presets/pull/1524)), MKL 2025.0, DNNL 3.6, OpenBLAS 0.3.28, CMINPACK 1.3.11, GSL 2.8, CPython 3.13.0, NumPy 2.1.2, SciPy 1.14.1, LLVM 19.1.3, LibRaw 0.21.2 ([pull #1520](https://github.com/bytedeco/javacpp-presets/pull/1520)), Leptonica 1.85.0, Tesseract 5.4.1, libffi 3.4.6, CUDA 12.6.2, cuDNN 9.5.1, NCCL 2.23.4, nvCOMP 4.1.0.6, OpenCL 3.0.16, NVIDIA Video Codec SDK 12.2.72, PyTorch 2.5.1 ([pull #1466](https://github.com/bytedeco/javacpp-presets/pull/1466)), SentencePiece 0.2.0, TensorFlow Lite 2.18.0, TensorRT 10.5.0.18, Triton Inference Server 2.51.0, ONNX 1.17.0, ONNX Runtime 1.19.2, TVM 0.18.0, and their dependencies ### January 29, 2024 version 1.5.10 * Introduce `macosx-arm64` builds for PyTorch ([pull #1463](https://github.com/bytedeco/javacpp-presets/pull/1463)) diff --git a/README.md b/README.md index 8f477741dc8..f6cc649e5f9 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ Each child module in turn relies by default on the included [`cppbuild.sh` scrip * HDF5 1.14.x https://www.hdfgroup.org/downloads/ * Hyperscan 5.4.x https://github.com/intel/hyperscan * LZ4 1.9.x https://github.com/lz4/lz4 - * MKL 2024.x https://software.intel.com/mkl + * MKL 2025.x https://software.intel.com/mkl * MKL-DNN 0.21.x https://github.com/oneapi-src/oneDNN * DNNL 3.6.x https://github.com/oneapi-src/oneDNN * OpenBLAS 0.3.28 http://www.openblas.net/ @@ -219,7 +219,7 @@ Each child module in turn relies by default on the included [`cppbuild.sh` scrip * CUDA 12.6.x https://developer.nvidia.com/cuda-downloads * cuDNN 9.5.x https://developer.nvidia.com/cudnn * NCCL 2.23.x https://developer.nvidia.com/nccl - * nvCOMP 4.0.x https://developer.nvidia.com/nvcomp + * nvCOMP 4.1.x https://developer.nvidia.com/nvcomp * NVIDIA Video Codec SDK 12.2.x https://developer.nvidia.com/nvidia-video-codec-sdk * OpenCL 3.0.x https://github.com/KhronosGroup/OpenCL-ICD-Loader * MXNet 1.9.x https://github.com/apache/incubator-mxnet @@ -228,7 +228,7 @@ Each child module in turn relies by default on the included [`cppbuild.sh` scrip * TensorFlow 1.15.x https://github.com/tensorflow/tensorflow * TensorFlow Lite 2.18.x https://github.com/tensorflow/tensorflow * TensorRT 10.5.x https://developer.nvidia.com/tensorrt - * Triton Inference Server 2.50.x https://developer.nvidia.com/nvidia-triton-inference-server + * Triton Inference Server 2.51.x https://developer.nvidia.com/nvidia-triton-inference-server * The Arcade Learning Environment 0.8.x https://github.com/mgbellemare/Arcade-Learning-Environment * DepthAI 2.24.x https://github.com/luxonis/depthai-core * ONNX 1.17.x https://github.com/onnx/onnx diff --git a/cuda/README.md b/cuda/README.md index 598d815f7b7..cedef43f833 100644 --- a/cuda/README.md +++ b/cuda/README.md @@ -28,7 +28,7 @@ This directory contains the JavaCPP Presets module for: * CUDA 12.6.2 https://developer.nvidia.com/cuda-zone * cuDNN 9.5.1 https://developer.nvidia.com/cudnn * NCCL 2.23.4 https://developer.nvidia.com/nccl - * nvCOMP 
4.0.1 https://developer.nvidia.com/nvcomp + * nvCOMP 4.1.0.6 https://developer.nvidia.com/nvcomp Please refer to the parent README.md file for more detailed information about the JavaCPP Presets. diff --git a/cuda/src/gen/java/org/bytedeco/cuda/global/cublas.java b/cuda/src/gen/java/org/bytedeco/cuda/global/cublas.java index 8faa8693ef9..9210cc96b78 100644 --- a/cuda/src/gen/java/org/bytedeco/cuda/global/cublas.java +++ b/cuda/src/gen/java/org/bytedeco/cuda/global/cublas.java @@ -15038,7 +15038,7 @@ public static native void cublasZtrmm(@Cast("char") byte side, // #include "driver_types.h" // #include "cuComplex.h" /* import complex data type */ -// #include "cublas_v2.h" +// #include "cublas_api.h" // #if defined(__cplusplus) // Targeting ../cublas/cublasXtContext.java diff --git a/cuda/src/gen/java/org/bytedeco/cuda/global/cusolver.java b/cuda/src/gen/java/org/bytedeco/cuda/global/cusolver.java index a6b20b65e17..d118d4f25dc 100644 --- a/cuda/src/gen/java/org/bytedeco/cuda/global/cusolver.java +++ b/cuda/src/gen/java/org/bytedeco/cuda/global/cusolver.java @@ -370,7 +370,7 @@ public class cusolver extends org.bytedeco.cuda.presets.cusolver { // #include // #include "cuComplex.h" /* import complex data type */ -// #include "cublas_v2.h" +// #include "cublas_api.h" // #include "cusolver_common.h" /*******************************************************************************/ @@ -14391,7 +14391,7 @@ public class cusolver extends org.bytedeco.cuda.presets.cusolver { // #define CUSOLVERSP_H_ // #include "cusparse.h" -// #include "cublas_v2.h" +// #include "cublas_api.h" // #include "cusolver_common.h" // #if defined(__cplusplus) diff --git a/cuda/src/gen/java/org/bytedeco/cuda/global/nvcomp.java b/cuda/src/gen/java/org/bytedeco/cuda/global/nvcomp.java index 5a486259ac9..c59cbb5f301 100644 --- a/cuda/src/gen/java/org/bytedeco/cuda/global/nvcomp.java +++ b/cuda/src/gen/java/org/bytedeco/cuda/global/nvcomp.java @@ -31,6 +31,8 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { // #pragma once +// #include + /** enum nvcompStatus_t */ public static final int nvcompSuccess = 0, @@ -45,6 +47,9 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { nvcompErrorChunkSizeTooLarge = 18, nvcompErrorCudaError = 1000, nvcompErrorInternal = 10000; +// Targeting ../nvcomp/nvcompAlignmentRequirements_t.java + + // Parsed from @@ -291,6 +296,7 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { // #include "cascaded.hpp" // #include "zstd.hpp" // #include "deflate.hpp" +// #include "gzip.hpp" // #include // #include @@ -371,7 +377,7 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** enum nvcompANSDataType_t */ public static final int uint8 = 0, - float16 = 1; // requires uncomp chunk size to be multiple of 2 + float16 = 1; // Targeting ../nvcomp/nvcompBatchedANSOpts_t.java @@ -381,12 +387,25 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @MemberGetter public static native @Cast("const size_t") long nvcompANSCompressionMaxAllowedChunkSize(); /** - * This is the minimum alignment required for void type CUDA memory buffers - * passed to compression or decompression functions. Typed memory buffers must - * still be aligned to their type's size, e.g. 8 bytes for size_t. + * The most restrictive of minimum alignment requirements for void-type CUDA memory buffers + * used for input, output, or temporary memory, passed to compression or decompression functions. 
+ * In all cases, typed memory buffers must still be aligned to their type's size, e.g., 4 bytes for {@code int}. */ @MemberGetter public static native @Cast("const size_t") long nvcompANSRequiredAlignment(); +/** + * \brief Get the minimum buffer alignment requirements for compression. + * + * @param format_opts [in] Compression options. + * @param alignment_requirements [out] The minimum buffer alignment requirements + * for compression. + * + * @return nvcompSuccess if successful, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedANSCompressGetRequiredAlignments( + @ByVal nvcompBatchedANSOpts_t format_opts, + nvcompAlignmentRequirements_t alignment_requirements); + /** * \brief Get the amount of temporary memory required on the GPU for compression. * @@ -446,13 +465,16 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous compression. * - * The caller is responsible for passing device_compressed_chunk_bytes of size - * sufficient to hold compressed data + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. * * @param device_uncompressed_chunk_ptrs [in] Array with size \p num_chunks of pointers * to the uncompressed data chunks. Both the pointers and the uncompressed data * should reside in device-accessible memory. - * Each pointer must be aligned to an 8-byte boundary. + * Each chunk must be aligned to the value in the {@code input} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedANSCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_uncompressed_chunk_bytes [in] Array with size \p num_chunks of * sizes of the uncompressed chunks in bytes. * The sizes should reside in device-accessible memory. @@ -460,6 +482,10 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * @param num_chunks [in] Number of chunks of data to compress. * @param device_temp_ptr [in] The temporary GPU workspace, could be NULL in case * temporary memory is not needed. + * Must be aligned to the value in the {@code temp} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedANSCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param temp_bytes [in] The size of the temporary GPU memory pointed to by * {@code device_temp_ptr}. * @param device_compressed_chunk_ptrs [out] Array with size \p num_chunks of pointers @@ -467,7 +493,10 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * buffers should reside in device-accessible memory. Each compressed buffer * should be preallocated with the size given by * {@code nvcompBatchedANSCompressGetMaxOutputChunkSize}. - * Each pointer must be aligned to an 8-byte boundary. + * Each compressed buffer must be aligned to the value in the {@code output} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedANSCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_compressed_chunk_bytes [out] Array with size \p num_chunks, * to be filled with the compressed sizes of each chunk. * The buffer should be preallocated in device-accessible memory. @@ -499,6 +528,11 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @ByVal nvcompBatchedANSOpts_t format_opts, CUstream_st stream); +/** + * Minimum buffer alignment requirements for decompression. 
+ */ +@MemberGetter public static native @Const @ByRef nvcompAlignmentRequirements_t nvcompBatchedANSDecompressRequiredAlignments(); + /** * \brief Get the amount of temporary memory required on the GPU for decompression. * @@ -535,15 +569,20 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * \brief Asynchronously compute the number of bytes of uncompressed data for * each compressed chunk. * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of * pointers in device-accessible memory to compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedANSDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes * of the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_chunk_bytes [out] Array with size \p num_chunks * to be filled with the sizes, in bytes, of each uncompressed data chunk. * If there is an error when retrieving the size of a chunk, the * uncompressed size of that chunk will be set to 0. This argument needs to - * be prealloated in device-accessible memory. + * be preallocated in device-accessible memory. * @param num_chunks [in] Number of data chunks to compute sizes of. * @param stream [in] The CUDA stream to operate on. * @@ -565,13 +604,16 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous decompression. * - * NOTE: This function is used to decompress compressed buffers produced by + * This function is used to decompress compressed buffers produced by * {@code nvcompBatchedANSCompressAsync}. * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of pointers - * in device-accessible memory to compressed buffers. Each compressed buffer - * should reside in device-accessible memory and start at a location with - * 8-byte alignment. + * in device-accessible memory to device-accessible compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedANSDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes of * the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_buffer_bytes [in] Array with size \p num_chunks of sizes, @@ -585,12 +627,14 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * This argument needs to be preallocated. * @param num_chunks [in] Number of chunks of data to decompress. * @param device_temp_ptr [in] The temporary GPU space, could be NULL in case temporary space is not needed. + * Must be aligned to the value in {@code nvcompBatchedANSDecompressRequiredAlignments.temp}. * @param temp_bytes [in] The size of the temporary GPU space. * @param device_uncompressed_chunk_ptrs [out] Array with size \p num_chunks of * pointers in device-accessible memory to decompressed data. Each uncompressed * buffer needs to be preallocated in device-accessible memory, have the size - * specified by the corresponding entry in device_uncompressed_buffer_bytes, - * and start at a location with 8-byte alignment. 
+ * specified by the corresponding entry in \p device_uncompressed_buffer_bytes, + * and be aligned to the value in + * {@code nvcompBatchedANSDecompressRequiredAlignments.output}. * @param device_statuses [out] Array with size \p num_chunks of statuses in * device-accessible memory. This argument needs to be preallocated. For each * chunk, if the decompression is successful, the status will be set to @@ -704,21 +748,36 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { // #include // #ifdef __cplusplus -// Targeting ../nvcomp/nvcompBatchedBitcompFormatOpts.java +// Targeting ../nvcomp/nvcompBatchedBitcompOpts_t.java + +/** Legacy alias for \ref nvcompBatchedBitcompOpts_t. */ -@MemberGetter public static native @Const @ByRef nvcompBatchedBitcompFormatOpts nvcompBatchedBitcompDefaultOpts(); +@MemberGetter public static native @Const @ByRef nvcompBatchedBitcompOpts_t nvcompBatchedBitcompDefaultOpts(); @MemberGetter public static native @Cast("const size_t") long nvcompBitcompCompressionMaxAllowedChunkSize(); /** - * This is the minimum alignment required for void type CUDA memory buffers - * passed to compression or decompression functions. Typed memory buffers must - * still be aligned to their type's size, e.g. 8 bytes for size_t. + * The most restrictive of minimum alignment requirements for void-type CUDA memory buffers + * used for input, output, or temporary memory, passed to compression or decompression functions. + * In all cases, typed memory buffers must still be aligned to their type's size, e.g., 4 bytes for {@code int}. */ @MemberGetter public static native @Cast("const size_t") long nvcompBitcompRequiredAlignment(); +/** + * \brief Get the minimum buffer alignment requirements for compression. + * + * @param format_opts [in] Compression options. + * @param alignment_requirements [out] The minimum buffer alignment requirements + * for compression. + * + * @return nvcompSuccess if successful, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedBitcompCompressGetRequiredAlignments( + @ByVal nvcompBatchedBitcompOpts_t format_opts, + nvcompAlignmentRequirements_t alignment_requirements); + /** * \brief Get the amount of temporary memory required on the GPU for compression. * @@ -726,7 +785,8 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * * @param num_chunks [in] The number of chunks of memory in the batch. * @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk in the - * batch. + * batch. This parameter is currently unused. Set it to either the actual value + * or zero. * @param format_opts [in] Compression options. * @param temp_bytes [out] The amount of GPU memory that will be temporarily * required during compression. @@ -736,7 +796,7 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { public static native @Cast("nvcompStatus_t") int nvcompBatchedBitcompCompressGetTempSize( @Cast("size_t") long num_chunks, @Cast("size_t") long max_uncompressed_chunk_bytes, - @ByVal nvcompBatchedBitcompFormatOpts format_opts, + @ByVal nvcompBatchedBitcompOpts_t format_opts, @Cast("size_t*") SizeTPointer temp_bytes); /** @@ -746,8 +806,9 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * NOTE: Bitcomp currently doesn't use any temp memory. * * @param num_chunks [in] The number of chunks of memory in the batch. - * @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk in the - * batch. 
+ * @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk + * in the batch. This parameter is currently unused. Set it to either + * the actual value or zero. * @param format_opts [in] Compression options. * @param temp_bytes [out] The amount of GPU memory that will be temporarily * required during compression. @@ -759,7 +820,7 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { public static native @Cast("nvcompStatus_t") int nvcompBatchedBitcompCompressGetTempSizeEx( @Cast("size_t") long num_chunks, @Cast("size_t") long max_uncompressed_chunk_bytes, - @ByVal nvcompBatchedBitcompFormatOpts format_opts, + @ByVal nvcompBatchedBitcompOpts_t format_opts, @Cast("size_t*") SizeTPointer temp_bytes, @Cast("const size_t") long max_total_uncompressed_bytes); @@ -776,24 +837,30 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { */ public static native @Cast("nvcompStatus_t") int nvcompBatchedBitcompCompressGetMaxOutputChunkSize( @Cast("size_t") long max_uncompressed_chunk_bytes, - @ByVal nvcompBatchedBitcompFormatOpts format_opts, + @ByVal nvcompBatchedBitcompOpts_t format_opts, @Cast("size_t*") SizeTPointer max_compressed_chunk_bytes); /** * \brief Perform batched asynchronous compression. * - * NOTE: The maximum number of chunks allowed is 2^31. + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. * * @param device_uncompressed_chunk_ptrs [in] Array with size \p num_chunks of pointers * to the uncompressed data chunks. Both the pointers and the uncompressed data - * should reside in device-accessible memory. The uncompressed data must start - * at locations with alignments of the data type size. + * should reside in device-accessible memory. + * Each chunk must be aligned to the value in the {@code input} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedBitcompCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_uncompressed_chunk_bytes [in] Array with size \p num_chunks of * sizes of the uncompressed chunks in bytes. * The sizes should reside in device-accessible memory. - * Each chunk size MUST be a multiple of the size of the data type specified by - * format_opts.data_type, else this may crash or produce invalid output. - * @param max_uncompressed_chunk_bytes [in] This argument is not used. + * Each chunk size must be a multiple of the size of the data type specified by + * format_opts.data_type. + * @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk in the + * batch. This parameter is currently unused. + * Set it to either the actual value or zero. * @param num_chunks [in] Number of chunks of data to compress. * @param device_temp_ptr [in] This argument is not used. * @param temp_bytes [in] This argument is not used. @@ -802,7 +869,10 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * buffers should reside in device-accessible memory. Each compressed buffer * should be preallocated with the size given by * {@code nvcompBatchedBitcompCompressGetMaxOutputChunkSize}. - * Each compressed buffer should start at a location with 8-byte alignment. + * Each compressed buffer must be aligned to the value in the {@code output} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedBitcompCompressGetRequiredAlignments} when called with the same + * \p format_opts. 
* @param device_compressed_chunk_bytes [out] Array with size \p num_chunks, * to be filled with the compressed sizes of each chunk. * The buffer should be preallocated in device-accessible memory. @@ -820,7 +890,7 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @Cast("size_t") long temp_bytes, @Cast("void*const*") PointerPointer device_compressed_chunk_ptrs, @Cast("size_t*") SizeTPointer device_compressed_chunk_bytes, - @ByVal nvcompBatchedBitcompFormatOpts format_opts, + @ByVal nvcompBatchedBitcompOpts_t format_opts, CUstream_st stream); public static native @Cast("nvcompStatus_t") int nvcompBatchedBitcompCompressAsync( @Cast("const void*const*") @ByPtrPtr Pointer device_uncompressed_chunk_ptrs, @@ -831,9 +901,14 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @Cast("size_t") long temp_bytes, @Cast("void*const*") @ByPtrPtr Pointer device_compressed_chunk_ptrs, @Cast("size_t*") SizeTPointer device_compressed_chunk_bytes, - @ByVal nvcompBatchedBitcompFormatOpts format_opts, + @ByVal nvcompBatchedBitcompOpts_t format_opts, CUstream_st stream); +/** + * Minimum buffer alignment requirements for decompression. + */ +@MemberGetter public static native @Const @ByRef nvcompAlignmentRequirements_t nvcompBatchedBitcompDecompressRequiredAlignments(); + /** * \brief Get the amount of temporary memory required on the GPU for decompression. * @@ -871,14 +946,19 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * \brief Asynchronously compute the number of bytes of uncompressed data for * each compressed chunk. * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of * pointers in device-accessible memory to compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedBitcompDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] This argument is not used. * @param device_uncompressed_chunk_bytes [out] Array with size \p num_chunks * to be filled with the sizes, in bytes, of each uncompressed data chunk. * If there is an error when retrieving the size of a chunk, the * uncompressed size of that chunk will be set to 0. This argument needs to - * be prealloated in device-accessible memory. + * be preallocated in device-accessible memory. * @param num_chunks [in] Number of data chunks to compute sizes of. * @param stream [in] The CUDA stream to operate on. * @@ -900,19 +980,25 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous decompression. * - * NOTE: This function is used to decompress compressed buffers produced by + * This function is used to decompress compressed buffers produced by * {@code nvcompBatchedBitcompCompressAsync}. It can also decompress buffers * compressed with the standalone Bitcomp library. * - * NOTE: The function is not completely asynchronous, as it needs to look + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * + * \note The function is not completely asynchronous, as it needs to look * at the compressed data in order to create the proper bitcomp handle. * The stream is synchronized, the data is examined, then the asynchronous * decompression is launched. 
+ * + * \note An asynchronous, faster version of batched Bitcomp asynchrnous decompression + * is available, and can be launched via the HLIF manager. * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of pointers - * in device-accessible memory to compressed buffers. Each compressed buffer - * should reside in device-accessible memory and start at a location with - * 8-byte alignment. + * in device-accessible memory to device-accessible compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedBitcompDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] This argument is not used. * @param device_uncompressed_buffer_bytes [in] Array with size \p num_chunks of sizes, * in bytes, of the output buffers to be filled with uncompressed data for each chunk. @@ -929,7 +1015,9 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * @param device_uncompressed_chunk_ptrs [out] Array with size \p num_chunks of * pointers in device-accessible memory to decompressed data. Each uncompressed * buffer needs to be preallocated in device-accessible memory, have the size - * specified by the corresponding entry in device_uncompressed_buffer_bytes. + * specified by the corresponding entry in \p device_uncompressed_buffer_bytes, + * and be aligned to the value in + * {@code nvcompBatchedBitcompDecompressRequiredAlignments.output}. * @param device_statuses [out] Array with size \p num_chunks of statuses in * device-accessible memory. This argument needs to be preallocated. For each * chunk, if the decompression is successful, the status will be set to @@ -1041,9 +1129,6 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { // #include // #ifdef __cplusplus -// Targeting ../nvcomp/nvcompCascadedFormatOpts.java - - // Targeting ../nvcomp/nvcompBatchedCascadedOpts_t.java @@ -1054,12 +1139,25 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @MemberGetter public static native @Cast("const size_t") long nvcompCascadedCompressionMaxAllowedChunkSize(); /** - * This is the minimum alignment required for void type CUDA memory buffers - * passed to compression or decompression functions. Typed memory buffers must - * still be aligned to their type's size, e.g. 8 bytes for size_t. + * The most restrictive of minimum alignment requirements for void-type CUDA memory buffers + * used for input, output, or temporary memory, passed to compression or decompression functions. + * In all cases, typed memory buffers must still be aligned to their type's size, e.g., 4 bytes for {@code int}. */ @MemberGetter public static native @Cast("const size_t") long nvcompCascadedRequiredAlignment(); +/** + * \brief Get the minimum buffer alignment requirements for compression. + * + * @param format_opts [in] Compression options. + * @param alignment_requirements [out] The minimum buffer alignment requirements + * for compression. + * + * @return nvcompSuccess if successful, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedCascadedCompressGetRequiredAlignments( + @ByVal nvcompBatchedCascadedOpts_t format_opts, + nvcompAlignmentRequirements_t alignment_requirements); + /** * \brief Get the amount of temporary memory required on the GPU for compression. * @@ -1068,7 +1166,8 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * * @param num_chunks [in] The number of chunks of memory in the batch. 
* @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk in the - * batch. + * batch. This parameter is currently unused. Set it to either the actual value + * or zero. * @param format_opts [in] The Cascaded compression options and datatype to use. * @param temp_bytes [out] The amount of GPU memory that will be temporarily * required during compression. @@ -1090,7 +1189,8 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * * @param num_chunks [in] The number of chunks of memory in the batch. * @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk in the - * batch. + * batch. This parameter is currently unused. Set it to either the actual value + * or zero. * @param format_opts [in] The Cascaded compression options and datatype to use. * @param temp_bytes [out] The amount of GPU memory that will be temporarily * required during compression. @@ -1128,16 +1228,24 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * \note The current implementation does not support uncompressed size larger * than 4,294,967,295 bytes (max uint32_t). * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * * @param device_uncompressed_chunk_ptrs [in] Array with size \p num_chunks of pointers * to the uncompressed data chunks. Both the pointers and the uncompressed data - * should reside in device-accessible memory. The uncompressed data must start - * at locations with alignments of the data type size. + * should reside in device-accessible memory. + * Each chunk must be aligned to the value in the {@code input} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedCascadedCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_uncompressed_chunk_bytes [in] Array with size \p num_chunks of * sizes of the uncompressed chunks in bytes. * The sizes should reside in device-accessible memory. - * Each chunk size MUST be a multiple of the size of the data type specified by + * Each chunk size must be a multiple of the size of the data type specified by * format_opts.type, else this may crash or produce invalid output. - * @param max_uncompressed_chunk_bytes [in] This argument is not used. + * @param max_uncompressed_chunk_bytes [in] The size of the largest uncompressed chunk. + * This parameter is currently unused. Set it to either the actual value + * or zero. * @param num_chunks [in] Number of chunks of data to compress. * @param device_temp_ptr [in] This argument is not used. * @param temp_bytes [in] This argument is not used. @@ -1145,9 +1253,11 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * to the output compressed buffers. Both the pointers and the compressed * buffers should reside in device-accessible memory. Each compressed buffer * should be preallocated with the size given by - * {@code nvcompBatchedCascadedCompressGetMaxOutputChunkSize}. Each - * compressed buffer should start at a location with alignment of both 4B and - * the data type. + * {@code nvcompBatchedCascadedCompressGetMaxOutputChunkSize}. + * Each compressed buffer must be aligned to the value in the {@code output} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedCascadedCompressGetRequiredAlignments} when called with the same + * \p format_opts. 
* @param device_compressed_chunk_bytes [out] Array with size \p num_chunks, * to be filled with the compressed sizes of each chunk. * The buffer should be preallocated in device-accessible memory. @@ -1179,6 +1289,11 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @ByVal nvcompBatchedCascadedOpts_t format_opts, CUstream_st stream); +/** + * Minimum buffer alignment requirements for decompression. + */ +@MemberGetter public static native @Const @ByRef nvcompAlignmentRequirements_t nvcompBatchedCascadedDecompressRequiredAlignments(); + /** * \brief Get the amount of temporary memory required on the GPU for decompression. * @@ -1215,15 +1330,20 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * \brief Asynchronously compute the number of bytes of uncompressed data for * each compressed chunk. * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of * pointers in device-accessible memory to compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedCascadedDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes * of the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_chunk_bytes [out] Array with size \p num_chunks * to be filled with the sizes, in bytes, of each uncompressed data chunk. * If there is an error when retrieving the size of a chunk, the * uncompressed size of that chunk will be set to 0. This argument needs to - * be prealloated in device-accessible memory. + * be preallocated in device-accessible memory. * @param num_chunks [in] Number of data chunks to compute sizes of. * @param stream [in] The CUDA stream to operate on. * @@ -1245,13 +1365,16 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous decompression. * - * \note This function is used to decompress compressed buffers produced by + * This function is used to decompress compressed buffers produced by * {@code nvcompBatchedCascadedCompressAsync}. * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of pointers - * in device-accessible memory to compressed buffers. Each compressed buffer - * should reside in device-accessible memory and start at a location with - * alignment of both 4B and the data type. + * in device-accessible memory to device-accessible compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedCascadedDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes of * the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_buffer_bytes [in] Array with size \p num_chunks of sizes, @@ -1269,8 +1392,9 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * @param device_uncompressed_chunk_ptrs [out] Array with size \p num_chunks of * pointers in device-accessible memory to decompressed data. 
Each uncompressed * buffer needs to be preallocated in device-accessible memory, have the size - * specified by the corresponding entry in device_uncompressed_buffer_bytes, - * and start at a location with alignment of the data type. + * specified by the corresponding entry in \p device_uncompressed_buffer_bytes, + * and be aligned to the value in + * {@code nvcompBatchedCascadedDecompressRequiredAlignments.output}. * @param device_statuses [out] Array with size \p num_chunks of statuses in * device-accessible memory. This argument needs to be preallocated. For each * chunk, if the decompression is successful, the status will be set to @@ -1325,7 +1449,7 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @Cast("void*const*") @ByPtrPtr Pointer device_uncompressed_chunk_ptrs, @Cast("nvcompStatus_t*") int[] device_statuses, CUstream_st stream); - + // #ifdef __cplusplus // #endif @@ -1440,12 +1564,25 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @MemberGetter public static native @Cast("const size_t") long nvcompDeflateCompressionMaxAllowedChunkSize(); /** - * This is the minimum alignment required for void type CUDA memory buffers - * passed to compression or decompression functions. Typed memory buffers must - * still be aligned to their type's size, e.g. 8 bytes for size_t. + * The most restrictive of minimum alignment requirements for void-type CUDA memory buffers + * used for input, output, or temporary memory, passed to compression or decompression functions. + * In all cases, typed memory buffers must still be aligned to their type's size, e.g., 4 bytes for {@code int}. */ @MemberGetter public static native @Cast("const size_t") long nvcompDeflateRequiredAlignment(); +/** + * \brief Get the minimum buffer alignment requirements for compression. + * + * @param format_opts [in] Compression options. + * @param alignment_requirements [out] The minimum buffer alignment requirements + * for compression. + * + * @return nvcompSuccess if successful, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedDeflateCompressGetRequiredAlignments( + @ByVal nvcompBatchedDeflateOpts_t format_opts, + nvcompAlignmentRequirements_t alignment_requirements); + /** * \brief Get the amount of temporary memory required on the GPU for compression. * @@ -1517,19 +1654,28 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous compression. * - * The individual chunk size must not exceed - * 65536 bytes. For best performance, a chunk size of 65536 bytes is - * recommended. The output buffers must be 8-byte aligned. + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. * * @param device_uncompressed_chunk_ptrs [in] Array with size \p num_chunks of pointers * to the uncompressed data chunks. Both the pointers and the uncompressed data * should reside in device-accessible memory. + * Each chunk must be aligned to the value in the {@code input} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedDeflateCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_uncompressed_chunk_bytes [in] Array with size \p num_chunks of * sizes of the uncompressed chunks in bytes. * The sizes should reside in device-accessible memory. + * Chunk sizes must not exceed 65536 bytes. For best performance, a chunk size + * of 65536 bytes is recommended. 
* @param max_uncompressed_chunk_bytes [in] The size of the largest uncompressed chunk. * @param num_chunks [in] Number of chunks of data to compress. * @param device_temp_ptr [in] The temporary GPU workspace. + * Must be aligned to the value in the {@code temp} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedDeflateCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param temp_bytes [in] The size of the temporary GPU memory pointed to by * {@code device_temp_ptr}. * @param device_compressed_chunk_ptrs [out] Array with size \p num_chunks of pointers @@ -1537,6 +1683,10 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * buffers should reside in device-accessible memory. Each compressed buffer * should be preallocated with the size given by * {@code nvcompBatchedDeflateCompressGetMaxOutputChunkSize}. + * Each compressed buffer must be aligned to the value in the {@code output} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedDeflateCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_compressed_chunk_bytes [out] Array with size \p num_chunks, * to be filled with the compressed sizes of each chunk. * The buffer should be preallocated in device-accessible memory. @@ -1568,6 +1718,11 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @ByVal nvcompBatchedDeflateOpts_t format_opts, CUstream_st stream); +/** + * Minimum buffer alignment requirements for decompression. + */ +@MemberGetter public static native @Const @ByRef nvcompAlignmentRequirements_t nvcompBatchedDeflateDecompressRequiredAlignments(); + /** * \brief Get the amount of temporary memory required on the GPU for decompression. * @@ -1605,10 +1760,16 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * each compressed chunk. * * This is needed when we do not know the expected output size. - * NOTE: If the stream is corrupt, the sizes will be garbage. + * + * \note If the stream is corrupt, the calculated sizes will be invalid. + * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of * pointers in device-accessible memory to compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedDeflateDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes * of the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_chunk_bytes [out] Array with size \p num_chunks @@ -1634,13 +1795,17 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous decompression. * - * In the case where a chunk of compressed data is not a valid Deflate + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * + * \note In the case where a chunk of compressed data is not a valid Deflate * stream, 0 will be written for the size of the invalid chunk and * nvcompStatusCannotDecompress will be flagged for that chunk. * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of pointers - * in device-accessible memory to compressed buffers. Each compressed buffer - * should reside in device-accessible memory. 
+ * in device-accessible memory to device-accessible compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedDeflateDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes of * the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_buffer_bytes [in] Array with size \p num_chunks of sizes, @@ -1655,11 +1820,14 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * in which case the actual sizes are not reported. * @param num_chunks [in] Number of chunks of data to decompress. * @param device_temp_ptr [in] The temporary GPU space. + * Must be aligned to the value in {@code nvcompBatchedDeflateDecompressRequiredAlignments.temp}. * @param temp_bytes [in] The size of the temporary GPU space. * @param device_uncompressed_chunk_ptrs [out] Array with size \p num_chunks of * pointers in device-accessible memory to decompressed data. Each uncompressed * buffer needs to be preallocated in device-accessible memory, have the size - * specified by the corresponding entry in device_uncompressed_buffer_bytes. + * specified by the corresponding entry in \p device_uncompressed_buffer_bytes, + * and be aligned to the value in + * {@code nvcompBatchedDeflateDecompressRequiredAlignments.output}. * @param device_statuses [out] Array with size \p num_chunks of statuses in * device-accessible memory. This argument needs to be preallocated. For each * chunk, if the decompression is successful, the status will be set to @@ -1781,12 +1949,25 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @MemberGetter public static native @Cast("const size_t") long nvcompGdeflateCompressionMaxAllowedChunkSize(); /** - * This is the minimum alignment required for void type CUDA memory buffers - * passed to compression or decompression functions. Typed memory buffers must - * still be aligned to their type's size, e.g. 8 bytes for size_t. + * The most restrictive of minimum alignment requirements for void-type CUDA memory buffers + * used for input, output, or temporary memory, passed to compression or decompression functions. + * In all cases, typed memory buffers must still be aligned to their type's size, e.g., 4 bytes for {@code int}. */ @MemberGetter public static native @Cast("const size_t") long nvcompGdeflateRequiredAlignment(); +/** + * \brief Get the minimum buffer alignment requirements for compression. + * + * @param format_opts [in] Compression options. + * @param alignment_requirements [out] The minimum buffer alignment requirements + * for compression. + * + * @return nvcompSuccess if successful, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedGdeflateCompressGetRequiredAlignments( + @ByVal nvcompBatchedGdeflateOpts_t format_opts, + nvcompAlignmentRequirements_t alignment_requirements); + /** * \brief Get the amount of temporary memory required on the GPU for compression. * @@ -1858,19 +2039,28 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous compression. * - * The individual chunk size must not exceed - * 65536 bytes. For best performance, a chunk size of 65536 bytes is - * recommended. The output buffers must be 8-byte aligned. + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. 
* * @param device_uncompressed_chunk_ptrs [in] Array with size \p num_chunks of pointers * to the uncompressed data chunks. Both the pointers and the uncompressed data * should reside in device-accessible memory. + * Each chunk must be aligned to the value in the {@code input} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedGdeflateCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_uncompressed_chunk_bytes [in] Array with size \p num_chunks of * sizes of the uncompressed chunks in bytes. * The sizes should reside in device-accessible memory. + * Chunk sizes must not exceed 65536 bytes. For best performance, a chunk size + * of 65536 bytes is recommended. * @param max_uncompressed_chunk_bytes [in] The size of the largest uncompressed chunk. * @param num_chunks [in] Number of chunks of data to compress. * @param device_temp_ptr [in] The temporary GPU workspace. + * Must be aligned to the value in the {@code temp} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedGdeflateCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param temp_bytes [in] The size of the temporary GPU memory pointed to by * {@code device_temp_ptr}. * @param device_compressed_chunk_ptrs [out] Array with size \p num_chunks of pointers @@ -1878,6 +2068,10 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * buffers should reside in device-accessible memory. Each compressed buffer * should be preallocated with the size given by * {@code nvcompBatchedGdeflateCompressGetMaxOutputChunkSize}. + * Each compressed buffer must be aligned to the value in the {@code output} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedGdeflateCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_compressed_chunk_bytes [out] Array with size \p num_chunks, * to be filled with the compressed sizes of each chunk. * The buffer should be preallocated in device-accessible memory. @@ -1909,6 +2103,11 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @ByVal nvcompBatchedGdeflateOpts_t format_opts, CUstream_st stream); +/** + * Minimum buffer alignment requirements for decompression. + */ +@MemberGetter public static native @Const @ByRef nvcompAlignmentRequirements_t nvcompBatchedGdeflateDecompressRequiredAlignments(); + /** * \brief Get the amount of temporary memory required on the GPU for decompression. * @@ -1946,10 +2145,16 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * each compressed chunk. * * This is needed when we do not know the expected output size. - * NOTE: If the stream is corrupt, the sizes will be garbage. + * + * \note If the stream is corrupt, the calculated sizes will be invalid. + * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of * pointers in device-accessible memory to compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedGdeflateDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes * of the compressed buffers in bytes. The sizes should reside in device-accessible memory. 
* @param device_uncompressed_chunk_bytes [out] Array with size \p num_chunks @@ -1975,13 +2180,17 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous decompression. * - * In the case where a chunk of compressed data is not a valid GDeflate + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * + * \note In the case where a chunk of compressed data is not a valid GDeflate * stream, 0 will be written for the size of the invalid chunk and * nvcompStatusCannotDecompress will be flagged for that chunk. * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of pointers - * in device-accessible memory to compressed buffers. Each compressed buffer - * should reside in device-accessible memory. + * in device-accessible memory to device-accessible compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedGdeflateDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes of * the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_buffer_bytes [in] Array with size \p num_chunks of sizes, @@ -1996,11 +2205,14 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * in which case the actual sizes are not reported. * @param num_chunks [in] Number of chunks of data to decompress. * @param device_temp_ptr [in] The temporary GPU space. + * Must be aligned to the value in {@code nvcompBatchedGdeflateDecompressRequiredAlignments.temp}. * @param temp_bytes [in] The size of the temporary GPU space. * @param device_uncompressed_chunk_ptrs [out] Array with size \p num_chunks of * pointers in device-accessible memory to decompressed data. Each uncompressed * buffer needs to be preallocated in device-accessible memory, have the size - * specified by the corresponding entry in device_uncompressed_buffer_bytes. + * specified by the corresponding entry in \p device_uncompressed_buffer_bytes, + * and be aligned to the value in + * {@code nvcompBatchedGdeflateDecompressRequiredAlignments.output}. * @param device_statuses [out] Array with size \p num_chunks of statuses in * device-accessible memory. This argument needs to be preallocated. For each * chunk, if the decompression is successful, the status will be set to @@ -2122,6 +2334,138 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * Batched decompression interface for gzip *****************************************************************************/ +/** + * Minimum buffer alignment requirements for decompression. + */ +@MemberGetter public static native @Const @ByRef nvcompAlignmentRequirements_t nvcompBatchedGzipDecompressRequiredAlignments(); +// Targeting ../nvcomp/nvcompBatchedGzipOpts_t.java + + + +@MemberGetter public static native @Const @ByRef nvcompBatchedGzipOpts_t nvcompBatchedGzipDefaultOpts(); + +/** + * \brief Get the amount of temporary memory required on the GPU for compression. + * + * Chunk size must not exceed + * 65536 bytes. For best performance, a chunk size of 65536 bytes is + * recommended. + * + * @param num_chunks [in] The number of chunks of memory in the batch. + * @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk in the + * batch. + * @param format_opts [in] The Gzip compression options to use. 
+ * @param temp_bytes [out] The amount of GPU memory that will be temporarily + * required during compression. + * + * @return nvcompSuccess if successful, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedGzipCompressGetTempSize( + @Cast("size_t") long num_chunks, + @Cast("size_t") long max_uncompressed_chunk_bytes, + @ByVal nvcompBatchedGzipOpts_t format_opts, + @Cast("size_t*") SizeTPointer temp_bytes); + +/** + * \brief Get the amount of temporary memory required on the GPU for compression + * with extra total bytes argument. + * + * Chunk size must not exceed + * 65536 bytes. For best performance, a chunk size of 65536 bytes is + * recommended. + * + * @param num_chunks [in] The number of chunks of memory in the batch. + * @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk in the + * batch. + * @param format_opts [in] The Gzip compression options to use. + * @param temp_bytes [out] The amount of GPU memory that will be temporarily + * required during compression. + * @param max_total_uncompressed_bytes [in] Upper bound on the total uncompressed + * size of all chunks + * + * @return nvcompSuccess if successful, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedGzipCompressGetTempSizeEx( + @Cast("size_t") long num_chunks, + @Cast("size_t") long max_uncompressed_chunk_bytes, + @ByVal nvcompBatchedGzipOpts_t format_opts, + @Cast("size_t*") SizeTPointer temp_bytes, + @Cast("const size_t") long max_total_uncompressed_bytes); + +/** + * \brief Get the maximum size that a chunk of size at most max_uncompressed_chunk_bytes + * could compress to. That is, the minimum amount of output memory required to be given + * nvcompBatchedGzipCompressAsync() for each chunk. + * + * Chunk size must not exceed + * 65536 bytes. For best performance, a chunk size of 65536 bytes is + * recommended. + * + * @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk before compression. + * @param format_opts [in] The Gzip compression options to use. + * @param max_compressed_chunk_bytes [out] The maximum possible compressed size of the chunk. + * + * @return nvcompSuccess if successful, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedGzipCompressGetMaxOutputChunkSize( + @Cast("size_t") long max_uncompressed_chunk_bytes, + @ByVal nvcompBatchedGzipOpts_t format_opts, + @Cast("size_t*") SizeTPointer max_compressed_chunk_bytes); + +/** + * \brief Perform batched asynchronous compression. + * + * The individual chunk size must not exceed + * 65536 bytes. For best performance, a chunk size of 65536 bytes is + * recommended. The output buffers must be 8-byte aligned. + * + * @param device_uncompressed_chunk_ptrs [in] Array with size \p num_chunks of pointers + * to the uncompressed data chunks. Both the pointers and the uncompressed data + * should reside in device-accessible memory. + * @param device_uncompressed_chunk_bytes [in] Array with size \p num_chunks of + * sizes of the uncompressed chunks in bytes. + * The sizes should reside in device-accessible memory. + * @param max_uncompressed_chunk_bytes [in] The size of the largest uncompressed chunk. + * @param num_chunks [in] Number of chunks of data to compress. + * @param device_temp_ptr [in] The temporary GPU workspace. + * @param temp_bytes [in] The size of the temporary GPU memory pointed to by + * {@code device_temp_ptr}. 
+ * @param device_compressed_chunk_ptrs [out] Array with size \p num_chunks of pointers + * to the output compressed buffers. Both the pointers and the compressed + * buffers should reside in device-accessible memory. Each compressed buffer + * should be preallocated with the size given by + * {@code nvcompBatchedGzipCompressGetMaxOutputChunkSize}. + * @param device_compressed_chunk_bytes [out] Array with size \p num_chunks, + * to be filled with the compressed sizes of each chunk. + * The buffer should be preallocated in device-accessible memory. + * @param format_opts [in] The Gzip compression options to use. + * @param stream [in] The CUDA stream to operate on. + * + * @return nvcompSuccess if successfully launched, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedGzipCompressAsync( + @Cast("const void*const*") PointerPointer device_uncompressed_chunk_ptrs, + @Cast("const size_t*") SizeTPointer device_uncompressed_chunk_bytes, + @Cast("size_t") long max_uncompressed_chunk_bytes, + @Cast("size_t") long num_chunks, + Pointer device_temp_ptr, + @Cast("size_t") long temp_bytes, + @Cast("void*const*") PointerPointer device_compressed_chunk_ptrs, + @Cast("size_t*") SizeTPointer device_compressed_chunk_bytes, + @ByVal nvcompBatchedGzipOpts_t format_opts, + CUstream_st stream); +public static native @Cast("nvcompStatus_t") int nvcompBatchedGzipCompressAsync( + @Cast("const void*const*") @ByPtrPtr Pointer device_uncompressed_chunk_ptrs, + @Cast("const size_t*") SizeTPointer device_uncompressed_chunk_bytes, + @Cast("size_t") long max_uncompressed_chunk_bytes, + @Cast("size_t") long num_chunks, + Pointer device_temp_ptr, + @Cast("size_t") long temp_bytes, + @Cast("void*const*") @ByPtrPtr Pointer device_compressed_chunk_ptrs, + @Cast("size_t*") SizeTPointer device_compressed_chunk_bytes, + @ByVal nvcompBatchedGzipOpts_t format_opts, + CUstream_st stream); + /** * \brief Get the amount of temporary memory required on the GPU for decompression. * @@ -2159,10 +2503,16 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * each compressed chunk. * * This is needed when we do not know the expected output size. - * NOTE: If the stream is corrupt, the sizes will be garbage. + * + * \note If the stream is corrupt, the calculated sizes will be invalid. + * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of * pointers in device-accessible memory to compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedGzipDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes * of the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_chunk_bytes [out] Array with size \p num_chunks @@ -2188,13 +2538,17 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous decompression. * - * In the case where a chunk of compressed data is not a valid gzip + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * + * \note In the case where a chunk of compressed data is not a valid Deflate * stream, 0 will be written for the size of the invalid chunk and * nvcompStatusCannotDecompress will be flagged for that chunk. 
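For orientation, a hypothetical end-to-end call sequence for the batched Gzip compression entry points declared above might look as follows in Java. All device-side arrays and buffers are assumed to have been allocated in device-accessible memory by the caller, and 0 is assumed to equal nvcompSuccess; this is a sketch of the call order only, not part of the generated bindings:

import org.bytedeco.javacpp.Pointer;
import org.bytedeco.javacpp.PointerPointer;
import org.bytedeco.javacpp.SizeTPointer;
import org.bytedeco.cuda.cudart.CUstream_st;
import org.bytedeco.cuda.nvcomp.nvcompBatchedGzipOpts_t;
import static org.bytedeco.cuda.global.nvcomp.*;

public class GzipBatchedCompressSketch {
    /** Launches batched Gzip compression; all pointer arguments wrap device-accessible memory. */
    public static void compress(PointerPointer dUncompChunkPtrs, SizeTPointer dUncompChunkBytes,
                                long maxUncompChunkBytes, long numChunks,
                                Pointer dTemp, long tempBytes,
                                PointerPointer dCompChunkPtrs, SizeTPointer dCompChunkBytes,
                                CUstream_st stream) {
        nvcompBatchedGzipOpts_t opts = nvcompBatchedGzipDefaultOpts();

        // 1. Ask how much temporary device memory the launch needs.
        SizeTPointer requiredTemp = new SizeTPointer(1);
        int status = nvcompBatchedGzipCompressGetTempSize(numChunks, maxUncompChunkBytes, opts, requiredTemp);
        if (status != 0 || requiredTemp.get() > tempBytes) {
            throw new IllegalStateException("temp size query failed or workspace too small");
        }

        // 2. Worst-case compressed chunk size; each output buffer must be at least this large.
        SizeTPointer maxCompChunkBytes = new SizeTPointer(1);
        status = nvcompBatchedGzipCompressGetMaxOutputChunkSize(maxUncompChunkBytes, opts, maxCompChunkBytes);
        if (status != 0) {
            throw new IllegalStateException("max output chunk size query failed: " + status);
        }

        // 3. Launch the asynchronous batched compression on the given CUDA stream.
        status = nvcompBatchedGzipCompressAsync(dUncompChunkPtrs, dUncompChunkBytes,
                maxUncompChunkBytes, numChunks, dTemp, tempBytes,
                dCompChunkPtrs, dCompChunkBytes, opts, stream);
        if (status != 0) {
            throw new IllegalStateException("compress launch failed: " + status);
        }
        // The caller synchronizes the stream before reading dCompChunkBytes back from the device.
    }
}

The decompression path follows the same shape, with each input buffer honoring nvcompBatchedGzipDecompressRequiredAlignments.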
* * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of pointers - * in device-accessible memory to compressed buffers. Each compressed buffer - * should reside in device-accessible memory. + * in device-accessible memory to device-accessible compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedGzipDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes of * the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_buffer_bytes [in] Array with size \p num_chunks of sizes, @@ -2209,11 +2563,14 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * in which case the actual sizes are not reported. * @param num_chunks [in] Number of chunks of data to decompress. * @param device_temp_ptr [in] The temporary GPU space. + * Must be aligned to the value in {@code nvcompBatchedGzipDecompressRequiredAlignments.temp}. * @param temp_bytes [in] The size of the temporary GPU space. * @param device_uncompressed_chunk_ptrs [out] Array with size \p num_chunks of * pointers in device-accessible memory to decompressed data. Each uncompressed * buffer needs to be preallocated in device-accessible memory, have the size - * specified by the corresponding entry in device_uncompressed_buffer_bytes. + * specified by the corresponding entry in \p device_uncompressed_buffer_bytes, + * and be aligned to the value in + * {@code nvcompBatchedGzipDecompressRequiredAlignments.output}. * @param device_statuses [out] Array with size \p num_chunks of statuses in * device-accessible memory. This argument needs to be preallocated. For each * chunk, if the decompression is successful, the status will be set to @@ -2299,9 +2656,6 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { // #include // #ifdef __cplusplus -// Targeting ../nvcomp/nvcompLZ4FormatOpts.java - - // Targeting ../nvcomp/nvcompBatchedLZ4Opts_t.java @@ -2311,9 +2665,9 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @MemberGetter public static native @Cast("const size_t") long nvcompLZ4CompressionMaxAllowedChunkSize(); /** - * This is the minimum alignment required for void type CUDA memory buffers - * passed to compression or decompression functions. Typed memory buffers must - * still be aligned to their type's size, e.g. 8 bytes for size_t. + * The most restrictive of minimum alignment requirements for void-type CUDA memory buffers + * used for input, output, or temporary memory, passed to compression or decompression functions. + * In all cases, typed memory buffers must still be aligned to their type's size, e.g., 4 bytes for {@code int}. */ @MemberGetter public static native @Cast("const size_t") long nvcompLZ4RequiredAlignment(); @@ -2321,6 +2675,19 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * Batched compression/decompression interface *****************************************************************************/ +/** + * \brief Get the minimum buffer alignment requirements for compression. + * + * @param format_opts [in] Compression options. + * @param alignment_requirements [out] The minimum buffer alignment requirements + * for compression. + * + * @return nvcompSuccess if successful, and an error code otherwise. 
+ */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedLZ4CompressGetRequiredAlignments( + @ByVal nvcompBatchedLZ4Opts_t format_opts, + nvcompAlignmentRequirements_t alignment_requirements); + /** * \brief Get the amount of temporary memory required on the GPU for compression. * @@ -2389,23 +2756,30 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous compression. * - * The individual chunk size must not exceed 16777216 bytes. - * For best performance, a chunk size of 65536 bytes is recommended. + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. * * @param device_uncompressed_chunk_ptrs [in] Array with size \p num_chunks of pointers * to the uncompressed data chunks. Both the pointers and the uncompressed data * should reside in device-accessible memory. + * Each chunk must be aligned to the value in the {@code input} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedLZ4CompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_uncompressed_chunk_bytes [in] Array with size \p num_chunks of * sizes of the uncompressed chunks in bytes. * The sizes should reside in device-accessible memory. - * Each chunk size MUST be a multiple of the size of the data type specified by - * format_opts.data_type, else this may crash or produce invalid output. + * Each chunk size must be a multiple of the size of the data type specified by + * format_opts.data_type. + * Chunk sizes must not exceed 16777216 bytes. For best performance, a chunk size + * of 65536 bytes is recommended. * @param max_uncompressed_chunk_bytes [in] The size of the largest uncompressed chunk. - * This parameter is currently unused, so if it is not set - * with the maximum size, it should be set to zero. If a future version makes - * use of it, it will return an error if it is set to zero. * @param num_chunks [in] Number of chunks of data to compress. * @param device_temp_ptr [in] The temporary GPU workspace. + * Must be aligned to the value in the {@code temp} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedLZ4CompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param temp_bytes [in] The size of the temporary GPU memory pointed to by * {@code device_temp_ptr}. * @param device_compressed_chunk_ptrs [out] Array with size \p num_chunks of pointers @@ -2413,6 +2787,10 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * buffers should reside in device-accessible memory. Each compressed buffer * should be preallocated with the size given by * {@code nvcompBatchedLZ4CompressGetMaxOutputChunkSize}. + * Each compressed buffer must be aligned to the value in the {@code output} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedLZ4CompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_compressed_chunk_bytes [out] Array with size \p num_chunks, * to be filled with the compressed sizes of each chunk. * The buffer should be preallocated in device-accessible memory. @@ -2444,6 +2822,11 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @ByVal nvcompBatchedLZ4Opts_t format_opts, CUstream_st stream); +/** + * Minimum buffer alignment requirements for decompression. 
+ */ +@MemberGetter public static native @Const @ByRef nvcompAlignmentRequirements_t nvcompBatchedLZ4DecompressRequiredAlignments(); + /** * \brief Get the amount of temporary memory required on the GPU for decompression. * @@ -2481,15 +2864,21 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * each compressed chunk. * * This is needed when we do not know the expected output size. - * NOTE: If the stream is corrupt, the sizes will be garbage. + * + * \note If the stream is corrupt, the calculated sizes will be invalid. + * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of * pointers in device-accessible memory to compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedLZ4DecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes * of the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_chunk_bytes [out] Array with size \p num_chunks * to be filled with the sizes, in bytes, of each uncompressed data chunk. - * This argument needs to be prealloated in device-accessible memory. + * This argument needs to be preallocated in device-accessible memory. * @param num_chunks [in] Number of data chunks to compute sizes of. * @param stream [in] The CUDA stream to operate on. * @@ -2511,13 +2900,17 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous decompression. * - * In the case where a chunk of compressed data is not a valid LZ4 + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * + * \note In the case where a chunk of compressed data is not a valid LZ4 * block, 0 will be written for the size of the invalid chunk and * nvcompStatusCannotDecompress will be flagged for that chunk. * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of pointers - * in device-accessible memory to compressed buffers. Each compressed buffer - * should reside in device-accessible memory. + * in device-accessible memory to device-accessible compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedLZ4DecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes of * the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_buffer_bytes [in] Array with size \p num_chunks of sizes, @@ -2532,11 +2925,14 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * in which case the actual sizes are not reported. * @param num_chunks [in] Number of chunks of data to decompress. * @param device_temp_ptr [in] The temporary GPU space. + * Must be aligned to the value in {@code nvcompBatchedLZ4DecompressRequiredAlignments.temp}. * @param temp_bytes [in] The size of the temporary GPU space. * @param device_uncompressed_chunk_ptrs [out] Array with size \p num_chunks of * pointers in device-accessible memory to decompressed data. Each uncompressed * buffer needs to be preallocated in device-accessible memory, have the size - * specified by the corresponding entry in device_uncompressed_buffer_bytes. 
+ * specified by the corresponding entry in \p device_uncompressed_buffer_bytes, + * and be aligned to the value in + * {@code nvcompBatchedLZ4DecompressRequiredAlignments.output}. * @param device_statuses [out] Array with size \p num_chunks of statuses in * device-accessible memory. This argument needs to be preallocated. For each * chunk, if the decompression is successful, the status will be set to @@ -2660,20 +3056,32 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @MemberGetter public static native @Cast("const size_t") long nvcompSnappyCompressionMaxAllowedChunkSize(); /** - * This is the minimum alignment required for void type CUDA memory buffers - * passed to compression or decompression functions. Typed memory buffers must - * still be aligned to their type's size, e.g. 8 bytes for size_t. - * - * The Snappy compressor supports unaligned data, so this value is 1. + * The most restrictive of minimum alignment requirements for void-type CUDA memory buffers + * used for input, output, or temporary memory, passed to compression or decompression functions. + * In all cases, typed memory buffers must still be aligned to their type's size, e.g., 4 bytes for {@code int}. */ @MemberGetter public static native @Cast("const size_t") long nvcompSnappyRequiredAlignment(); +/** + * \brief Get the minimum buffer alignment requirements for compression. + * + * @param format_opts [in] Compression options. + * @param alignment_requirements [out] The minimum buffer alignment requirements + * for compression. + * + * @return nvcompSuccess if successful, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedSnappyCompressGetRequiredAlignments( + @ByVal nvcompBatchedSnappyOpts_t format_opts, + nvcompAlignmentRequirements_t alignment_requirements); + /** * \brief Get the amount of temporary memory required on the GPU for compression. * * @param num_chunks [in] The number of chunks of memory in the batch. * @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk in the - * batch. + * batch. This parameter is currently unused. Set it to either the actual value + * or zero. * @param format_opts [in] Snappy compression options. * @param temp_bytes [out] The amount of GPU memory that will be temporarily * required during compression. @@ -2692,7 +3100,8 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * * @param num_chunks [in] The number of chunks of memory in the batch. * @param max_uncompressed_chunk_bytes [in] The maximum size of a chunk in the - * batch. + * batch. This parameter is currently unused. Set it to either the actual value + * or zero. * @param format_opts [in] Snappy compression options. * @param temp_bytes [out] The amount of GPU memory that will be temporarily * required during compression. @@ -2727,19 +3136,29 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous compression. * - * The caller is responsible for passing device_compressed_chunk_bytes of size - * sufficient to hold compressed data + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. * * @param device_uncompressed_chunk_ptrs [in] Array with size \p num_chunks of pointers * to the uncompressed data chunks. Both the pointers and the uncompressed data * should reside in device-accessible memory. 
+ * Each chunk must be aligned to the value in the {@code input} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedSnappyCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_uncompressed_chunk_bytes [in] Array with size \p num_chunks of * sizes of the uncompressed chunks in bytes. * The sizes should reside in device-accessible memory. * @param max_uncompressed_chunk_bytes [in] The size of the largest uncompressed chunk. + * This parameter is currently unused. Set it to either the actual value + * or zero. * @param num_chunks [in] Number of chunks of data to compress. * @param device_temp_ptr [in] The temporary GPU workspace, could be NULL in case * temporary memory is not needed. + * Must be aligned to the value in the {@code temp} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedSnappyCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param temp_bytes [in] The size of the temporary GPU memory pointed to by * {@code device_temp_ptr}. * @param device_compressed_chunk_ptrs [out] Array with size \p num_chunks of pointers @@ -2747,6 +3166,10 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * buffers should reside in device-accessible memory. Each compressed buffer * should be preallocated with the size given by * {@code nvcompBatchedSnappyCompressGetMaxOutputChunkSize}. + * Each compressed buffer must be aligned to the value in the {@code output} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedSnappyCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_compressed_chunk_bytes [out] Array with size \p num_chunks, * to be filled with the compressed sizes of each chunk. * The buffer should be preallocated in device-accessible memory. @@ -2778,12 +3201,17 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @ByVal nvcompBatchedSnappyOpts_t format_opts, CUstream_st stream); +/** + * Minimum buffer alignment requirements for decompression. + */ +@MemberGetter public static native @Const @ByRef nvcompAlignmentRequirements_t nvcompBatchedSnappyDecompressRequiredAlignments(); + /** * \brief Get the amount of temporary memory required on the GPU for decompression. * * @param num_chunks [in] Number of chunks of data to be decompressed. * @param max_uncompressed_chunk_bytes [in] The size of the largest chunk in bytes - * when uncompressed. + * when uncompressed. * @param temp_bytes [out] The amount of GPU memory that will be temporarily required * during decompression. * @@ -2814,15 +3242,20 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * \brief Asynchronously compute the number of bytes of uncompressed data for * each compressed chunk. * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of * pointers in device-accessible memory to compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedSnappyDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes * of the compressed buffers in bytes. The sizes should reside in device-accessible memory. 
* @param device_uncompressed_chunk_bytes [out] Array with size \p num_chunks * to be filled with the sizes, in bytes, of each uncompressed data chunk. * If there is an error when retrieving the size of a chunk, the * uncompressed size of that chunk will be set to 0. This argument needs to - * be prealloated in device-accessible memory. + * be preallocated in device-accessible memory. * @param num_chunks [in] Number of data chunks to compute sizes of. * @param stream [in] The CUDA stream to operate on. * @@ -2844,9 +3277,13 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous decompression. * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of pointers - * in device-accessible memory to compressed buffers. Each compressed buffer - * should reside in device-accessible memory. + * in device-accessible memory to device-accessible compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedSnappyDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes of * the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_buffer_bytes [in] Array with size \p num_chunks of sizes, @@ -2861,11 +3298,14 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * in which case the actual sizes are not reported. * @param num_chunks [in] Number of chunks of data to decompress. * @param device_temp_ptr [in] The temporary GPU space, could be NULL in case temporary space is not needed. + * Must be aligned to the value in {@code nvcompBatchedSnappyDecompressRequiredAlignments.temp}. * @param temp_bytes [in] The size of the temporary GPU space. * @param device_uncompressed_chunk_ptrs [out] Array with size \p num_chunks of * pointers in device-accessible memory to decompressed data. Each uncompressed * buffer needs to be preallocated in device-accessible memory, have the size - * specified by the corresponding entry in device_uncompressed_buffer_bytes. + * specified by the corresponding entry in \p device_uncompressed_buffer_bytes, + * and be aligned to the value in + * {@code nvcompBatchedSnappyDecompressRequiredAlignments.output}. * @param device_statuses [out] Array with size \p num_chunks of statuses in * device-accessible memory. This argument needs to be preallocated. For each * chunk, if the decompression is successful, the status will be set to @@ -2991,12 +3431,25 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @MemberGetter public static native @Cast("const size_t") long nvcompZstdCompressionMaxAllowedChunkSize(); /** - * This is the minimum alignment required for void type CUDA memory buffers - * passed to compression or decompression functions. Typed memory buffers must - * still be aligned to their type's size, e.g. 8 bytes for size_t. + * The most restrictive of minimum alignment requirements for void-type CUDA memory buffers + * used for input, output, or temporary memory, passed to compression or decompression functions. + * In all cases, typed memory buffers must still be aligned to their type's size, e.g., 4 bytes for {@code int}. */ @MemberGetter public static native @Cast("const size_t") long nvcompZstdRequiredAlignment(); +/** + * \brief Get the minimum buffer alignment requirements for compression. 
+ * + * @param format_opts [in] Compression options. + * @param alignment_requirements [out] The minimum buffer alignment requirements + * for compression. + * + * @return nvcompSuccess if successful, and an error code otherwise. + */ +public static native @Cast("nvcompStatus_t") int nvcompBatchedZstdCompressGetRequiredAlignments( + @ByVal nvcompBatchedZstdOpts_t format_opts, + nvcompAlignmentRequirements_t alignment_requirements); + /** * \brief Get the amount of temporary memory required on the GPU for compression. * @@ -3069,22 +3522,29 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous compression. * - * The individual chunk size must not exceed 16 MB. - * For best performance, a chunk size of 64 KB is recommended. + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. * * @param device_uncompressed_chunk_ptrs [in] Array with size \p num_chunks of pointers * to the uncompressed data chunks. Both the pointers and the uncompressed data * should reside in device-accessible memory. + * Each chunk must be aligned to the value in the {@code input} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedZstdCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_uncompressed_chunk_bytes [in] Array with size \p num_chunks of * sizes of the uncompressed chunks in bytes. * The sizes should reside in device-accessible memory. + * Chunk sizes must not exceed 16 MB. For best performance, a chunk size of + * 64 KB is recommended. * @param max_uncompressed_chunk_bytes [in] The size of the largest uncompressed chunk. - * This parameter is currently unused, so if it is not set - * with the maximum size, it should be set to zero. If a future version makes - * use of it, it will return an error if it is set to zero. * @param num_chunks [in] Number of chunks of data to compress. * @param device_temp_ptr [in] The temporary GPU workspace, could be NULL in case * temporary memory is not needed. + * Must be aligned to the value in the {@code temp} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedZstdCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param temp_bytes [in] The size of the temporary GPU memory pointed to by * {@code device_temp_ptr}. * @param device_compressed_chunk_ptrs [out] Array with size \p num_chunks of pointers @@ -3092,6 +3552,10 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * buffers should reside in device-accessible memory. Each compressed buffer * should be preallocated with the size given by * {@code nvcompBatchedZstdCompressGetMaxOutputChunkSize}. + * Each compressed buffer must be aligned to the value in the {@code output} member of the + * \ref nvcompAlignmentRequirements_t object output by + * {@code nvcompBatchedZstdCompressGetRequiredAlignments} when called with the same + * \p format_opts. * @param device_compressed_chunk_bytes [out] Array with size \p num_chunks, * to be filled with the compressed sizes of each chunk. * The buffer should be preallocated in device-accessible memory. @@ -3123,6 +3587,11 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { @ByVal nvcompBatchedZstdOpts_t format_opts, CUstream_st stream); +/** + * Minimum buffer alignment requirements for decompression. 
+ */ +@MemberGetter public static native @Const @ByRef nvcompAlignmentRequirements_t nvcompBatchedZstdDecompressRequiredAlignments(); + /** * \brief Get the amount of temporary memory required on the GPU for decompression. * @@ -3158,15 +3627,20 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * \brief Asynchronously compute the number of bytes of uncompressed data for * each compressed chunk. * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of * pointers in device-accessible memory to compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedZstdDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes * of the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_chunk_bytes [out] Array with size \p num_chunks * to be filled with the sizes, in bytes, of each uncompressed data chunk. * If there is an error when retrieving the size of a chunk, the * uncompressed size of that chunk will be set to 0. This argument needs to - * be prealloated in device-accessible memory. + * be preallocated in device-accessible memory. * @param num_chunks [in] Number of data chunks to compute sizes of. * @param stream [in] The CUDA stream to operate on. * @@ -3188,9 +3662,13 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { /** * \brief Perform batched asynchronous decompression. * + * \note Violating any of the conditions listed in the parameter descriptions + * below may result in undefined behaviour. + * * @param device_compressed_chunk_ptrs [in] Array with size \p num_chunks of pointers - * in device-accessible memory to compressed buffers. Each compressed buffer - * should reside in device-accessible memory. + * in device-accessible memory to device-accessible compressed buffers. + * Each buffer must be aligned to the value in + * {@code nvcompBatchedZstdDecompressRequiredAlignments.input}. * @param device_compressed_chunk_bytes [in] Array with size \p num_chunks of sizes of * the compressed buffers in bytes. The sizes should reside in device-accessible memory. * @param device_uncompressed_buffer_bytes [in] Array with size \p num_chunks of sizes, @@ -3203,11 +3681,14 @@ public class nvcomp extends org.bytedeco.cuda.presets.nvcomp { * be filled with the actual number of bytes decompressed for every chunk. * @param num_chunks [in] Number of chunks of data to decompress. * @param device_temp_ptr [in] The temporary GPU space, could be NULL in case temporary space is not needed. + * Must be aligned to the value in {@code nvcompBatchedZstdDecompressRequiredAlignments.temp}. * @param temp_bytes [in] The size of the temporary GPU space. * @param device_uncompressed_chunk_ptrs [out] Array with size \p num_chunks of * pointers in device-accessible memory to decompressed data. Each uncompressed * buffer needs to be preallocated in device-accessible memory, have the size - * specified by the corresponding entry in device_uncompressed_buffer_bytes. + * specified by the corresponding entry in \p device_uncompressed_buffer_bytes, + * and be aligned to the value in + * {@code nvcompBatchedZstdDecompressRequiredAlignments.output}. * @param device_statuses [out] Array with size \p num_chunks of statuses in * device-accessible memory. This argument needs to be preallocated. 
For each * chunk, if the decompression is successful, the status will be set to diff --git a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/BitcompManager.java b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/BitcompManager.java index 3d9560a719f..9d4d2f94c28 100644 --- a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/BitcompManager.java +++ b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/BitcompManager.java @@ -30,32 +30,32 @@ public class BitcompManager extends PimplManager { // If user_stream is specified, the lifetime of the BitcompManager instance must // extend beyond that of the user_stream public BitcompManager( - @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompFormatOpts format_opts, + @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompOpts_t format_opts, CUstream_st user_stream/*=0*/, @Cast("nvcomp::ChecksumPolicy") int checksum_policy/*=nvcomp::NoComputeNoVerify*/, @Cast("nvcomp::BitstreamKind") int bitstream_kind/*=nvcomp::BitstreamKind::NVCOMP_NATIVE*/) { super((Pointer)null); allocate(uncomp_chunk_size, format_opts, user_stream, checksum_policy, bitstream_kind); } private native void allocate( - @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompFormatOpts format_opts, + @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompOpts_t format_opts, CUstream_st user_stream/*=0*/, @Cast("nvcomp::ChecksumPolicy") int checksum_policy/*=nvcomp::NoComputeNoVerify*/, @Cast("nvcomp::BitstreamKind") int bitstream_kind/*=nvcomp::BitstreamKind::NVCOMP_NATIVE*/); public BitcompManager( - @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompFormatOpts format_opts) { super((Pointer)null); allocate(uncomp_chunk_size, format_opts); } + @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompOpts_t format_opts) { super((Pointer)null); allocate(uncomp_chunk_size, format_opts); } private native void allocate( - @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompFormatOpts format_opts); + @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompOpts_t format_opts); // This signature is deprecated, in favour of the one that does not accept a // device_id, and instead gets the device from the stream. 
@Deprecated public BitcompManager( - @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompFormatOpts format_opts, + @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompOpts_t format_opts, CUstream_st user_stream, int device_id, @Cast("nvcomp::ChecksumPolicy") int checksum_policy/*=nvcomp::NoComputeNoVerify*/, @Cast("nvcomp::BitstreamKind") int bitstream_kind/*=nvcomp::BitstreamKind::NVCOMP_NATIVE*/) { super((Pointer)null); allocate(uncomp_chunk_size, format_opts, user_stream, device_id, checksum_policy, bitstream_kind); } @Deprecated private native void allocate( - @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompFormatOpts format_opts, + @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompOpts_t format_opts, CUstream_st user_stream, int device_id, @Cast("nvcomp::ChecksumPolicy") int checksum_policy/*=nvcomp::NoComputeNoVerify*/, @Cast("nvcomp::BitstreamKind") int bitstream_kind/*=nvcomp::BitstreamKind::NVCOMP_NATIVE*/); @Deprecated public BitcompManager( - @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompFormatOpts format_opts, + @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompOpts_t format_opts, CUstream_st user_stream, int device_id) { super((Pointer)null); allocate(uncomp_chunk_size, format_opts, user_stream, device_id); } @Deprecated private native void allocate( - @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompFormatOpts format_opts, + @Cast("size_t") long uncomp_chunk_size, @Const @ByRef nvcompBatchedBitcompOpts_t format_opts, CUstream_st user_stream, int device_id); } diff --git a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/DeflateFormatSpecHeader.java b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/DeflateFormatSpecHeader.java index 0ddcdc6b040..3f09f51be05 100644 --- a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/DeflateFormatSpecHeader.java +++ b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/DeflateFormatSpecHeader.java @@ -36,6 +36,7 @@ public class DeflateFormatSpecHeader extends Pointer { /** * Compression algorithm to use. Permitted values are: + * - 0: highest-throughput, entropy-only compression (use for symmetric compression/decompression performance) * - 1: high-throughput, low compression ratio (default) * - 2: medium-througput, medium compression ratio, beat Zlib level 1 on the compression ratio * - 3: placeholder for further compression level support, will fall into MEDIUM_COMPRESSION at this point diff --git a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompAlignmentRequirements_t.java b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompAlignmentRequirements_t.java new file mode 100644 index 00000000000..dd81941eb06 --- /dev/null +++ b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompAlignmentRequirements_t.java @@ -0,0 +1,45 @@ +// Targeted by JavaCPP version 1.5.11-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.cuda.nvcomp; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.javacpp.presets.javacpp.*; +import org.bytedeco.cuda.cudart.*; +import static org.bytedeco.cuda.global.cudart.*; + +import static org.bytedeco.cuda.global.nvcomp.*; + + +/** + * \brief Per-algorithm buffer alignment requirements. + */ +@Properties(inherit = org.bytedeco.cuda.presets.nvcomp.class) +public class nvcompAlignmentRequirements_t extends Pointer { + static { Loader.load(); } + /** Default native constructor. 
*/ + public nvcompAlignmentRequirements_t() { super((Pointer)null); allocate(); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public nvcompAlignmentRequirements_t(long size) { super((Pointer)null); allocateArray(size); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public nvcompAlignmentRequirements_t(Pointer p) { super(p); } + private native void allocate(); + private native void allocateArray(long size); + @Override public nvcompAlignmentRequirements_t position(long position) { + return (nvcompAlignmentRequirements_t)super.position(position); + } + @Override public nvcompAlignmentRequirements_t getPointer(long i) { + return new nvcompAlignmentRequirements_t((Pointer)this).offsetAddress(i); + } + + /** Minimum alignment requirement of each input buffer. */ + public native @Cast("size_t") long input(); public native nvcompAlignmentRequirements_t input(long setter); + /** Minimum alignment requirement of each output buffer. */ + public native @Cast("size_t") long output(); public native nvcompAlignmentRequirements_t output(long setter); + /** Minimum alignment requirement of temporary-storage buffer, if any. For + * algorithms that do not use temporary storage, this field is always equal + * to 1. */ + public native @Cast("size_t") long temp(); public native nvcompAlignmentRequirements_t temp(long setter); +} diff --git a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedBitcompFormatOpts.java b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedBitcompOpts_t.java similarity index 64% rename from cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedBitcompFormatOpts.java rename to cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedBitcompOpts_t.java index 1715c26fa8f..9e52603b99a 100644 --- a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedBitcompFormatOpts.java +++ b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedBitcompOpts_t.java @@ -22,33 +22,33 @@ * \brief Structure for configuring Bitcomp compression. */ @Properties(inherit = org.bytedeco.cuda.presets.nvcomp.class) -public class nvcompBatchedBitcompFormatOpts extends Pointer { +public class nvcompBatchedBitcompOpts_t extends Pointer { static { Loader.load(); } /** Default native constructor. */ - public nvcompBatchedBitcompFormatOpts() { super((Pointer)null); allocate(); } + public nvcompBatchedBitcompOpts_t() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ - public nvcompBatchedBitcompFormatOpts(long size) { super((Pointer)null); allocateArray(size); } + public nvcompBatchedBitcompOpts_t(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ - public nvcompBatchedBitcompFormatOpts(Pointer p) { super(p); } + public nvcompBatchedBitcompOpts_t(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); - @Override public nvcompBatchedBitcompFormatOpts position(long position) { - return (nvcompBatchedBitcompFormatOpts)super.position(position); + @Override public nvcompBatchedBitcompOpts_t position(long position) { + return (nvcompBatchedBitcompOpts_t)super.position(position); } - @Override public nvcompBatchedBitcompFormatOpts getPointer(long i) { - return new nvcompBatchedBitcompFormatOpts((Pointer)this).offsetAddress(i); + @Override public nvcompBatchedBitcompOpts_t getPointer(long i) { + return new nvcompBatchedBitcompOpts_t((Pointer)this).offsetAddress(i); } /** * \brief Bitcomp algorithm options. * * - 0 : Default algorithm, usually gives the best compression ratios - * - 1 : "Sparse" algorithm, works well on sparse data (with lots of zeroes). - * and is usually a faster than the default algorithm. + * - 1 : "Sparse" algorithm, works well on sparse data (with lots of zeroes) + * and is usually faster than the default algorithm. */ - public native int algorithm_type(); public native nvcompBatchedBitcompFormatOpts algorithm_type(int setter); + public native int algorithm_type(); public native nvcompBatchedBitcompOpts_t algorithm_type(int setter); /** * \brief One of nvcomp's possible data types */ - public native @Cast("nvcompType_t") int data_type(); public native nvcompBatchedBitcompFormatOpts data_type(int setter); + public native @Cast("nvcompType_t") int data_type(); public native nvcompBatchedBitcompOpts_t data_type(int setter); } diff --git a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedCascadedOpts_t.java b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedCascadedOpts_t.java index 477df87ead7..55fa625429a 100644 --- a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedCascadedOpts_t.java +++ b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedCascadedOpts_t.java @@ -12,6 +12,7 @@ import static org.bytedeco.cuda.global.nvcomp.*; +// #endif /****************************************************************************** * Batched compression/decompression interface @@ -39,7 +40,7 @@ public class nvcompBatchedCascadedOpts_t extends Pointer { } /** - * \brief The size of each internal chunk of data to decompress indepentently with + * \brief The size of each internal chunk of data to decompress independently with * * Cascaded compression. The value should be in the range of [512, 16384] * depending on the datatype of the input and the shared memory size of diff --git a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedDeflateOpts_t.java b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedDeflateOpts_t.java index 54a9a58681b..ed9b0f728f3 100644 --- a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedDeflateOpts_t.java +++ b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedDeflateOpts_t.java @@ -41,6 +41,7 @@ public class nvcompBatchedDeflateOpts_t extends Pointer { /** * Compression algorithm to use. 
Permitted values are: + * - 0: highest-throughput, entropy-only compression (use for symmetric compression/decompression performance) * - 1: high-throughput, low compression ratio (default) * - 2: medium-througput, medium compression ratio, beat Zlib level 1 on the compression ratio * - 3: placeholder for further compression level support, will fall into MEDIUM_COMPRESSION at this point diff --git a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedGzipOpts_t.java b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedGzipOpts_t.java new file mode 100644 index 00000000000..a23cf8e559d --- /dev/null +++ b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedGzipOpts_t.java @@ -0,0 +1,38 @@ +// Targeted by JavaCPP version 1.5.11-SNAPSHOT: DO NOT EDIT THIS FILE + +package org.bytedeco.cuda.nvcomp; + +import java.nio.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.javacpp.annotation.*; + +import static org.bytedeco.javacpp.presets.javacpp.*; +import org.bytedeco.cuda.cudart.*; +import static org.bytedeco.cuda.global.cudart.*; + +import static org.bytedeco.cuda.global.nvcomp.*; + + +/** + * Gzip compression options for the low-level API + */ +@Properties(inherit = org.bytedeco.cuda.presets.nvcomp.class) +public class nvcompBatchedGzipOpts_t extends Pointer { + static { Loader.load(); } + /** Default native constructor. */ + public nvcompBatchedGzipOpts_t() { super((Pointer)null); allocate(); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public nvcompBatchedGzipOpts_t(long size) { super((Pointer)null); allocateArray(size); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public nvcompBatchedGzipOpts_t(Pointer p) { super(p); } + private native void allocate(); + private native void allocateArray(long size); + @Override public nvcompBatchedGzipOpts_t position(long position) { + return (nvcompBatchedGzipOpts_t)super.position(position); + } + @Override public nvcompBatchedGzipOpts_t getPointer(long i) { + return new nvcompBatchedGzipOpts_t((Pointer)this).offsetAddress(i); + } + + public native int reserved(); public native nvcompBatchedGzipOpts_t reserved(int setter); +} diff --git a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedLZ4Opts_t.java b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedLZ4Opts_t.java index c723daf360e..b690121ecac 100644 --- a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedLZ4Opts_t.java +++ b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompBatchedLZ4Opts_t.java @@ -12,6 +12,11 @@ import static org.bytedeco.cuda.global.nvcomp.*; +// #endif + +/****************************************************************************** + * Batched compression/decompression interface for LZ4 + *****************************************************************************/ /** * LZ4 compression options for the low-level API diff --git a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompCascadedFormatOpts.java b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompCascadedFormatOpts.java deleted file mode 100644 index 36dd42a60bf..00000000000 --- a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompCascadedFormatOpts.java +++ /dev/null @@ -1,52 +0,0 @@ -// Targeted by JavaCPP version 1.5.11-SNAPSHOT: DO NOT EDIT THIS FILE - -package org.bytedeco.cuda.nvcomp; - -import java.nio.*; -import org.bytedeco.javacpp.*; -import org.bytedeco.javacpp.annotation.*; - -import static org.bytedeco.javacpp.presets.javacpp.*; -import org.bytedeco.cuda.cudart.*; -import static 
org.bytedeco.cuda.global.cudart.*; - -import static org.bytedeco.cuda.global.nvcomp.*; - -// #endif - -/** - * \brief Structure that stores the compression configuration - */ -@Properties(inherit = org.bytedeco.cuda.presets.nvcomp.class) -public class nvcompCascadedFormatOpts extends Pointer { - static { Loader.load(); } - /** Default native constructor. */ - public nvcompCascadedFormatOpts() { super((Pointer)null); allocate(); } - /** Native array allocator. Access with {@link Pointer#position(long)}. */ - public nvcompCascadedFormatOpts(long size) { super((Pointer)null); allocateArray(size); } - /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ - public nvcompCascadedFormatOpts(Pointer p) { super(p); } - private native void allocate(); - private native void allocateArray(long size); - @Override public nvcompCascadedFormatOpts position(long position) { - return (nvcompCascadedFormatOpts)super.position(position); - } - @Override public nvcompCascadedFormatOpts getPointer(long i) { - return new nvcompCascadedFormatOpts((Pointer)this).offsetAddress(i); - } - - /** - * \brief The number of Run Length Encodings to perform. - */ - public native int num_RLEs(); public native nvcompCascadedFormatOpts num_RLEs(int setter); - - /** - * \brief The number of Delta Encodings to perform. - */ - public native int num_deltas(); public native nvcompCascadedFormatOpts num_deltas(int setter); - - /** - * \brief Whether or not to bitpack the final layers. - */ - public native int use_bp(); public native nvcompCascadedFormatOpts use_bp(int setter); -} diff --git a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompLZ4FormatOpts.java b/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompLZ4FormatOpts.java deleted file mode 100644 index 7765b9848ab..00000000000 --- a/cuda/src/gen/java/org/bytedeco/cuda/nvcomp/nvcompLZ4FormatOpts.java +++ /dev/null @@ -1,49 +0,0 @@ -// Targeted by JavaCPP version 1.5.11-SNAPSHOT: DO NOT EDIT THIS FILE - -package org.bytedeco.cuda.nvcomp; - -import java.nio.*; -import org.bytedeco.javacpp.*; -import org.bytedeco.javacpp.annotation.*; - -import static org.bytedeco.javacpp.presets.javacpp.*; -import org.bytedeco.cuda.cudart.*; -import static org.bytedeco.cuda.global.cudart.*; - -import static org.bytedeco.cuda.global.nvcomp.*; - -// #endif - -/****************************************************************************** - * Batched compression/decompression interface for LZ4 - *****************************************************************************/ - -/** - * \brief Structure for configuring LZ4 compression. - */ -@Properties(inherit = org.bytedeco.cuda.presets.nvcomp.class) -public class nvcompLZ4FormatOpts extends Pointer { - static { Loader.load(); } - /** Default native constructor. */ - public nvcompLZ4FormatOpts() { super((Pointer)null); allocate(); } - /** Native array allocator. Access with {@link Pointer#position(long)}. */ - public nvcompLZ4FormatOpts(long size) { super((Pointer)null); allocateArray(size); } - /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ - public nvcompLZ4FormatOpts(Pointer p) { super(p); } - private native void allocate(); - private native void allocateArray(long size); - @Override public nvcompLZ4FormatOpts position(long position) { - return (nvcompLZ4FormatOpts)super.position(position); - } - @Override public nvcompLZ4FormatOpts getPointer(long i) { - return new nvcompLZ4FormatOpts((Pointer)this).offsetAddress(i); - } - - /** - * \brief The size of each chunk of data to decompress indepentently with - * LZ4. Must be within the range of [32768, 16777216]. Larger sizes will - * result in higher compression, but with decreased parallelism. The - * recommended size is 65536. - */ - public native @Cast("size_t") long chunk_size(); public native nvcompLZ4FormatOpts chunk_size(long setter); -} diff --git a/ffmpeg/src/gen/java/org/bytedeco/ffmpeg/global/postproc.java b/ffmpeg/src/gen/java/org/bytedeco/ffmpeg/global/postproc.java index 86aa7b289d5..26c73702a77 100644 --- a/ffmpeg/src/gen/java/org/bytedeco/ffmpeg/global/postproc.java +++ b/ffmpeg/src/gen/java/org/bytedeco/ffmpeg/global/postproc.java @@ -217,7 +217,7 @@ public class postproc extends org.bytedeco.ffmpeg.presets.postproc { // #include "version_major.h" -public static final int LIBPOSTPROC_VERSION_MINOR = 1; +public static final int LIBPOSTPROC_VERSION_MINOR = 3; public static final int LIBPOSTPROC_VERSION_MICRO = 100; // #define LIBPOSTPROC_VERSION_INT AV_VERSION_INT(LIBPOSTPROC_VERSION_MAJOR, diff --git a/gsl/README.md b/gsl/README.md index ecc9c9bcef9..e0b83472af5 100644 --- a/gsl/README.md +++ b/gsl/README.md @@ -53,7 +53,7 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/gsl/samples/pom.xml b/gsl/samples/pom.xml index 445b4e82444..f268edb249b 100644 --- a/gsl/samples/pom.xml +++ b/gsl/samples/pom.xml @@ -19,7 +19,7 @@ org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/leptonica/cppbuild.sh b/leptonica/cppbuild.sh index 8cedbc84657..c1e32fb0d65 100755 --- a/leptonica/cppbuild.sh +++ b/leptonica/cppbuild.sh @@ -71,7 +71,7 @@ cd .. export PATH=$INSTALL_PATH/bin:$PATH export PKG_CONFIG_PATH=$INSTALL_PATH/lib/pkgconfig/ -CMAKE_CONFIG="-DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=$INSTALL_PATH -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH -DCMAKE_INSTALL_LIBDIR=$INSTALL_PATH/lib -DBUILD_SHARED_LIBS=OFF -DENABLE_SHARED=FALSE -DPNG_SHARED=OFF" +CMAKE_CONFIG="-DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=$INSTALL_PATH -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH -DCMAKE_INSTALL_LIBDIR=$INSTALL_PATH/lib -DBUILD_SHARED_LIBS=OFF -DENABLE_SHARED=FALSE -DPNG_SHARED=OFF -DPNG_FRAMEWORK=OFF" WEBP_CONFIG="-DWEBP_BUILD_ANIM_UTILS=OFF -DWEBP_BUILD_CWEBP=OFF -DWEBP_BUILD_DWEBP=OFF -DWEBP_BUILD_EXTRAS=OFF -DWEBP_BUILD_GIF2WEBP=OFF -DWEBP_BUILD_IMG2WEBP=OFF -DWEBP_BUILD_VWEBP=OFF -DWEBP_BUILD_WEBPINFO=OFF -DWEBP_BUILD_WEBPMUX=OFF -DWEBP_BUILD_WEBP_JS=OFF" case $PLATFORM in diff --git a/llvm/README.md b/llvm/README.md index 217e42a15e8..6e03030df4c 100644 --- a/llvm/README.md +++ b/llvm/README.md @@ -9,7 +9,7 @@ Introduction ------------ This directory contains the JavaCPP Presets module for: - * LLVM 19.1.2 http://llvm.org/ + * LLVM 19.1.3 http://llvm.org/ Please refer to the parent README.md file for more detailed information about the JavaCPP Presets. 
@@ -50,7 +50,7 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic org.bytedeco llvm-platform - 19.1.2-1.5.11-SNAPSHOT + 19.1.3-1.5.11-SNAPSHOT diff --git a/llvm/cppbuild.sh b/llvm/cppbuild.sh index 653c2e85473..5fe69a69a9a 100755 --- a/llvm/cppbuild.sh +++ b/llvm/cppbuild.sh @@ -7,7 +7,7 @@ if [[ -z "$PLATFORM" ]]; then exit fi -LLVM_VERSION=19.1.2 +LLVM_VERSION=19.1.3 download https://github.com/llvm/llvm-project/releases/download/llvmorg-$LLVM_VERSION/llvm-project-$LLVM_VERSION.src.tar.xz llvm-project-$LLVM_VERSION.src.tar.xz mkdir -p $PLATFORM diff --git a/llvm/platform/pom.xml b/llvm/platform/pom.xml index f43565e4d7e..14072e30071 100644 --- a/llvm/platform/pom.xml +++ b/llvm/platform/pom.xml @@ -12,7 +12,7 @@ org.bytedeco llvm-platform - 19.1.2-${project.parent.version} + 19.1.3-${project.parent.version} JavaCPP Presets Platform for LLVM diff --git a/llvm/pom.xml b/llvm/pom.xml index 364f82456d3..f4775feac05 100644 --- a/llvm/pom.xml +++ b/llvm/pom.xml @@ -11,7 +11,7 @@ org.bytedeco llvm - 19.1.2-${project.parent.version} + 19.1.3-${project.parent.version} JavaCPP Presets for LLVM @@ -102,7 +102,6 @@ /link /FORCE:MULTIPLE - /WHOLEARCHIVE diff --git a/llvm/samples/clang/pom.xml b/llvm/samples/clang/pom.xml index 9da5c0aec10..07fc1f8fdff 100644 --- a/llvm/samples/clang/pom.xml +++ b/llvm/samples/clang/pom.xml @@ -12,7 +12,7 @@ org.bytedeco llvm-platform - 19.1.2-1.5.11-SNAPSHOT + 19.1.3-1.5.11-SNAPSHOT diff --git a/llvm/samples/llvm/pom.xml b/llvm/samples/llvm/pom.xml index 2785b5727e2..943058e4c6b 100644 --- a/llvm/samples/llvm/pom.xml +++ b/llvm/samples/llvm/pom.xml @@ -12,7 +12,7 @@ org.bytedeco llvm-platform - 19.1.2-1.5.11-SNAPSHOT + 19.1.3-1.5.11-SNAPSHOT org.bytedeco diff --git a/llvm/samples/polly/pom.xml b/llvm/samples/polly/pom.xml index 4b080d15433..b0b0ef821b7 100644 --- a/llvm/samples/polly/pom.xml +++ b/llvm/samples/polly/pom.xml @@ -13,7 +13,7 @@ org.bytedeco llvm-platform - 19.1.2-1.5.11-SNAPSHOT + 19.1.3-1.5.11-SNAPSHOT org.bytedeco @@ -23,12 +23,12 @@ org.bytedeco mkl-platform - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/llvm/src/gen/java/org/bytedeco/llvm/LLVM/LLVMOpaqueDbgRecord.java b/llvm/src/gen/java/org/bytedeco/llvm/LLVM/LLVMOpaqueDbgRecord.java deleted file mode 100644 index 731db67f6b6..00000000000 --- a/llvm/src/gen/java/org/bytedeco/llvm/LLVM/LLVMOpaqueDbgRecord.java +++ /dev/null @@ -1,23 +0,0 @@ -// Targeted by JavaCPP version 1.5.11-SNAPSHOT: DO NOT EDIT THIS FILE - -package org.bytedeco.llvm.LLVM; - -import java.nio.*; -import org.bytedeco.javacpp.*; -import org.bytedeco.javacpp.annotation.*; - -import static org.bytedeco.javacpp.presets.javacpp.*; - -import static org.bytedeco.llvm.global.LLVM.*; - - -/** - * @see llvm::DbgRecord - */ -@Opaque @Properties(inherit = org.bytedeco.llvm.presets.LLVM.class) -public class LLVMOpaqueDbgRecord extends Pointer { - /** Empty constructor. Calls {@code super((Pointer)null)}. */ - public LLVMOpaqueDbgRecord() { super((Pointer)null); } - /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ - public LLVMOpaqueDbgRecord(Pointer p) { super(p); } -} diff --git a/mkl/README.md b/mkl/README.md index dc0f0da4e29..bfbe6931a7c 100644 --- a/mkl/README.md +++ b/mkl/README.md @@ -9,7 +9,7 @@ Introduction ------------ This directory contains the JavaCPP Presets module for: - * MKL 2024.2.2 https://software.intel.com/mkl + * MKL 2025.0.0 https://software.intel.com/mkl Please refer to the parent README.md file for more detailed information about the JavaCPP Presets. @@ -48,14 +48,14 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic org.bytedeco mkl-platform - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/mkl/platform/pom.xml b/mkl/platform/pom.xml index ddcd5e42a49..2d65f60a51b 100644 --- a/mkl/platform/pom.xml +++ b/mkl/platform/pom.xml @@ -12,7 +12,7 @@ org.bytedeco mkl-platform - 2024.2-${project.parent.version} + 2025.0-${project.parent.version} JavaCPP Presets Platform for MKL diff --git a/mkl/platform/redist/pom.xml b/mkl/platform/redist/pom.xml index 27c834abced..922a371fbc8 100644 --- a/mkl/platform/redist/pom.xml +++ b/mkl/platform/redist/pom.xml @@ -12,7 +12,7 @@ org.bytedeco mkl-platform-redist - 2024.2-${project.parent.version} + 2025.0-${project.parent.version} JavaCPP Presets Platform Redist for MKL diff --git a/mkl/pom.xml b/mkl/pom.xml index 4eb85481757..4c658303c48 100644 --- a/mkl/pom.xml +++ b/mkl/pom.xml @@ -11,7 +11,7 @@ org.bytedeco mkl - 2024.2-${project.parent.version} + 2025.0-${project.parent.version} JavaCPP Presets for MKL diff --git a/mkl/samples/pom.xml b/mkl/samples/pom.xml index 4741e9b9647..0c3865c9c89 100644 --- a/mkl/samples/pom.xml +++ b/mkl/samples/pom.xml @@ -12,14 +12,14 @@ org.bytedeco mkl-platform - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/mkl/src/gen/java/org/bytedeco/mkl/global/mkl_rt.java b/mkl/src/gen/java/org/bytedeco/mkl/global/mkl_rt.java index 951ef2aca26..1280cac022f 100644 --- a/mkl/src/gen/java/org/bytedeco/mkl/global/mkl_rt.java +++ b/mkl/src/gen/java/org/bytedeco/mkl/global/mkl_rt.java @@ -83,13 +83,14 @@ public class mkl_rt extends org.bytedeco.mkl.presets.mkl_rt { // #ifndef _MKL_VERSION_H_ // #define _MKL_VERSION_H_ -public static final int __INTEL_MKL_BUILD_DATE = 20240823; +public static final int __INTEL_MKL_BUILD_DATE = 20241009; -public static final int __INTEL_MKL__ = 2024; +public static final int __INTEL_MKL__ = 2025; public static final int __INTEL_MKL_MINOR__ = 0; -public static final int __INTEL_MKL_UPDATE__ = 2; +public static final int __INTEL_MKL_UPDATE__ = 0; +public static final int __INTEL_MKL_PATCH__ = 0; -public static final int INTEL_MKL_VERSION = 20240002; +public static final int INTEL_MKL_VERSION = 20250000; // #endif @@ -97,7 +98,7 @@ public class mkl_rt extends org.bytedeco.mkl.presets.mkl_rt { // Parsed from mkl_types.h /******************************************************************************* -* Copyright 1999-2022 Intel Corporation. +* Copyright 1999 Intel Corporation. 
* * This software and the related documents are Intel copyrighted materials, and * your use of them is governed by the express license under which they were @@ -150,6 +151,7 @@ public static class MKLVersion extends Pointer { public native int MajorVersion(); public native MKLVersion MajorVersion(int setter); public native int MinorVersion(); public native MKLVersion MinorVersion(int setter); public native int UpdateVersion(); public native MKLVersion UpdateVersion(int setter); + public native int PatchVersion(); public native MKLVersion PatchVersion(int setter); public native @Cast("char*") BytePointer ProductStatus(); public native MKLVersion ProductStatus(BytePointer setter); public native @Cast("char*") BytePointer Build(); public native MKLVersion Build(BytePointer setter); public native @Cast("char*") BytePointer Processor(); public native MKLVersion Processor(BytePointer setter); @@ -2451,6 +2453,24 @@ public static native void cblas_strmm(@Cast("const CBLAS_LAYOUT") int Layout, @C @Cast("const CBLAS_DIAG") int Diag, int M, int N, float alpha, @Const float[] A, int lda, float[] B, int ldb); +public static native void cblas_strmm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + float alpha, @Const FloatPointer A, int lda, + @Const FloatPointer B, int ldb, float beta, + FloatPointer C, int ldc); +public static native void cblas_strmm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + float alpha, @Const FloatBuffer A, int lda, + @Const FloatBuffer B, int ldb, float beta, + FloatBuffer C, int ldc); +public static native void cblas_strmm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + float alpha, @Const float[] A, int lda, + @Const float[] B, int ldb, float beta, + float[] C, int ldc); public static native void cblas_strsm(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, @Cast("const CBLAS_DIAG") int Diag, int M, int N, @@ -2466,6 +2486,24 @@ public static native void cblas_strsm(@Cast("const CBLAS_LAYOUT") int Layout, @C @Cast("const CBLAS_DIAG") int Diag, int M, int N, float alpha, @Const float[] A, int lda, float[] B, int ldb); +public static native void cblas_strsm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + float alpha, @Const FloatPointer A, int lda, + @Const FloatPointer B, int ldb, float beta, + FloatPointer C, int ldc); +public static native void cblas_strsm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + float alpha, @Const FloatBuffer A, int lda, + @Const FloatBuffer B, int ldb, float beta, + FloatBuffer C, int ldc); +public static native void cblas_strsm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const 
CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + float alpha, @Const float[] A, int lda, + @Const float[] B, int ldb, float beta, + float[] C, int ldc); public static native void cblas_strsm_batch(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE*") IntPointer Side_Array, @Cast("const CBLAS_UPLO*") IntPointer Uplo_Array, @Cast("const CBLAS_TRANSPOSE*") IntPointer TransA_Array, @Cast("const CBLAS_DIAG*") IntPointer Diag_Array, @Const IntPointer M_Array, @@ -2686,6 +2724,24 @@ public static native void cblas_dtrmm(@Cast("const CBLAS_LAYOUT") int Layout, @C @Cast("const CBLAS_DIAG") int Diag, int M, int N, double alpha, @Const double[] A, int lda, double[] B, int ldb); +public static native void cblas_dtrmm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + double alpha, @Const DoublePointer A, int lda, + @Const DoublePointer B, int ldb, double beta, + DoublePointer C, int ldc); +public static native void cblas_dtrmm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + double alpha, @Const DoubleBuffer A, int lda, + @Const DoubleBuffer B, int ldb, double beta, + DoubleBuffer C, int ldc); +public static native void cblas_dtrmm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + double alpha, @Const double[] A, int lda, + @Const double[] B, int ldb, double beta, + double[] C, int ldc); public static native void cblas_dtrsm(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, @Cast("const CBLAS_DIAG") int Diag, int M, int N, @@ -2701,6 +2757,24 @@ public static native void cblas_dtrsm(@Cast("const CBLAS_LAYOUT") int Layout, @C @Cast("const CBLAS_DIAG") int Diag, int M, int N, double alpha, @Const double[] A, int lda, double[] B, int ldb); +public static native void cblas_dtrsm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + double alpha, @Const DoublePointer A, int lda, + @Const DoublePointer B, int ldb, double beta, + DoublePointer C, int ldc); +public static native void cblas_dtrsm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + double alpha, @Const DoubleBuffer A, int lda, + @Const DoubleBuffer B, int ldb, double beta, + DoubleBuffer C, int ldc); +public static native void cblas_dtrsm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + double alpha, @Const double[] A, int lda, + @Const double[] B, int ldb, double beta, + double[] C, int ldc); public static native void cblas_dtrsm_batch(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const 
CBLAS_SIDE*") IntPointer Side_Array, @Cast("const CBLAS_UPLO*") IntPointer Uplo_Array, @Cast("const CBLAS_TRANSPOSE*") IntPointer Transa_Array, @Cast("const CBLAS_DIAG*") IntPointer Diag_Array, @Const IntPointer M_Array, @@ -2873,11 +2947,23 @@ public static native void cblas_ctrmm(@Cast("const CBLAS_LAYOUT") int Layout, @C @Cast("const CBLAS_DIAG") int Diag, int M, int N, @Const Pointer alpha, @Const Pointer A, int lda, Pointer B, int ldb); +public static native void cblas_ctrmm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + @Const Pointer alpha, @Const Pointer A, int lda, + @Const Pointer B, int ldb, @Const Pointer beta, + Pointer C, int ldc); public static native void cblas_ctrsm(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, @Cast("const CBLAS_DIAG") int Diag, int M, int N, @Const Pointer alpha, @Const Pointer A, int lda, Pointer B, int ldb); +public static native void cblas_ctrsm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + @Const Pointer alpha, @Const Pointer A, int lda, + @Const Pointer B, int ldb, @Const Pointer beta, + Pointer C, int ldc); public static native void cblas_ctrsm_batch(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE*") IntPointer Side_Array, @Cast("const CBLAS_UPLO*") IntPointer Uplo_Array, @Cast("const CBLAS_TRANSPOSE*") IntPointer Transa_Array, @Cast("const CBLAS_DIAG*") IntPointer Diag_Array, @Const IntPointer M_Array, @@ -3036,11 +3122,23 @@ public static native void cblas_ztrmm(@Cast("const CBLAS_LAYOUT") int Layout, @C @Cast("const CBLAS_DIAG") int Diag, int M, int N, @Const Pointer alpha, @Const Pointer A, int lda, Pointer B, int ldb); +public static native void cblas_ztrmm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + @Const Pointer alpha, @Const Pointer A, int lda, + @Const Pointer B, int ldb, @Const Pointer beta, + Pointer C, int ldc); public static native void cblas_ztrsm(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, @Cast("const CBLAS_DIAG") int Diag, int M, int N, @Const Pointer alpha, @Const Pointer A, int lda, Pointer B, int ldb); +public static native void cblas_ztrsm_oop(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE") int Side, + @Cast("const CBLAS_UPLO") int Uplo, @Cast("const CBLAS_TRANSPOSE") int TransA, + @Cast("const CBLAS_DIAG") int Diag, int M, int N, + @Const Pointer alpha, @Const Pointer A, int lda, + @Const Pointer B, int ldb, @Const Pointer beta, + Pointer C, int ldc); public static native void cblas_ztrsm_batch(@Cast("const CBLAS_LAYOUT") int Layout, @Cast("const CBLAS_SIDE*") IntPointer Side_Array, @Cast("const CBLAS_UPLO*") IntPointer Uplo_Array, @Cast("const CBLAS_TRANSPOSE*") IntPointer Transa_Array, @Cast("const CBLAS_DIAG*") IntPointer Diag_Array, @Const IntPointer M_Array, @@ -233226,7 +233324,6 @@ public static class sparse_struct extends Pointer { /* Single Dynamic library threading */ 
public static final int MKL_THREADING_INTEL = 0; public static final int MKL_THREADING_SEQUENTIAL = 1; -public static final int MKL_THREADING_PGI = 2; public static final int MKL_THREADING_GNU = 3; public static final int MKL_THREADING_TBB = 4; public static native int MKL_Set_Threading_Layer(int code); @@ -233678,6 +233775,8 @@ public static class USRFCNXS extends FunctionPointer { // #ifndef __MKL_VML_DEFINES_H__ // #define __MKL_VML_DEFINES_H__ +// #include + // #ifdef __cplusplus // #endif /* __cplusplus */ @@ -233709,9 +233808,12 @@ public static class USRFCNXS extends FunctionPointer { public static final int VML_HA = 0x00000002; public static final int VML_EP = 0x00000003; -public static final int VML_LA_64 = 0x0000000000000001; -public static final int VML_HA_64 = 0x0000000000000002; -public static final int VML_EP_64 = 0x0000000000000003; +public static native @MemberGetter int VML_LA_64(); +public static final int VML_LA_64 = VML_LA_64(); +public static native @MemberGetter int VML_HA_64(); +public static final int VML_HA_64 = VML_HA_64(); +public static native @MemberGetter int VML_EP_64(); +public static final int VML_EP_64 = VML_EP_64(); /* // SETTING OPTIMAL FLOATING-POINT PRECISION AND ROUNDING MODE @@ -233739,10 +233841,14 @@ public static class USRFCNXS extends FunctionPointer { public static final int VML_DOUBLE_CONSISTENT = 0x00000020; public static final int VML_RESTORE = 0x00000030; -public static final int VML_DEFAULT_PRECISION_64 = 0x0000000000000000; -public static final int VML_FLOAT_CONSISTENT_64 = 0x0000000000000010; -public static final int VML_DOUBLE_CONSISTENT_64 = 0x0000000000000020; -public static final int VML_RESTORE_64 = 0x0000000000000030; +public static native @MemberGetter int VML_DEFAULT_PRECISION_64(); +public static final int VML_DEFAULT_PRECISION_64 = VML_DEFAULT_PRECISION_64(); +public static native @MemberGetter int VML_FLOAT_CONSISTENT_64(); +public static final int VML_FLOAT_CONSISTENT_64 = VML_FLOAT_CONSISTENT_64(); +public static native @MemberGetter int VML_DOUBLE_CONSISTENT_64(); +public static final int VML_DOUBLE_CONSISTENT_64 = VML_DOUBLE_CONSISTENT_64(); +public static native @MemberGetter int VML_RESTORE_64(); +public static final int VML_RESTORE_64 = VML_RESTORE_64(); /* // VML ERROR HANDLING CONTROL @@ -233771,14 +233877,20 @@ public static class USRFCNXS extends FunctionPointer { public static final int VML_ERRMODE_DEFAULT = VML_ERRMODE_ERRNO | VML_ERRMODE_CALLBACK | VML_ERRMODE_EXCEPT; -public static final int VML_ERRMODE_IGNORE_64 = 0x0000000000000100; -public static final int VML_ERRMODE_ERRNO_64 = 0x0000000000000200; -public static final int VML_ERRMODE_STDERR_64 = 0x0000000000000400; -public static final int VML_ERRMODE_EXCEPT_64 = 0x0000000000000800; -public static final int VML_ERRMODE_CALLBACK_64 = 0x0000000000001000; -public static final int VML_ERRMODE_NOERR_64 = 0x0000000000002000; -public static final int VML_ERRMODE_DEFAULT_64 = -VML_ERRMODE_ERRNO_64 | VML_ERRMODE_CALLBACK_64 | VML_ERRMODE_EXCEPT_64; +public static native @MemberGetter int VML_ERRMODE_IGNORE_64(); +public static final int VML_ERRMODE_IGNORE_64 = VML_ERRMODE_IGNORE_64(); +public static native @MemberGetter int VML_ERRMODE_ERRNO_64(); +public static final int VML_ERRMODE_ERRNO_64 = VML_ERRMODE_ERRNO_64(); +public static native @MemberGetter int VML_ERRMODE_STDERR_64(); +public static final int VML_ERRMODE_STDERR_64 = VML_ERRMODE_STDERR_64(); +public static native @MemberGetter int VML_ERRMODE_EXCEPT_64(); +public static final int VML_ERRMODE_EXCEPT_64 = 
VML_ERRMODE_EXCEPT_64(); +public static native @MemberGetter int VML_ERRMODE_CALLBACK_64(); +public static final int VML_ERRMODE_CALLBACK_64 = VML_ERRMODE_CALLBACK_64(); +public static native @MemberGetter int VML_ERRMODE_NOERR_64(); +public static final int VML_ERRMODE_NOERR_64 = VML_ERRMODE_NOERR_64(); +public static native @MemberGetter int VML_ERRMODE_DEFAULT_64(); +public static final int VML_ERRMODE_DEFAULT_64 = VML_ERRMODE_DEFAULT_64(); /* // OpenMP(R) number of threads mode macros @@ -233792,8 +233904,10 @@ public static class USRFCNXS extends FunctionPointer { public static final int VML_NUM_THREADS_OMP_AUTO = 0x00000000; public static final int VML_NUM_THREADS_OMP_FIXED = 0x00010000; -public static final int VML_NUM_THREADS_OMP_AUTO_64 = 0x0000000000000000; -public static final int VML_NUM_THREADS_OMP_FIXED_64 = 0x0000000000010000; +public static native @MemberGetter int VML_NUM_THREADS_OMP_AUTO_64(); +public static final int VML_NUM_THREADS_OMP_AUTO_64 = VML_NUM_THREADS_OMP_AUTO_64(); +public static native @MemberGetter int VML_NUM_THREADS_OMP_FIXED_64(); +public static final int VML_NUM_THREADS_OMP_FIXED_64 = VML_NUM_THREADS_OMP_FIXED_64(); /* // TBB partitioner control macros @@ -233810,9 +233924,12 @@ public static class USRFCNXS extends FunctionPointer { public static final int VML_TBB_PARTITIONER_STATIC = 0x00010000; public static final int VML_TBB_PARTITIONER_SIMPLE = 0x00020000; -public static final int VML_TBB_PARTITIONER_AUTO_64 = 0x0000000000000000; -public static final int VML_TBB_PARTITIONER_STATIC_64 = 0x0000000000010000; -public static final int VML_TBB_PARTITIONER_SIMPLE_64 = 0x0000000000020000; +public static native @MemberGetter int VML_TBB_PARTITIONER_AUTO_64(); +public static final int VML_TBB_PARTITIONER_AUTO_64 = VML_TBB_PARTITIONER_AUTO_64(); +public static native @MemberGetter int VML_TBB_PARTITIONER_STATIC_64(); +public static final int VML_TBB_PARTITIONER_STATIC_64 = VML_TBB_PARTITIONER_STATIC_64(); +public static native @MemberGetter int VML_TBB_PARTITIONER_SIMPLE_64(); +public static final int VML_TBB_PARTITIONER_SIMPLE_64 = VML_TBB_PARTITIONER_SIMPLE_64(); /* // FTZ & DAZ mode macros @@ -233828,9 +233945,12 @@ public static class USRFCNXS extends FunctionPointer { public static final int VML_FTZDAZ_OFF = 0x00140000; public static final int VML_FTZDAZ_CURRENT = 0x00000000; -public static final int VML_FTZDAZ_ON_64 = 0x0000000000280000; -public static final int VML_FTZDAZ_OFF_64 = 0x0000000000140000; -public static final int VML_FTZDAZ_CURRENT_64 = 0x0000000000000000; +public static native @MemberGetter int VML_FTZDAZ_ON_64(); +public static final int VML_FTZDAZ_ON_64 = VML_FTZDAZ_ON_64(); +public static native @MemberGetter int VML_FTZDAZ_OFF_64(); +public static final int VML_FTZDAZ_OFF_64 = VML_FTZDAZ_OFF_64(); +public static native @MemberGetter int VML_FTZDAZ_CURRENT_64(); +public static final int VML_FTZDAZ_CURRENT_64 = VML_FTZDAZ_CURRENT_64(); /* // Exception trap macros @@ -233844,10 +233964,14 @@ public static class USRFCNXS extends FunctionPointer { public static final int VML_TRAP_OVERFLOW = 0x04000000; public static final int VML_TRAP_UNDERFLOW = 0x08000000; -public static final int VML_TRAP_INVALID_64 = 0x0000000001000000; -public static final int VML_TRAP_DIVBYZERO_64 = 0x0000000002000000; -public static final int VML_TRAP_OVERFLOW_64 = 0x0000000004000000; -public static final int VML_TRAP_UNDERFLOW_64 = 0x0000000008000000; +public static native @MemberGetter int VML_TRAP_INVALID_64(); +public static final int VML_TRAP_INVALID_64 = 
VML_TRAP_INVALID_64(); +public static native @MemberGetter int VML_TRAP_DIVBYZERO_64(); +public static final int VML_TRAP_DIVBYZERO_64 = VML_TRAP_DIVBYZERO_64(); +public static native @MemberGetter int VML_TRAP_OVERFLOW_64(); +public static final int VML_TRAP_OVERFLOW_64 = VML_TRAP_OVERFLOW_64(); +public static native @MemberGetter int VML_TRAP_UNDERFLOW_64(); +public static final int VML_TRAP_UNDERFLOW_64 = VML_TRAP_UNDERFLOW_64(); /* // ACCURACY, FLOATING-POINT CONTROL, FTZDAZ AND ERROR HANDLING MASKS @@ -233877,15 +234001,24 @@ public static class USRFCNXS extends FunctionPointer { public static final int VML_FTZDAZ_MASK = 0x003C0000; public static final int VML_TRAP_EXCEPTIONS_MASK = 0x0F000000; -public static final int VML_ACCURACY_MASK_64 = 0x000000000000000F; -public static final int VML_FPUMODE_MASK_64 = 0x00000000000000F0; -public static final int VML_ERRMODE_MASK_64 = 0x000000000000FF00; -public static final int VML_ERRMODE_STDHANDLER_MASK_64 = 0x0000000000002F00; -public static final int VML_ERRMODE_CALLBACK_MASK_64 = 0x0000000000001000; -public static final int VML_NUM_THREADS_OMP_MASK_64 = 0x0000000000030000; -public static final int VML_TBB_PARTITIONER_MASK_64 = 0x0000000000030000; -public static final int VML_FTZDAZ_MASK_64 = 0x00000000003C0000; -public static final int VML_TRAP_EXCEPTIONS_MASK_64 = 0x000000000F000000; +public static native @MemberGetter int VML_ACCURACY_MASK_64(); +public static final int VML_ACCURACY_MASK_64 = VML_ACCURACY_MASK_64(); +public static native @MemberGetter int VML_FPUMODE_MASK_64(); +public static final int VML_FPUMODE_MASK_64 = VML_FPUMODE_MASK_64(); +public static native @MemberGetter int VML_ERRMODE_MASK_64(); +public static final int VML_ERRMODE_MASK_64 = VML_ERRMODE_MASK_64(); +public static native @MemberGetter int VML_ERRMODE_STDHANDLER_MASK_64(); +public static final int VML_ERRMODE_STDHANDLER_MASK_64 = VML_ERRMODE_STDHANDLER_MASK_64(); +public static native @MemberGetter int VML_ERRMODE_CALLBACK_MASK_64(); +public static final int VML_ERRMODE_CALLBACK_MASK_64 = VML_ERRMODE_CALLBACK_MASK_64(); +public static native @MemberGetter int VML_NUM_THREADS_OMP_MASK_64(); +public static final int VML_NUM_THREADS_OMP_MASK_64 = VML_NUM_THREADS_OMP_MASK_64(); +public static native @MemberGetter int VML_TBB_PARTITIONER_MASK_64(); +public static final int VML_TBB_PARTITIONER_MASK_64 = VML_TBB_PARTITIONER_MASK_64(); +public static native @MemberGetter int VML_FTZDAZ_MASK_64(); +public static final int VML_FTZDAZ_MASK_64 = VML_FTZDAZ_MASK_64(); +public static native @MemberGetter int VML_TRAP_EXCEPTIONS_MASK_64(); +public static final int VML_TRAP_EXCEPTIONS_MASK_64 = VML_TRAP_EXCEPTIONS_MASK_64(); /* // ERROR STATUS MACROS diff --git a/numpy/README.md b/numpy/README.md index 467262dcd8e..ebeb065cfc3 100644 --- a/numpy/README.md +++ b/numpy/README.md @@ -55,7 +55,7 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/numpy/samples/pom.xml b/numpy/samples/pom.xml index 1d6e4031cef..bbc76018ee1 100644 --- a/numpy/samples/pom.xml +++ b/numpy/samples/pom.xml @@ -19,7 +19,7 @@ org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/opencv/README.md b/opencv/README.md index 5c17dba09ec..6acc688ef52 100644 --- a/opencv/README.md +++ b/opencv/README.md @@ -70,7 +70,7 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic org.bytedeco mkl-platform-redist - 
2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/opencv/samples/pom.xml b/opencv/samples/pom.xml index 587f109d2cf..66083df6add 100644 --- a/opencv/samples/pom.xml +++ b/opencv/samples/pom.xml @@ -33,7 +33,7 @@ org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/platform/pom.xml b/platform/pom.xml index 830ba19b126..80fcbe436f7 100644 --- a/platform/pom.xml +++ b/platform/pom.xml @@ -172,7 +172,7 @@ org.bytedeco mkl-platform - 2024.2-${project.version} + 2025.0-${project.version} @@ -232,7 +232,7 @@ org.bytedeco llvm-platform - 19.1.2-${project.version} + 19.1.3-${project.version} org.bytedeco @@ -292,7 +292,7 @@ org.bytedeco pytorch-platform - 2.5.0-${project.version} + 2.5.1-${project.version} org.bytedeco @@ -317,7 +317,7 @@ org.bytedeco tritonserver-platform - 2.50.0-${project.version} + 2.51.0-${project.version} diff --git a/pytorch/README.md b/pytorch/README.md index 5ecaba34d7b..6147925d113 100644 --- a/pytorch/README.md +++ b/pytorch/README.md @@ -9,7 +9,7 @@ Introduction ------------ This directory contains the JavaCPP Presets module for: - * PyTorch 2.5.0 https://pytorch.org/ + * PyTorch 2.5.1 https://pytorch.org/ Please refer to the parent README.md file for more detailed information about the JavaCPP Presets. @@ -48,14 +48,14 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic org.bytedeco pytorch-platform - 2.5.0-1.5.11-SNAPSHOT + 2.5.1-1.5.11-SNAPSHOT org.bytedeco pytorch-platform-gpu - 2.5.0-1.5.11-SNAPSHOT + 2.5.1-1.5.11-SNAPSHOT @@ -69,7 +69,7 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/pytorch/cppbuild.sh b/pytorch/cppbuild.sh index 7434bfa9eeb..ab299ca92df 100755 --- a/pytorch/cppbuild.sh +++ b/pytorch/cppbuild.sh @@ -38,7 +38,7 @@ if [[ $PLATFORM == windows* ]]; then export PYTHON_BIN_PATH=$(which python.exe) fi -PYTORCH_VERSION=2.5.0 +PYTORCH_VERSION=2.5.1 export PYTORCH_BUILD_VERSION="$PYTORCH_VERSION" export PYTORCH_BUILD_NUMBER=1 diff --git a/pytorch/platform/gpu/pom.xml b/pytorch/platform/gpu/pom.xml index 0d8bd29ae81..ed09efb6122 100644 --- a/pytorch/platform/gpu/pom.xml +++ b/pytorch/platform/gpu/pom.xml @@ -12,7 +12,7 @@ org.bytedeco pytorch-platform-gpu - 2.5.0-${project.parent.version} + 2.5.1-${project.parent.version} JavaCPP Presets Platform GPU for PyTorch diff --git a/pytorch/platform/pom.xml b/pytorch/platform/pom.xml index 4ff46efa3fd..1aa3be3cc72 100644 --- a/pytorch/platform/pom.xml +++ b/pytorch/platform/pom.xml @@ -12,7 +12,7 @@ org.bytedeco pytorch-platform - 2.5.0-${project.parent.version} + 2.5.1-${project.parent.version} JavaCPP Presets Platform for PyTorch diff --git a/pytorch/pom.xml b/pytorch/pom.xml index 575b759fa79..9a8baf0c921 100644 --- a/pytorch/pom.xml +++ b/pytorch/pom.xml @@ -11,7 +11,7 @@ org.bytedeco pytorch - 2.5.0-${project.parent.version} + 2.5.1-${project.parent.version} JavaCPP Presets for PyTorch diff --git a/pytorch/samples/pom.xml b/pytorch/samples/pom.xml index 97d4fb103ac..8a1fc21f8b2 100644 --- a/pytorch/samples/pom.xml +++ b/pytorch/samples/pom.xml @@ -12,14 +12,14 @@ org.bytedeco pytorch-platform - 2.5.0-1.5.11-SNAPSHOT + 2.5.1-1.5.11-SNAPSHOT org.bytedeco pytorch-platform-gpu - 2.5.0-1.5.11-SNAPSHOT + 2.5.1-1.5.11-SNAPSHOT @@ -33,7 +33,7 @@ org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/scipy/README.md b/scipy/README.md index 67f4961b97e..73afe1937fb 
100644 --- a/scipy/README.md +++ b/scipy/README.md @@ -55,7 +55,7 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/scipy/samples/pom.xml b/scipy/samples/pom.xml index bec4cdfad9d..8a80ba6b478 100644 --- a/scipy/samples/pom.xml +++ b/scipy/samples/pom.xml @@ -19,7 +19,7 @@ org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/tritonserver/README.md b/tritonserver/README.md index 3c88de03088..6fa8505c6f7 100644 --- a/tritonserver/README.md +++ b/tritonserver/README.md @@ -23,7 +23,7 @@ Introduction ------------ This directory contains the JavaCPP Presets module for: - * Triton Inference Server 2.50.0 https://github.com/triton-inference-server/server + * Triton Inference Server 2.51.0 https://github.com/triton-inference-server/server Please refer to the parent README.md file for more detailed information about the JavaCPP Presets. @@ -51,9 +51,9 @@ This sample intends to show how to call the Java-mapped C API of Triton to execu 1. Get the source code of Triton Inference Server to prepare the model repository: ```bash - $ wget https://github.com/triton-inference-server/server/archive/refs/tags/v2.50.0.tar.gz - $ tar zxvf v2.50.0.tar.gz - $ cd server-2.50.0/docs/examples/model_repository + $ wget https://github.com/triton-inference-server/server/archive/refs/tags/v2.51.0.tar.gz + $ tar zxvf v2.51.0.tar.gz + $ cd server-2.51.0/docs/examples/model_repository $ mkdir models $ cd models; cp -a ../simple . ``` @@ -61,7 +61,7 @@ Now, this `models` directory will be our model repository. 2. Start the Docker container to run the sample (assuming we are under the `models` directory created above): ```bash - $ docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/tritonserver:24.09-py3 bash + $ docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/tritonserver:24.10-py3 bash $ apt update $ apt install -y openjdk-11-jdk $ wget https://archive.apache.org/dist/maven/maven-3/3.8.4/binaries/apache-maven-3.8.4-bin.tar.gz diff --git a/tritonserver/cppbuild.sh b/tritonserver/cppbuild.sh index 611be9089e6..4c8f52ae41f 100755 --- a/tritonserver/cppbuild.sh +++ b/tritonserver/cppbuild.sh @@ -11,9 +11,9 @@ INCLUDE_DEVELOPER_TOOLS_SERVER=${INCLUDE_DEVELOPER_TOOLS_SERVER:=1} if [[ ! -f "/opt/tritonserver/include/triton/developer_tools/generic_server_wrapper.h" ]] && [[ ! 
-f "/opt/tritonserver/lib/libtritondevelopertoolsserver.so" ]] && [[ ${INCLUDE_DEVELOPER_TOOLS_SERVER} -ne 0 ]]; then TOOLS_BRANCH=${TOOLS_BRANCH:="https://github.com/triton-inference-server/developer_tools.git"} - TOOLS_BRANCH_TAG=${TOOLS_BRANCH_TAG:="r24.09"} + TOOLS_BRANCH_TAG=${TOOLS_BRANCH_TAG:="r24.10"} TRITON_CORE_REPO=${TRITON_CORE_REPO:="https://github.com/triton-inference-server/core.git"} - TRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG="r24.09"} + TRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG="r24.10"} TRITON_HOME="/opt/tritonserver" BUILD_HOME="$PWD"/tritonbuild mkdir -p ${BUILD_HOME} && cd ${BUILD_HOME} diff --git a/tritonserver/platform/pom.xml b/tritonserver/platform/pom.xml index 82851eac3d8..82d85d45c6c 100644 --- a/tritonserver/platform/pom.xml +++ b/tritonserver/platform/pom.xml @@ -12,7 +12,7 @@ org.bytedeco tritonserver-platform - 2.50.0-${project.parent.version} + 2.51.0-${project.parent.version} JavaCPP Presets Platform for Triton Inference Server diff --git a/tritonserver/platform/redist/pom.xml b/tritonserver/platform/redist/pom.xml index 638848272ad..8f45eb75249 100644 --- a/tritonserver/platform/redist/pom.xml +++ b/tritonserver/platform/redist/pom.xml @@ -12,7 +12,7 @@ org.bytedeco tritonserver-platform-redist - 2.50.0-${project.parent.version} + 2.51.0-${project.parent.version} JavaCPP Presets Platform Redist for Triton Inference Server diff --git a/tritonserver/pom.xml b/tritonserver/pom.xml index 5c6b326cf94..ba6f246af0f 100644 --- a/tritonserver/pom.xml +++ b/tritonserver/pom.xml @@ -11,7 +11,7 @@ org.bytedeco tritonserver - 2.50.0-${project.parent.version} + 2.51.0-${project.parent.version} JavaCPP Presets for Triton Inference Server diff --git a/tritonserver/samples/simple/pom.xml b/tritonserver/samples/simple/pom.xml index 817cab07f5a..9d71732ad40 100644 --- a/tritonserver/samples/simple/pom.xml +++ b/tritonserver/samples/simple/pom.xml @@ -12,7 +12,7 @@ org.bytedeco tritonserver-platform - 2.50.0-1.5.11-SNAPSHOT + 2.51.0-1.5.11-SNAPSHOT shaded diff --git a/tritonserver/samples/simplecpp/pom.xml b/tritonserver/samples/simplecpp/pom.xml index fa631c00b08..a790be743b0 100644 --- a/tritonserver/samples/simplecpp/pom.xml +++ b/tritonserver/samples/simplecpp/pom.xml @@ -12,7 +12,7 @@ org.bytedeco tritonserver-platform - 2.50.0-1.5.11-SNAPSHOT + 2.51.0-1.5.11-SNAPSHOT shaded diff --git a/tritonserver/samples/unsupported/pom.xml b/tritonserver/samples/unsupported/pom.xml index b3a891ddcdc..b4b253724c5 100644 --- a/tritonserver/samples/unsupported/pom.xml +++ b/tritonserver/samples/unsupported/pom.xml @@ -23,7 +23,7 @@ org.bytedeco tritonserver-platform - 2.50.0-1.5.11-SNAPSHOT + 2.51.0-1.5.11-SNAPSHOT shaded diff --git a/tvm/README.md b/tvm/README.md index 2ae70e56309..04355eb772c 100644 --- a/tvm/README.md +++ b/tvm/README.md @@ -70,7 +70,7 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT diff --git a/tvm/cppbuild.sh b/tvm/cppbuild.sh index 3548a565e33..6dfaba124af 100755 --- a/tvm/cppbuild.sh +++ b/tvm/cppbuild.sh @@ -202,7 +202,7 @@ cp -a 3rdparty/dlpack/include/dlpack 3rdparty/dmlc-core/include/dmlc ../include # Adjust the directory structure a bit to facilitate packaging in JAR file mkdir -p ../python -export MODULES=(attr cloudpickle decorator ml_dtypes psutil synr typed_ast tornado typing_extensions tvm) +export MODULES=(attr cloudpickle decorator ml_dtypes packaging psutil synr typed_ast tornado typing_extensions tvm) for MODULE in 
${MODULES[@]}; do mkdir -p ../python/$MODULE.egg-info cp -r $PYTHON_INSTALL_PATH/$MODULE*/$MODULE* ../python/ || true diff --git a/tvm/platform/gpu/pom.xml b/tvm/platform/gpu/pom.xml index c811107d28b..6a803d3b8e4 100644 --- a/tvm/platform/gpu/pom.xml +++ b/tvm/platform/gpu/pom.xml @@ -29,7 +29,7 @@ org.bytedeco llvm-platform - 19.1.2-${project.parent.version} + 19.1.3-${project.parent.version} org.bytedeco diff --git a/tvm/platform/pom.xml b/tvm/platform/pom.xml index 8b3da48192b..f215f56bd77 100644 --- a/tvm/platform/pom.xml +++ b/tvm/platform/pom.xml @@ -28,7 +28,7 @@ org.bytedeco llvm-platform - 19.1.2-${project.parent.version} + 19.1.3-${project.parent.version} org.bytedeco diff --git a/tvm/pom.xml b/tvm/pom.xml index fb3e5ba4f24..dc418aac99d 100644 --- a/tvm/pom.xml +++ b/tvm/pom.xml @@ -23,7 +23,7 @@ org.bytedeco llvm - 19.1.2-${project.parent.version} + 19.1.3-${project.parent.version} org.bytedeco @@ -61,7 +61,7 @@ org.bytedeco llvm-platform - 19.1.2-${project.parent.version} + 19.1.3-${project.parent.version} org.bytedeco diff --git a/tvm/samples/pom.xml b/tvm/samples/pom.xml index 1299336d265..9c59499ccfd 100644 --- a/tvm/samples/pom.xml +++ b/tvm/samples/pom.xml @@ -33,7 +33,7 @@ org.bytedeco mkl-platform-redist - 2024.2-1.5.11-SNAPSHOT + 2025.0-1.5.11-SNAPSHOT
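MKL 2025.0 also adds a PatchVersion field to the MKLVersion struct mapped earlier in this patch. A minimal sketch of reading it through the bindings, assuming MKL_Get_Version stays mapped in mkl_rt as in earlier releases:

```java
import org.bytedeco.mkl.global.mkl_rt.MKLVersion;
import static org.bytedeco.mkl.global.mkl_rt.*;

public class MklVersionSketch {
    public static void main(String[] args) {
        MKLVersion v = new MKLVersion();
        MKL_Get_Version(v); // assumed still mapped as in earlier presets
        System.out.printf("MKL %d.%d update %d patch %d%n",
                v.MajorVersion(), v.MinorVersion(), v.UpdateVersion(), v.PatchVersion());
    }
}
```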