From e4f28df214f15513bdafeb5a6bbc3bde115dc63b Mon Sep 17 00:00:00 2001 From: Aman Bhansali <92033532+aman-095@users.noreply.github.com> Date: Tue, 24 Sep 2024 08:59:26 +0530 Subject: [PATCH] feat: add C `ndarray` implementation for `blas/base/ddot` PR-URL: https://github.com/stdlib-js/stdlib/pull/2936 Ref: https://github.com/stdlib-js/stdlib/issues/2039 Reviewed-by: Athan Reines --- .../@stdlib/blas/base/ddot/README.md | 32 ++++++++ .../base/ddot/benchmark/c/benchmark.length.c | 46 ++++++++++- .../blas/base/ddot/examples/c/example.c | 6 ++ .../base/ddot/include/stdlib/blas/base/ddot.h | 5 ++ .../blas/base/ddot/lib/ndarray.native.js | 15 +--- .../@stdlib/blas/base/ddot/manifest.json | 69 +++++++++++----- .../@stdlib/blas/base/ddot/src/addon.c | 22 +++++- .../@stdlib/blas/base/ddot/src/ddot.c | 48 +----------- .../@stdlib/blas/base/ddot/src/ddot_cblas.c | 25 +++++- .../@stdlib/blas/base/ddot/src/ddot_f.c | 24 +++++- .../@stdlib/blas/base/ddot/src/ddot_ndarray.c | 78 +++++++++++++++++++ 11 files changed, 285 insertions(+), 85 deletions(-) create mode 100644 lib/node_modules/@stdlib/blas/base/ddot/src/ddot_ndarray.c diff --git a/lib/node_modules/@stdlib/blas/base/ddot/README.md b/lib/node_modules/@stdlib/blas/base/ddot/README.md index badb6a89ca6..90b7023547a 100644 --- a/lib/node_modules/@stdlib/blas/base/ddot/README.md +++ b/lib/node_modules/@stdlib/blas/base/ddot/README.md @@ -227,6 +227,32 @@ The function accepts the following arguments: double c_ddot( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const double *Y, const CBLAS_INT strideY ); ``` +#### c_ddot_ndarray( N, \*X, strideX, offsetX, \*Y, strideY, offsetY ) + +Computes the dot product of two double-precision floating-point vectors using alternative indexing semantics. 
+ +```c +const double x[] = { 4.0, 2.0, -3.0, 5.0, -1.0 }; +const double y[] = { 2.0, 6.0, -1.0, -4.0, 8.0 }; + +double v = c_ddot_ndarray( 5, x, -1, 4, y, -1, 4 ); +// returns -5.0 +``` + +The function accepts the following arguments: + +- **N**: `[in] CBLAS_INT` number of indexed elements. +- **X**: `[in] double*` first input array. +- **strideX**: `[in] CBLAS_INT` index increment for `X`. +- **offsetX**: `[in] CBLAS_INT` starting index for `X`. +- **Y**: `[in] double*` second input array. +- **strideY**: `[in] CBLAS_INT` index increment for `Y`. +- **offsetY**: `[in] CBLAS_INT` starting index for `Y`. + +```c +double c_ddot_ndarray( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const CBLAS_INT offsetX, const double *Y, const CBLAS_INT strideY, const CBLAS_INT offsetY ); +``` + @@ -266,6 +292,12 @@ int main( void ) { // Print the result: printf( "dot product: %lf\n", d ); + + // Compute the dot product: + d = c_ddot_ndarray( N, x, strideX, 0, y, strideY, N-1 ); + + // Print the result: + printf( "dot product: %lf\n", d ); } ``` diff --git a/lib/node_modules/@stdlib/blas/base/ddot/benchmark/c/benchmark.length.c b/lib/node_modules/@stdlib/blas/base/ddot/benchmark/c/benchmark.length.c index 91cbd2012cc..362d7489ecc 100644 --- a/lib/node_modules/@stdlib/blas/base/ddot/benchmark/c/benchmark.length.c +++ b/lib/node_modules/@stdlib/blas/base/ddot/benchmark/c/benchmark.length.c @@ -94,7 +94,7 @@ static double rand_double( void ) { * @param len array length * @return elapsed time in seconds */ -static double benchmark( int iterations, int len ) { +static double benchmark1( int iterations, int len ) { double elapsed; double x[ len ]; double y[ len ]; @@ -122,6 +122,41 @@ static double benchmark( int iterations, int len ) { return elapsed; } +/** +* Runs a benchmark. 
+* +* @param iterations number of iterations +* @param len array length +* @return elapsed time in seconds +*/ +static double benchmark2( int iterations, int len ) { + double elapsed; + double x[ len ]; + double y[ len ]; + double z; + double t; + int i; + + for ( i = 0; i < len; i++ ) { + x[ i ] = ( rand_double()*20000.0 ) - 10000.0; + y[ i ] = ( rand_double()*20000.0 ) - 10000.0; + } + z = 0.0; + t = tic(); + for ( i = 0; i < iterations; i++ ) { + z = c_ddot_ndarray( len, x, 1, 0, y, 1, 0 ); + if ( z != z ) { + printf( "should not return NaN\n" ); + break; + } + } + elapsed = tic() - t; + if ( z != z ) { + printf( "should not return NaN\n" ); + } + return elapsed; +} + /** * Main execution sequence. */ @@ -144,7 +179,14 @@ int main( void ) { for ( j = 0; j < REPEATS; j++ ) { count += 1; printf( "# c::%s:len=%d\n", NAME, len ); - elapsed = benchmark( iter, len ); + elapsed = benchmark1( iter, len ); + print_results( iter, elapsed ); + printf( "ok %d benchmark finished\n", count ); + } + for ( j = 0; j < REPEATS; j++ ) { + count += 1; + printf( "# c::%s:ndarray:len=%d\n", NAME, len ); + elapsed = benchmark2( iter, len ); print_results( iter, elapsed ); printf( "ok %d benchmark finished\n", count ); } diff --git a/lib/node_modules/@stdlib/blas/base/ddot/examples/c/example.c b/lib/node_modules/@stdlib/blas/base/ddot/examples/c/example.c index c205020189c..99cc7e8154a 100644 --- a/lib/node_modules/@stdlib/blas/base/ddot/examples/c/example.c +++ b/lib/node_modules/@stdlib/blas/base/ddot/examples/c/example.c @@ -36,4 +36,10 @@ int main( void ) { // Print the result: printf( "dot product: %lf\n", d ); + + // Compute the dot product: + d = c_ddot_ndarray( N, x, strideX, 0, y, strideY, N-1 ); + + // Print the result: + printf( "dot product: %lf\n", d ); } diff --git a/lib/node_modules/@stdlib/blas/base/ddot/include/stdlib/blas/base/ddot.h b/lib/node_modules/@stdlib/blas/base/ddot/include/stdlib/blas/base/ddot.h index 2368577ec4b..beb870a066d 100644 --- 
a/lib/node_modules/@stdlib/blas/base/ddot/include/stdlib/blas/base/ddot.h +++ b/lib/node_modules/@stdlib/blas/base/ddot/include/stdlib/blas/base/ddot.h @@ -36,6 +36,11 @@ extern "C" { */ double API_SUFFIX(c_ddot)( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const double *Y, const CBLAS_INT strideY ); +/** +* Computes the dot product of two double-precision floating-point vectors using alternative indexing semantics. +*/ +double API_SUFFIX(c_ddot_ndarray)( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const CBLAS_INT offsetX, const double *Y, const CBLAS_INT strideY, const CBLAS_INT offsetY ); + #ifdef __cplusplus } #endif diff --git a/lib/node_modules/@stdlib/blas/base/ddot/lib/ndarray.native.js b/lib/node_modules/@stdlib/blas/base/ddot/lib/ndarray.native.js index e22d18098d9..1bdf27f0ed1 100644 --- a/lib/node_modules/@stdlib/blas/base/ddot/lib/ndarray.native.js +++ b/lib/node_modules/@stdlib/blas/base/ddot/lib/ndarray.native.js @@ -20,9 +20,7 @@ // MODULES // -var minViewBufferIndex = require( '@stdlib/strided/base/min-view-buffer-index' ); -var offsetView = require( '@stdlib/strided/base/offset-view' ); -var addon = require( './ddot.native.js' ); +var addon = require( './../src/addon.node' ); // MAIN // @@ -49,16 +47,7 @@ var addon = require( './ddot.native.js' ); * // returns -5.0 */ function ddot( N, x, strideX, offsetX, y, strideY, offsetY ) { - var viewX; - var viewY; - - offsetX = minViewBufferIndex( N, strideX, offsetX ); - offsetY = minViewBufferIndex( N, strideY, offsetY ); - - viewX = offsetView( x, offsetX ); - viewY = offsetView( y, offsetY ); - - return addon( N, viewX, strideX, viewY, strideY ); + return addon.ndarray( N, x, strideX, offsetX, y, strideY, offsetY ); } diff --git a/lib/node_modules/@stdlib/blas/base/ddot/manifest.json b/lib/node_modules/@stdlib/blas/base/ddot/manifest.json index f82dd214b89..62aa55f4bdc 100644 --- a/lib/node_modules/@stdlib/blas/base/ddot/manifest.json +++ 
b/lib/node_modules/@stdlib/blas/base/ddot/manifest.json @@ -45,6 +45,7 @@ "libpath": [], "dependencies": [ "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index", "@stdlib/napi/export", "@stdlib/napi/argv", "@stdlib/napi/argv-int64", @@ -58,7 +59,8 @@ "blas": "", "wasm": false, "src": [ - "./src/ddot.c" + "./src/ddot.c", + "./src/ddot_ndarray.c" ], "include": [ "./include" @@ -66,7 +68,8 @@ "libraries": [], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/stride2offset" ] }, { @@ -75,7 +78,8 @@ "blas": "", "wasm": false, "src": [ - "./src/ddot.c" + "./src/ddot.c", + "./src/ddot_ndarray.c" ], "include": [ "./include" @@ -83,7 +87,8 @@ "libraries": [], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/stride2offset" ] }, @@ -105,6 +110,7 @@ "libpath": [], "dependencies": [ "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index", "@stdlib/napi/export", "@stdlib/napi/argv", "@stdlib/napi/argv-int64", @@ -129,7 +135,8 @@ ], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index" ] }, { @@ -149,7 +156,8 @@ ], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index" ] }, @@ -170,6 +178,7 @@ "libpath": [], "dependencies": [ "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index", "@stdlib/napi/export", "@stdlib/napi/argv", "@stdlib/napi/argv-int64", @@ -183,7 +192,8 @@ "blas": "", "wasm": false, "src": [ - "./src/ddot.c" + "./src/ddot.c", + "./src/ddot_ndarray.c" ], "include": [ "./include" @@ -191,7 +201,8 @@ "libraries": [], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/stride2offset" ] }, { @@ -200,7 +211,8 @@ "blas": "", "wasm": false, "src": [ - 
"./src/ddot.c" + "./src/ddot.c", + "./src/ddot_ndarray.c" ], "include": [ "./include" @@ -208,7 +220,8 @@ "libraries": [], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/stride2offset" ] }, @@ -229,6 +242,7 @@ "libpath": [], "dependencies": [ "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index", "@stdlib/napi/export", "@stdlib/napi/argv", "@stdlib/napi/argv-int64", @@ -252,7 +266,8 @@ ], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index" ] }, { @@ -271,7 +286,8 @@ ], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index" ] }, @@ -293,6 +309,7 @@ "libpath": [], "dependencies": [ "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index", "@stdlib/napi/export", "@stdlib/napi/argv", "@stdlib/napi/argv-int64", @@ -317,7 +334,8 @@ ], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index" ] }, { @@ -337,7 +355,8 @@ ], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/min-view-buffer-index" ] }, @@ -347,7 +366,8 @@ "blas": "", "wasm": false, "src": [ - "./src/ddot.c" + "./src/ddot.c", + "./src/ddot_ndarray.c" ], "include": [ "./include" @@ -356,6 +376,7 @@ "libpath": [], "dependencies": [ "@stdlib/blas/base/shared", + "@stdlib/strided/base/stride2offset", "@stdlib/napi/export", "@stdlib/napi/argv", "@stdlib/napi/argv-int64", @@ -369,7 +390,8 @@ "blas": "", "wasm": false, "src": [ - "./src/ddot.c" + "./src/ddot.c", + "./src/ddot_ndarray.c" ], "include": [ "./include" @@ -377,7 +399,8 @@ "libraries": [], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/stride2offset" ] }, { @@ -386,7 
+409,8 @@ "blas": "", "wasm": false, "src": [ - "./src/ddot.c" + "./src/ddot.c", + "./src/ddot_ndarray.c" ], "include": [ "./include" @@ -394,7 +418,8 @@ "libraries": [], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/stride2offset" ] }, @@ -404,7 +429,8 @@ "blas": "", "wasm": true, "src": [ - "./src/ddot.c" + "./src/ddot.c", + "./src/ddot_ndarray.c" ], "include": [ "./include" @@ -412,7 +438,8 @@ "libraries": [], "libpath": [], "dependencies": [ - "@stdlib/blas/base/shared" + "@stdlib/blas/base/shared", + "@stdlib/strided/base/stride2offset" ] } ] diff --git a/lib/node_modules/@stdlib/blas/base/ddot/src/addon.c b/lib/node_modules/@stdlib/blas/base/ddot/src/addon.c index ea5ab4bb590..500f6b5b26b 100644 --- a/lib/node_modules/@stdlib/blas/base/ddot/src/addon.c +++ b/lib/node_modules/@stdlib/blas/base/ddot/src/addon.c @@ -43,4 +43,24 @@ static napi_value addon( napi_env env, napi_callback_info info ) { return v; } -STDLIB_NAPI_MODULE_EXPORT_FCN( addon ) +/** +* Receives JavaScript callback invocation data. 
+* +* @param env environment under which the function is invoked +* @param info callback data +* @return Node-API value +*/ +static napi_value addon_method( napi_env env, napi_callback_info info ) { + STDLIB_NAPI_ARGV( env, info, argv, argc, 7 ); + STDLIB_NAPI_ARGV_INT64( env, N, argv, 0 ); + STDLIB_NAPI_ARGV_INT64( env, strideX, argv, 2 ); + STDLIB_NAPI_ARGV_INT64( env, offsetX, argv, 3 ); + STDLIB_NAPI_ARGV_INT64( env, strideY, argv, 5 ); + STDLIB_NAPI_ARGV_INT64( env, offsetY, argv, 6 ); + STDLIB_NAPI_ARGV_STRIDED_FLOAT64ARRAY( env, X, N, strideX, argv, 1 ); + STDLIB_NAPI_ARGV_STRIDED_FLOAT64ARRAY( env, Y, N, strideY, argv, 4 ); + STDLIB_NAPI_CREATE_DOUBLE( env, API_SUFFIX(c_ddot_ndarray)( N, X, strideX, offsetX, Y, strideY, offsetY ), v ); + return v; +} + +STDLIB_NAPI_MODULE_EXPORT_FCN_WITH_METHOD( addon, "ndarray", addon_method ) diff --git a/lib/node_modules/@stdlib/blas/base/ddot/src/ddot.c b/lib/node_modules/@stdlib/blas/base/ddot/src/ddot.c index d139591c6ef..272fa259e3c 100644 --- a/lib/node_modules/@stdlib/blas/base/ddot/src/ddot.c +++ b/lib/node_modules/@stdlib/blas/base/ddot/src/ddot.c @@ -18,6 +18,7 @@ #include "stdlib/blas/base/ddot.h" #include "stdlib/blas/base/shared.h" +#include "stdlib/strided/base/stride2offset.h" /** * Computes the dot product of two double-precision floating-point vectors. @@ -30,49 +31,8 @@ * @return the dot product */ double API_SUFFIX(c_ddot)( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const double *Y, const CBLAS_INT strideY ) { - double dot; - CBLAS_INT ix; - CBLAS_INT iy; - CBLAS_INT m; - CBLAS_INT i; - - dot = 0.0; - if ( N <= 0 ) { - return dot; - } - // If both strides are equal to `1`, use unrolled loops... - if ( strideX == 1 && strideY == 1 ) { - m = N % 5; - - // If we have a remainder, do a clean-up loop... 
- if ( m > 0 ) { - for ( i = 0; i < m; i++ ) { - dot += X[ i ] * Y[ i ]; - } - } - if ( N < 5 ) { - return dot; - } - for ( i = m; i < N; i += 5 ) { - dot += ( X[i]*Y[i] ) + ( X[i+1]*Y[i+1] ) + ( X[i+2]*Y[i+2] ) + ( X[i+3]*Y[i+3] ) + ( X[i+4]*Y[i+4] ); - } - return dot; - } - if ( strideX < 0 ) { - ix = (1-N) * strideX; - } else { - ix = 0; - } - if ( strideY < 0 ) { - iy = (1-N) * strideY; - } else { - iy = 0; - } - for ( i = 0; i < N; i++ ) { - dot += X[ ix ] * Y[ iy ]; - ix += strideX; - iy += strideY; - } - return dot; + CBLAS_INT ox = stdlib_strided_stride2offset( N, strideX ); + CBLAS_INT oy = stdlib_strided_stride2offset( N, strideY ); + return API_SUFFIX(c_ddot_ndarray)( N, X, strideX, ox, Y, strideY, oy ); } diff --git a/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_cblas.c b/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_cblas.c index 53b5dcee26a..c7763569551 100644 --- a/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_cblas.c +++ b/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_cblas.c @@ -19,17 +19,36 @@ #include "stdlib/blas/base/ddot.h" #include "stdlib/blas/base/ddot_cblas.h" #include "stdlib/blas/base/shared.h" +#include "stdlib/strided/base/min_view_buffer_index.h" /** * Computes the dot product of two double-precision floating-point vectors. * * @param N number of indexed elements -* @param X first array +* @param X first input array * @param strideX X stride length -* @param Y second array +* @param Y second input array * @param strideY Y stride length -* @return the dot product +* @return dot product */ double API_SUFFIX(c_ddot)( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const double *Y, const CBLAS_INT strideY ) { return API_SUFFIX(cblas_ddot)( N, X, strideX, Y, strideY ); } + +/** +* Computes the dot product of two double-precision floating-point vectors using alternative indexing semantics. 
+* +* @param N number of indexed elements +* @param X first input array +* @param strideX X stride length +* @param offsetX starting index for X +* @param Y second input array +* @param strideY Y stride length +* @param offsetY starting index for Y +* @return dot product +*/ +double API_SUFFIX(c_ddot_ndarray)( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const CBLAS_INT offsetX, const double *Y, const CBLAS_INT strideY, const CBLAS_INT offsetY ) { + X += stdlib_strided_min_view_buffer_index( N, strideX, offsetX ); // adjust array pointer + Y += stdlib_strided_min_view_buffer_index( N, strideY, offsetY ); // adjust array pointer + return API_SUFFIX(cblas_ddot)( N, X, strideX, Y, strideY ); +} diff --git a/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_f.c b/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_f.c index 9184cc3daf5..dd916e1286e 100644 --- a/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_f.c +++ b/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_f.c @@ -19,19 +19,41 @@ #include "stdlib/blas/base/ddot.h" #include "stdlib/blas/base/ddot_fortran.h" #include "stdlib/blas/base/shared.h" +#include "stdlib/strided/base/min_view_buffer_index.h" /** * Computes the dot product of two double-precision floating-point vectors. * * @param N number of indexed elements +* @param X first input array +* @param strideX X stride length +* @param Y second input array +* @param strideY Y stride length +* @return dot product +*/ +double API_SUFFIX(c_ddot)( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const double *Y, const CBLAS_INT strideY ) { + double dot; + ddotsub( &N, X, &strideX, Y, &strideY, &dot ); + return dot; +} + +/** +* Computes the dot product of two double-precision floating-point vectors using alternative indexing semantics. 
+* +* @param N number of indexed elements * @param X first array * @param strideX X stride length +* @param offsetX starting index for X * @param Y second array * @param strideY Y stride length +* @param offsetY starting index for Y * @return the dot product */ -double API_SUFFIX(c_ddot)( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const double *Y, const CBLAS_INT strideY ) { +double API_SUFFIX(c_ddot_ndarray)( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const CBLAS_INT offsetX, const double *Y, const CBLAS_INT strideY, const CBLAS_INT offsetY ) { double dot; + + X += stdlib_strided_min_view_buffer_index( N, strideX, offsetX ); // adjust array pointer + Y += stdlib_strided_min_view_buffer_index( N, strideY, offsetY ); // adjust array pointer ddotsub( &N, X, &strideX, Y, &strideY, &dot ); return dot; } diff --git a/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_ndarray.c b/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_ndarray.c new file mode 100644 index 00000000000..aecf91d6a4d --- /dev/null +++ b/lib/node_modules/@stdlib/blas/base/ddot/src/ddot_ndarray.c @@ -0,0 +1,78 @@ +/** +* @license Apache-2.0 +* +* Copyright (c) 2019 The Stdlib Authors. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include "stdlib/blas/base/ddot.h" +#include "stdlib/blas/base/shared.h" + +static const CBLAS_INT M = 5; + +/** +* Computes the dot product of two double-precision floating-point vectors using alternative indexing semantics. 
+* +* @param N number of indexed elements +* @param X first array +* @param strideX X stride length +* @param offsetX starting index for X +* @param Y second array +* @param strideY Y stride length +* @param offsetY starting index for Y +* @return the dot product +*/ +double API_SUFFIX(c_ddot_ndarray)( const CBLAS_INT N, const double *X, const CBLAS_INT strideX, const CBLAS_INT offsetX, const double *Y, const CBLAS_INT strideY, const CBLAS_INT offsetY ) { + double dot; + CBLAS_INT ix; + CBLAS_INT iy; + CBLAS_INT m; + CBLAS_INT i; + + dot = 0.0; + if ( N <= 0 ) { + return dot; + } + ix = offsetX; + iy = offsetY; + + // If both strides are equal to `1`, use unrolled loops... + if ( strideX == 1 && strideY == 1 ) { + m = N % M; + + // If we have a remainder, do a clean-up loop... + if ( m > 0 ) { + for ( i = 0; i < m; i++ ) { + dot += X[ ix ] * Y[ iy ]; + ix += strideX; + iy += strideY; + } + } + if ( N < M ) { + return dot; + } + for ( i = m; i < N; i += M ) { + dot += ( X[ ix ]*Y[ iy ] ) + ( X[ ix+1 ]*Y[ iy+1 ] ) + ( X[ ix+2 ]*Y[ iy+2 ] ) + ( X[ ix+3 ]*Y[ iy+3 ] ) + ( X[ ix+4 ]*Y[ iy+4 ] ); + ix += M; + iy += M; + } + return dot; + } + for ( i = 0; i < N; i++ ) { + dot += X[ ix ] * Y[ iy ]; + ix += strideX; + iy += strideY; + } + return dot; +}