Commit ec70f80

Clean up cuda build when functrace is enabled (deeplearning4j#10197)
* Clean up cuda build when functrace is enabled
* Fix cuda tad cache usage
1 parent ce64118 commit ec70f80

58 files changed, +263 −614 lines

libnd4j/CMakeLists.txt

Lines changed: 8 additions & 12 deletions
@@ -113,8 +113,10 @@ if (SD_CUDA)
     message("Jetson nano cublas library is ${CUDA_cublas_LIBRARY} and CuSolver library ${CUDA_cusolver_LIBRARY}")
   endif()

-
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler --ptxas-options=-v -Xptxas -O1")
+  if("${SD_PTXAS}" STREQUAL "ON")
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")
+  endif()
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -allow-unsupported-compiler ")
   if(SD_KEEP_NVCC_OUTPUT)
       set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --keep ")
   endif()
@@ -157,13 +159,7 @@ foreach(TEMPLATE_FILE ${CUDA_TEMPLATE_FILES})
 endforeach()


-# Define a function to generate individual CUDA files for each type combination
-# Find and replace the existing function:
-function(genSingleFunctionCuda TEMPLATE_FILE COMBINATION OUTPUT_DIR)
-    # ...existing implementation...
-endfunction()

-# REPLACE WITH THIS IMPLEMENTATION:

 function(genSingleFunctionCuda TEMPLATE_FILE COMBINATION OUTPUT_DIR)
     # Split the COMBINATION string into a list
@@ -573,8 +569,8 @@ endif()
 # -fsanitize=leak
 if (SD_ANDROID_BUILD)
     set_property(GLOBAL PROPERTY JOB_POOLS one_job=1 two_jobs=2)
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O${SD_OPTIMIZATION_LEVEL} -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D_RELEASE=true")
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O${SD_OPTIMIZATION_LEVEL} -g -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O${SD_OPTIMIZATION_LEVEL} -fPIC -Wno-return-type -Wno-unknown-pragmas -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D_RELEASE=true")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O${SD_OPTIMIZATION_LEVEL} -g -fPIC -Wno-return-type -Wno-unknown-pragmas -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else")
 elseif (APPLE)
     if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm64*" OR "${SD_ARCH}" MATCHES "armv8-a")
         set(SD_ARCH armv8-a)
@@ -583,8 +579,8 @@ elseif (APPLE)
     endif()


-    set(CMAKE_CXX_FLAGS_RELEASE "-O${SD_OPTIMIZATION_LEVEL} -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true -D_RELEASE=true")
-    set(CMAKE_CXX_FLAGS_DEBUG " -O${SD_OPTIMIZATION_LEVEL} -g -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true")
+    set(CMAKE_CXX_FLAGS_RELEASE "-O${SD_OPTIMIZATION_LEVEL} -fPIC -Wno-return-type -Wno-braced-scalar-init -Wno-unknown-pragmas -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true -D_RELEASE=true")
+    set(CMAKE_CXX_FLAGS_DEBUG " -O${SD_OPTIMIZATION_LEVEL} -g -fPIC -Wno-return-type -Wno-braced-scalar-init -Wno-unknown-pragmas -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true")
 elseif(WIN32)
     set(SD_X86_BUILD true)
     if (SD_CUDA)

libnd4j/blas/CMakeLists.txt

Lines changed: 13 additions & 4 deletions
@@ -200,7 +200,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT CMAKE_SYSTEM_NAME MATCHES "A
     endif()

     # Set C++ compiler and flags
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector -fstack-protector-all -Wall -Wextra -Werror -Wno-error=int-in-bool-context -Wno-unused-variable -Wno-error=implicit-fallthrough -Wno-return-type -Wno-unused-parameter -Wno-error=unknown-pragmas -ggdb3 -lpthread -pthread -MT -Bsymbolic -lbfd -rdynamic -lunwind -ldw -ldl -fno-omit-frame-pointer -fno-optimize-sibling-calls -rdynamic -finstrument-functions -O0 -fPIC")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector -fstack-protector-all -Wall -Wextra -Werror -Wno-return-type -Wno-error=int-in-bool-context -Wno-unused-variable -Wno-error=implicit-fallthrough -Wno-return-type -Wno-unused-parameter -Wno-error=unknown-pragmas -ggdb3 -lpthread -pthread -MT -Bsymbolic -lbfd -rdynamic -lunwind -ldw -ldl -fno-omit-frame-pointer -fno-optimize-sibling-calls -rdynamic -finstrument-functions -O0 -fPIC")
     add_compile_definitions(SD_GCC_FUNCTRACE)
   endif()
 endif()
@@ -262,7 +262,7 @@ if(SD_CUDA)

   if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     if(SD_GCC_FUNCTRACE STREQUAL "ON")
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall -Wno-unused-variable -Wno-unused-parameter -Wreturn-type -W -ggdb3 -fPIC -DSD_GCC_FUNCTRACE=1 -Bsymbolic -lbfd -rdynamic -lunwind -ldw -ldl -fno-omit-frame-pointer -fno-optimize-sibling-calls -finstrument-functions -O0")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall -Wno-return-type -Wno-unknown-pragmas -Wno-unused-variable -Wno-unused-parameter -Wreturn-type -W -ggdb3 -fPIC -DSD_GCC_FUNCTRACE=1 -Bsymbolic -lbfd -rdynamic -lunwind -ldw -ldl -fno-omit-frame-pointer -fno-optimize-sibling-calls -finstrument-functions -O0")
       set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-fPIC --device-debug -lineinfo -G")
       add_compile_definitions(SD_GCC_FUNCTRACE)
     else()
@@ -306,7 +306,7 @@ if(SD_CUDA)
   endif()

   # Cap the number of registers to prevent resource exhaustion
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --maxrregcount=40 ")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -maxrregcount=128 ")

   # Define CUDA Architectures
   string(TOLOWER "${COMPUTE}" COMPUTE_CMP)
@@ -371,6 +371,11 @@ if(SD_CUDA)
   endif()
   include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/TypeMST.cmake)

+  # 2. Process the compilation_units templates
+  file(GLOB CUDA_COMPILATION_UNITS ../include/loops/cuda/compilation_units/*.cu.in)
+  foreach(FL_ITEM ${CUDA_COMPILATION_UNITS})
+      genCompilation(${FL_ITEM})
+  endforeach()

   # Decide whether to use all combinations or optimized MST combinations
   set(SD_USE_MST_TYPES ON)
@@ -423,7 +428,7 @@ if(SD_CUDA)
   file(GLOB_RECURSE INDEXING_SOURCES ../include/indexing/*.cpp ../include/indexing/*.h)
   file(GLOB_RECURSE LOOPS_SOURCES ../include/loops/impl/*.cpp ../include/loops/*.h ../include/loops/*.chpp)
   file(GLOB_RECURSE LEGACY_SOURCES ../include/legacy/impl/*.cpp ../include/legacy/*.cu ../include/legacy/*.h)
-  file(GLOB_RECURSE LOOPS_SOURCES_CUDA ../include/loops/*.cu)
+  file(GLOB_RECURSE LOOPS_SOURCES_CUDA ../include/loops/*.cu ../include/loops/cuda/**/*.cu)
   file(GLOB_RECURSE COMPILATION_UNITS ../include/loops/cuda/compilation_units/*.cu.in ../include/ops/impl/compilation_units/*.cpp.in)
   file(GLOB_RECURSE COMPILATION_UNITS ../include/loops/cuda/compilation_units/*.cu.in ../include/loops/cuda/comb_compilation_units/*.cu.in ../include/ops/impl/compilation_units/*.cpp.in)

@@ -485,6 +490,8 @@ if(SD_CUDA)
             ${CUSTOMOPS_ONEDNN_SOURCES}
             ${CUSTOMOPS_ARMCOMPUTE_SOURCES}
             ${CUSTOMOPS_GENERIC_SOURCES}
+            ${LOOPS_SOURCES_CUDA}
+
             )
   else()
     add_library(samediff_obj OBJECT
@@ -509,6 +516,8 @@ if(SD_CUDA)
             ${CUSTOMOPS_ONEDNN_SOURCES}
             ${CUSTOMOPS_ARMCOMPUTE_SOURCES}
             ${CUSTOMOPS_GENERIC_SOURCES}
+            ${LOOPS_SOURCES_CUDA}
+
             )
   endif()
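Two notes on the CUDA flag changes above: the register cap moves from --maxrregcount=40 to -maxrregcount=128, trading some occupancy headroom for fewer register spills, and the new SD_PTXAS=ON path adds --ptxas-options=-v so ptxas reports per-kernel register and spill usage. A minimal, illustrative CUDA sketch (not from the commit; kernel and names are hypothetical) of the per-kernel alternative, __launch_bounds__, which constrains registers for one kernel instead of globally:

    // Hypothetical kernel, for illustration only. __launch_bounds__(256, 2) tells ptxas
    // this kernel launches with <=256 threads per block and should fit >=2 blocks per SM,
    // so register allocation is limited for this kernel alone; compiling with
    // --ptxas-options=-v prints the resulting register/spill counts.
    #include <cuda_runtime.h>

    __global__ void __launch_bounds__(256, 2) scaleKernel(float *data, float factor, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= factor;
    }

    int main() {
      const int n = 1 << 20;
      float *d = nullptr;
      cudaMalloc(&d, n * sizeof(float));
      scaleKernel<<<(n + 255) / 256, 256>>>(d, 2.0f, n);
      cudaDeviceSynchronize();
      cudaFree(d);
      return 0;
    }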

libnd4j/buildnativeoperations.sh

Lines changed: 9 additions & 1 deletion
@@ -94,7 +94,7 @@ PRINT_MATH="OFF"
 KEEP_NVCC="OFF"
 PREPROCESS="ON" # Initialize PREPROCESS variable
 CMAKE_ARGUMENTS=""
-
+PTXAS_INFO="OFF"
 while [[ $# -gt 0 ]]
 do
 key="$1"
@@ -106,6 +106,10 @@ do
     CMAKE_ARGUMENTS="$CMAKE_ARGUMENTS -DGENERATE_FLATC=ON"
     shift # past argument
     ;;
+    -ptxas|--ptxas-info)
+    PTXAS_INFO="$value"
+    shift # past argument
+    ;;
     -ol|--optimization-level)
     OPTIMIZATION_LEVEL="$value"
     shift # past argument
@@ -712,6 +716,7 @@ if [ "$LOG_OUTPUT" == "none" ]; then
           -DPRINT_INDICES="$PRINT_INDICES" \
           -DSD_KEEP_NVCC_OUTPUT="$KEEP_NVCC" \
           -DSD_GCC_FUNCTRACE="$FUNC_TRACE" \
+          -DSD_PTXAS="$PTXAS_INFO" \
           "$BLAS_ARG" \
           "$ARCH_ARG" \
           "$NAME_ARG" \
@@ -739,6 +744,7 @@ else
           -DPRINT_INDICES="$PRINT_INDICES" \
           -DSD_KEEP_NVCC_OUTPUT="$KEEP_NVCC" \
           -DSD_GCC_FUNCTRACE="$FUNC_TRACE" \
+          -DSD_PTXAS="$PTXAS_INFO" \
           "$BLAS_ARG" \
           "$ARCH_ARG" \
           "$NAME_ARG" \
@@ -773,6 +779,7 @@ if [ "$PREPROCESS" == "ON" ]; then
           -DSD_KEEP_NVCC_OUTPUT="$KEEP_NVCC" \
           -DSD_GCC_FUNCTRACE="$FUNC_TRACE" \
           -DSD_PREPROCESS="$PREPROCESS" \
+          -DSD_PTXAS="$PTXAS_INFO" \
           "$BLAS_ARG" \
           "$ARCH_ARG" \
           "$NAME_ARG" \
@@ -804,6 +811,7 @@ if [ "$PREPROCESS" == "ON" ]; then
           "$ARCH_ARG" \
           "$NAME_ARG" \
           "$OP_OUTPUT_FILE_ARG" \
+          -DSD_PTXAS="$PTXAS_INFO" \
           -DSD_SANITIZE="${SANITIZE}" \
           -DSD_CHECK_VECTORIZATION="${CHECK_VECTORIZATION}" \
           "$USE_LTO" \

libnd4j/include/array/NDArray.h

Lines changed: 1 addition & 1 deletion
@@ -2003,7 +2003,7 @@ void * _bufferWithOffset(LongType offset,DataBuffer *buffer) {
 //note this is meant to be used with primary() (host side/cpu) use specialBuffer() for device side buffers
 void *NDArray::buffer() {
   BUILD_SINGLE_SELECTOR(dataType(), return _bufferWithOffset, (offset(),getDataBuffer()),SD_COMMON_TYPES);
-
+  return nullptr;
 }

 //////////////////////////////////////////////////////////////////////////

libnd4j/include/array/NDArray.hXX

Lines changed: 1 addition & 1 deletion
@@ -2639,7 +2639,7 @@ void NDArray::copyDataForAssign(NDArray& thisArray, NDArray& other, const sd::L
                    other.buffer(),
                    otherShapeInfo,
                    other.specialBuffer(),
-                   otherShapeInfo,
+                   other.specialShapeInfo(),
                    nullptr
                    , allowParallelism);
   thisArray.registerSpecialUse({&thisArray}, {&other});  // Registering through the instance

libnd4j/include/array/TadDescriptor.h

Lines changed: 2 additions & 5 deletions
@@ -28,7 +28,7 @@
 namespace sd {
 class SD_LIB_EXPORT TadDescriptor {
  private:
-  ShapeDescriptor _originalShape;
+  sd::LongType *_originalShape;

   std::vector<LongType> _axis;

@@ -37,8 +37,6 @@ class SD_LIB_EXPORT TadDescriptor {
  public:
   explicit TadDescriptor(const LongType *originalShape, const LongType *dimensions, const LongType length,
                          const bool keepUnitiesInShape = false);
-  explicit TadDescriptor(const ShapeDescriptor &descriptor, const std::vector<LongType> &dimensions,
-                         const bool keepUnitiesInShape = false);
   ~TadDescriptor() = default;


@@ -56,8 +54,7 @@ class SD_LIB_EXPORT TadDescriptor {
   bool operator<(const TadDescriptor &other) const;

   std::vector<LongType> &axis();
-  ShapeDescriptor &originalShape();
-  ShapeDescriptor const &originalShapeConst() const;
+  LongType *originalShape();
   bool areUnitiesinShape() const;
 };
 } // namespace sd

libnd4j/include/array/cuda/CudaPointerDeallocator.cu

Lines changed: 18 additions & 1 deletion
@@ -25,6 +25,23 @@

 namespace sd {

-void CudaPointerDeallocator::release(void *ptr) { cudaFree(ptr); }
+void CudaPointerDeallocator::release(void *ptr) {
+  if (ptr == nullptr) return;

+  // Check if this is a valid device pointer before freeing
+  cudaPointerAttributes attributes;
+  cudaError_t result = cudaPointerGetAttributes(&attributes, ptr);
+
+  if (result == cudaSuccess) {
+    // Only free if it's a regular device pointer
+    // cudaMemoryTypeDevice is for regular allocations we can free
+    if (attributes.type == cudaMemoryTypeDevice) {
+      cudaFree(ptr);
+    }
+    // Don't free other types (like constant memory)
+  } else {
+    // Clear the error and don't try to free this pointer
+    cudaGetLastError();  // Clear the error state
+  }
+}
 } // namespace sd
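The rewritten release() only frees pointers that the CUDA runtime confirms are ordinary device allocations. A standalone sketch of the same guard (not part of the commit; safeFree and the sizes are made up), assuming CUDA 10 or later where cudaPointerAttributes exposes the type field:

    #include <cuda_runtime.h>

    // Frees ptr only when the runtime reports a plain device allocation; anything
    // else (host, managed, unregistered, or a failed query) is left alone.
    static void safeFree(void *ptr) {
      if (ptr == nullptr) return;
      cudaPointerAttributes attr;
      if (cudaPointerGetAttributes(&attr, ptr) == cudaSuccess) {
        if (attr.type == cudaMemoryTypeDevice) cudaFree(ptr);
      } else {
        cudaGetLastError();  // clear the sticky error so later CUDA calls are unaffected
      }
    }

    int main() {
      void *d = nullptr;
      cudaMalloc(&d, 256);
      safeFree(d);      // freed: attr.type == cudaMemoryTypeDevice
      int host = 0;
      safeFree(&host);  // skipped: not a device allocation
      return 0;
    }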

libnd4j/include/array/impl/ExtraArguments.cpp

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ void ExtraArguments::convertAndCopy(Pointer pointer, LongType offset) {
 #ifdef __CUDABLAS__
   // TODO: maybe make it asynchronous eventually?
   cudaMemcpy(pointer, target, length * DataTypeUtils::sizeOf(DataTypeUtils::fromT<T>()), cudaMemcpyHostToDevice);
-  delete target;
+  delete[] target;
 #endif
 }
 BUILD_SINGLE_TEMPLATE(template SD_LIB_EXPORT void ExtraArguments::convertAndCopy,
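The one-character change above matters because scalar delete and array delete[] are not interchangeable in C++: a buffer obtained with new[] must be released with delete[], otherwise behavior is undefined. A generic illustration of the pairing rule (independent of this file's internals):

    #include <cstddef>

    int main() {
      double *one  = new double(3.14);  // scalar new  -> scalar delete
      double *many = new double[16];    // array new[] -> array delete[]

      delete one;
      delete[] many;  // plain `delete many;` here would be undefined behavior
      return 0;
    }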

libnd4j/include/array/impl/TadDescriptor.cpp

Lines changed: 2 additions & 21 deletions
@@ -33,7 +33,6 @@ TadDescriptor::TadDescriptor(const TadDescriptor &other) {

 TadDescriptor::TadDescriptor(const LongType *originalShape, const LongType *dimensions, const LongType length,
                              const bool keepUnitiesInShape) {
-  ShapeDescriptor *descriptor = new ShapeDescriptor(originalShape, false);

   _axis.resize(length);
   for (LongType e = 0; e < length; e++) {
@@ -42,19 +41,10 @@ TadDescriptor::TadDescriptor(const LongType *originalShape, const LongType *dime

   if (length > 1) std::sort(_axis.begin(), _axis.end());

-  _originalShape = *descriptor;
+  _originalShape = const_cast<sd::LongType *>(originalShape);
   _unitiesInShape = keepUnitiesInShape;
 }

-TadDescriptor::TadDescriptor(const ShapeDescriptor &descriptor, const std::vector<LongType> &dimensions,
-                             const bool keepUnitiesInShape) {
-  _originalShape = descriptor;
-  _axis = dimensions;
-  _unitiesInShape = keepUnitiesInShape;
-
-  if (_axis.size() > 1) std::sort(_axis.begin(), _axis.end());
-}
-
 bool TadDescriptor::operator==(const TadDescriptor &other) const {
   return std::tie(_originalShape, _axis, _unitiesInShape) ==
          std::tie(other._originalShape, other._axis, other._unitiesInShape);
@@ -67,9 +57,7 @@ bool TadDescriptor::operator<(const TadDescriptor &other) const {

 std::vector<LongType> &TadDescriptor::axis() { return _axis; }

-ShapeDescriptor &TadDescriptor::originalShape() { return _originalShape; }
-
-ShapeDescriptor const &TadDescriptor::originalShapeConst() const { return _originalShape; }
+LongType *TadDescriptor::originalShape() { return _originalShape; }

 bool TadDescriptor::areUnitiesinShape() const { return _unitiesInShape; }
 } // namespace sd
@@ -80,13 +68,6 @@ size_t hash<sd::TadDescriptor>::operator()(const sd::TadDescriptor &k) const {

   // Start with initial hash from unities flag
   uint64_t hash = ModularHasher::hash_scalar(k.areUnitiesinShape());
-
-  // Combine with original shape hash
-  hash = ModularHasher::combine_hashes({
-      hash,
-      std::hash<sd::ShapeDescriptor>()(k.originalShapeConst())
-  });
-
   // Hash the axis vector
   auto& axes = const_cast<sd::TadDescriptor&>(k).axis();
   if (!axes.empty()) {
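With _originalShape reduced to a raw sd::LongType*, the descriptor's operator== (via std::tie) now compares the shape-info pointer itself rather than a deep ShapeDescriptor, and the hash no longer mixes in the shape at all. A compressed sketch of what the TAD cache key now amounts to (illustrative types; the assumption that pointer identity stands in for shape equality, e.g. because shape-info buffers are shared/cached, is mine and not stated in the commit):

    #include <cstdint>
    #include <tuple>
    #include <vector>

    // Illustrative stand-in for the reworked descriptor: equality is pointer identity
    // on the original shape-info plus the axes and the keep-unities flag.
    struct TadKeySketch {
      const int64_t *originalShape;
      std::vector<int64_t> axis;
      bool unitiesInShape;

      bool operator==(const TadKeySketch &o) const {
        return std::tie(originalShape, axis, unitiesInShape) ==
               std::tie(o.originalShape, o.axis, o.unitiesInShape);
      }
    };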

libnd4j/include/execution/impl/Threads.cpp

Lines changed: 3 additions & 0 deletions
@@ -146,6 +146,9 @@ Span2 Span2::build(int loop, uint64_t threadID, uint64_t numThreads, int64_t sta
     default:
       THROW_EXCEPTION("");
   }
+
+  return Span2(startX, stopX, incX, 0, 0, incY);
+
 }

 int64_t Span::startX() const {
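The extra return in Span2::build exists because the default case only calls THROW_EXCEPTION; unless that helper is declared [[noreturn]], the compiler assumes it can come back, so control could reach the end of a non-void function. A small sketch of the pattern (names are hypothetical, not the library's):

    #include <stdexcept>

    struct SpanSketch { long start, stop, inc; };

    void raiseError(const char *msg) {  // deliberately not marked [[noreturn]]
      throw std::runtime_error(msg);
    }

    SpanSketch buildSpan(int kind, long start, long stop, long inc) {
      switch (kind) {
        case 0:
          return {start, stop, inc};
        default:
          raiseError("unsupported span kind");
      }
      // Unreachable in practice, but it keeps "control reaches end of non-void
      // function" warnings out of -Wall/-Werror builds and leaves the result defined.
      return {start, stop, inc};
    }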

0 commit comments
