|
24 | 24 |
|
/**
 * JNI entry point: run one ALS half-iteration of cuMF on the GPU.
 *
 * Flattens the jagged sortedSrcFactors array into a pinned host buffer
 * (thetaT, n x f floats), uploads the rating matrix in CSR form
 * (csrRow: m+1 ints, csrCol: nnz ints, csrVal: nnz floats) to a GPU chosen
 * by get_gpu(), invokes the native doALSWithCSR solver, and marshals the
 * resulting X factors back to Java as an m-element array of float[f].
 *
 * Returns NULL if the solver throws thrust::system_error.
 * NOTE(review): lambda arrives as jdouble and is forwarded to doALSWithCSR —
 * confirm the solver's expected parameter type; a silent narrowing here would
 * be easy to miss.
 */
JNIEXPORT jobjectArray JNICALL Java_org_apache_spark_ml_recommendation_CuMFJNIInterface_doALSWithCSR
  (JNIEnv * env, jobject obj, jint m, jint n, jint f, jint nnz, jdouble lambda, jobjectArray sortedSrcFactors, jintArray csrRow, jintArray csrCol, jfloatArray csrVal){
	try{
		//use multiple GPUs: select a GPU for *this* specific dataset
		int whichGPU = get_gpu();
		checkCudaErrors(cudaSetDevice(whichGPU));
		cudaStream_t cuda_stream;
		// FIX: the cudaStreamCreate return code was previously ignored while
		// every other CUDA call in this function is checked
		checkCudaErrors(cudaStreamCreate(&cuda_stream));

		int* csrRowIndexHostPtr;
		int* csrColIndexHostPtr;
		float* csrValHostPtr;

		//pinned host staging buffer for thetaT (n x f), input to the solver
		float* thetaTHost;
		cudacall(cudaMallocHost( (void** ) &thetaTHost, n * f * sizeof(thetaTHost[0])) );
		//pinned host buffer for XT (m x f); filled by the solver, returned to Java
		float* XTHost;
		cudacall(cudaMallocHost( (void** ) &XTHost, m * f * sizeof(XTHost[0])) );

		//flatten the jagged factor blocks into thetaTHost, one f-length row at a time
		//WARNING: ReleasePrimitiveArrayCritical and DeleteLocalRef are important;
		//otherwise the result is correct but performance is bad
		int numSrcBlocks = env->GetArrayLength(sortedSrcFactors);
		int index = 0;
		for(int i = 0; i < numSrcBlocks; i++){
			jobject factorsPerBlock = env->GetObjectArrayElement(sortedSrcFactors, i);
			int numFactors = env->GetArrayLength((jobjectArray)factorsPerBlock);
			for(int j = 0; j < numFactors; j++){
				jobject factor = env->GetObjectArrayElement((jobjectArray)factorsPerBlock, j);
				jfloat *factorfloat = (jfloat *) env->GetPrimitiveArrayCritical( (jfloatArray)factor, 0);
				//NOTE(review): assumes the total number of factor rows equals n —
				//an oversized input would overrun thetaTHost; verify against caller
				memcpy(thetaTHost + index*f, factorfloat, sizeof(float)*f);
				index ++;
				env->ReleasePrimitiveArrayCritical((jfloatArray)factor, factorfloat, 0);
				env->DeleteLocalRef(factor);
			}
			env->DeleteLocalRef(factorsPerBlock);
		}
		// get a pointer to the raw input data, pinning them in memory
		csrRowIndexHostPtr = (jint*) env->GetPrimitiveArrayCritical(csrRow, 0);
		csrColIndexHostPtr = (jint*) env->GetPrimitiveArrayCritical(csrCol, 0);
		csrValHostPtr = (jfloat*) env->GetPrimitiveArrayCritical(csrVal, 0);

		int * d_csrRowIndex = 0;
		int * d_csrColIndex = 0;
		float * d_csrVal = 0;

		// FIX: size the int index arrays from their own element type instead of
		// sizeof(float) — the byte counts coincide on supported platforms, but
		// the mismatch was an accident waiting to happen
		cudacall(mallocBest((void** ) &d_csrRowIndex, (m + 1) * sizeof(d_csrRowIndex[0])));
		cudacall(mallocBest((void** ) &d_csrColIndex, nnz * sizeof(d_csrColIndex[0])));
		cudacall(mallocBest((void** ) &d_csrVal, nnz * sizeof(d_csrVal[0])));
		cudacall(cudaMemcpyAsync(d_csrRowIndex, csrRowIndexHostPtr, (size_t) ((m + 1) * sizeof(d_csrRowIndex[0])), cudaMemcpyHostToDevice, cuda_stream));
		cudacall(cudaMemcpyAsync(d_csrColIndex, csrColIndexHostPtr, (size_t) (nnz * sizeof(d_csrColIndex[0])), cudaMemcpyHostToDevice, cuda_stream));
		cudacall(cudaMemcpyAsync(d_csrVal, csrValHostPtr, (size_t) (nnz * sizeof(d_csrVal[0])), cudaMemcpyHostToDevice, cuda_stream));
		//the async copies read the pinned JNI arrays; they must complete before release
		cudaStreamSynchronize(cuda_stream);

		// un-pin the host arrays, as we're done with them
		env->ReleasePrimitiveArrayCritical(csrRow, csrRowIndexHostPtr, 0);
		env->ReleasePrimitiveArrayCritical(csrCol, csrColIndexHostPtr, 0);
		env->ReleasePrimitiveArrayCritical(csrVal, csrValHostPtr, 0);

		#ifdef DEBUG
		printf("\tdoALSWithCSR with m=%d,n=%d,f=%d,nnz=%d,lambda=%f \n.", m, n, f, nnz, lambda);
		#endif
		doALSWithCSR(cuda_stream, d_csrRowIndex, d_csrColIndex, d_csrVal, thetaTHost, XTHost, m, n, f, nnz, lambda, 1);

		//marshal XT back to Java as an m-element array of float[f]
		jclass floatArrayClass = env->FindClass("[F");
		jobjectArray output = env->NewObjectArray(m, floatArrayClass, 0);
		for (int i = 0; i < m; i++) {
			jfloatArray col = env->NewFloatArray(f);
			env->SetFloatArrayRegion(col, 0, f, XTHost + i*f);
			env->SetObjectArrayElement(output, i, col);
			env->DeleteLocalRef(col);
		}
		cudaFreeHost(thetaTHost);
		cudaFreeHost(XTHost);
		//TODO: stream create and destroy expensive?
		checkCudaErrors(cudaStreamSynchronize(cuda_stream));
		checkCudaErrors(cudaStreamDestroy(cuda_stream));
		#ifdef DEBUG
		cudaCheckError();
		#endif
		return output;
	}
	catch (thrust::system_error &e) {
		printf("CUDA error during some function: %s", e.what());
	}
	// FIX: control previously fell off the end of this non-void function after
	// a caught exception — undefined behavior in C++. Return NULL so the Java
	// caller observes null instead of garbage.
	// NOTE(review): the pinned host buffers and device allocations leak on this
	// path; a cleanup pass (or RAII wrappers) would be a worthwhile follow-up.
	return NULL;
}
|
129 | 133 |
|
130 | 134 | JNIEXPORT void JNICALL Java_org_apache_spark_ml_recommendation_CuMFJNIInterface_testjni
|
|
0 commit comments