/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "org_apache_spark_ml_recommendation_CuMFJNIInterface.h"
#include "cuda/als.h"
#include <thrust/system_error.h>
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "../../utilities.h"

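/*
 * JNI entry point invoked from the JVM side (CuMFJNIInterface). Given the
 * nnz ratings in CSR form (csrRow/csrCol/csrVal) and the fixed rank-f
 * factors arriving block-sorted in sortedSrcFactors, it calls the CUDA ALS
 * solver to compute X from thetaT and returns the m x f result as a Java
 * float[][].
 */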
JNIEXPORT jobjectArray JNICALL Java_org_apache_spark_ml_recommendation_CuMFJNIInterface_doALSWithCSR
  (JNIEnv * env, jobject obj, jint m, jint n, jint f, jint nnz, jdouble lambda,
   jobjectArray sortedSrcFactors, jintArray csrRow, jintArray csrCol, jfloatArray csrVal) {
  //use multiple GPUs: select a GPU for *this* specific dataset
  int whichGPU = get_gpu();
  checkCudaErrors(cudaSetDevice(whichGPU));
  cudaStream_t cuda_stream;
  checkCudaErrors(cudaStreamCreate(&cuda_stream));
  /* check correctness of the CSR inputs
  int csrRowlen = env->GetArrayLength(csrRow);
  int csrCollen = env->GetArrayLength(csrCol);
  int csrVallen = env->GetArrayLength(csrVal);
  assert(csrRowlen == m + 1);
  assert(csrCollen == nnz);
  assert(csrVallen == nnz);
  */
  int* csrRowIndexHostPtr;
  int* csrColIndexHostPtr;
  float* csrValHostPtr;
  /* debug: dump the CSR row pointers (valid once csrRowIndexHostPtr is pinned below)
  printf("csrRow of len %d: ", m + 1);
  for (int i = 0; i < m + 1; i++) {
    printf("%d ", csrRowIndexHostPtr[i]);
  }
  printf("\n");
  */
  //calculate X from thetaT
  float* thetaTHost;
  cudacall(cudaMallocHost((void**) &thetaTHost, n * f * sizeof(thetaTHost[0])));
  //to be returned
  float* XTHost;
  cudacall(cudaMallocHost((void**) &XTHost, m * f * sizeof(XTHost[0])));
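  // cudaMallocHost gives page-locked (pinned) host buffers; pinned memory is
  // what allows fast asynchronous host<->device transfers when the solver
  // presumably stages thetaTHost in and XTHost out.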

  int numSrcBlocks = env->GetArrayLength(sortedSrcFactors);
  //WARNING: ReleasePrimitiveArrayCritical and DeleteLocalRef below are important;
  //otherwise the result is correct but performance is bad
  int index = 0;
  for (int i = 0; i < numSrcBlocks; i++) {
    jobject factorsPerBlock = env->GetObjectArrayElement(sortedSrcFactors, i);
    int numFactors = env->GetArrayLength((jobjectArray) factorsPerBlock);
    for (int j = 0; j < numFactors; j++) {
      jobject factor = env->GetObjectArrayElement((jobjectArray) factorsPerBlock, j);
      jfloat* factorfloat = (jfloat*) env->GetPrimitiveArrayCritical((jfloatArray) factor, 0);
      memcpy(thetaTHost + index * f, factorfloat, sizeof(float) * f);
      index++;
      env->ReleasePrimitiveArrayCritical((jfloatArray) factor, factorfloat, 0);
      env->DeleteLocalRef(factor);
    }
    env->DeleteLocalRef(factorsPerBlock);
  }
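  // thetaTHost now holds the source factors flattened row by row (f floats
  // per factor), in the block order the caller pre-sorted them into.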
  // get pointers to the raw input data, pinning the arrays in memory
  csrRowIndexHostPtr = (jint*) env->GetPrimitiveArrayCritical(csrRow, 0);
  csrColIndexHostPtr = (jint*) env->GetPrimitiveArrayCritical(csrCol, 0);
  csrValHostPtr = (jfloat*) env->GetPrimitiveArrayCritical(csrVal, 0);

  /*
  printf("thetaTHost of len %d: \n", n * f);
  for (int i = 0; i < n * f; i++) {
    printf("%f ", thetaTHost[i]);
  }
  printf("\n");
  */
  int* d_csrRowIndex = 0;
  int* d_csrColIndex = 0;
  float* d_csrVal = 0;

  cudacall(cudaMalloc((void**) &d_csrRowIndex, (m + 1) * sizeof(int)));
  cudacall(cudaMalloc((void**) &d_csrColIndex, nnz * sizeof(int)));
  cudacall(cudaMalloc((void**) &d_csrVal, nnz * sizeof(float)));
  cudacall(cudaMemcpyAsync(d_csrRowIndex, csrRowIndexHostPtr, (size_t) ((m + 1) * sizeof(int)), cudaMemcpyHostToDevice, cuda_stream));
  cudacall(cudaMemcpyAsync(d_csrColIndex, csrColIndexHostPtr, (size_t) (nnz * sizeof(int)), cudaMemcpyHostToDevice, cuda_stream));
  cudacall(cudaMemcpyAsync(d_csrVal, csrValHostPtr, (size_t) (nnz * sizeof(float)), cudaMemcpyHostToDevice, cuda_stream));
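  // wait for the async copies to land before un-pinning the Java arrays
  // they read from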
  cudaStreamSynchronize(cuda_stream);

  // un-pin the host arrays, as we're done with them
  env->ReleasePrimitiveArrayCritical(csrRow, csrRowIndexHostPtr, 0);
  env->ReleasePrimitiveArrayCritical(csrCol, csrColIndexHostPtr, 0);
  env->ReleasePrimitiveArrayCritical(csrVal, csrValHostPtr, 0);
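
  // Hand off to the CUDA ALS solver declared in cuda/als.h; it consumes the
  // device-side CSR ratings plus thetaTHost and writes the solved factors
  // into XTHost (the trailing 1 is forwarded as given; see als.h for its meaning).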
  printf("\tdoALSWithCSR with m=%d,n=%d,f=%d,nnz=%d,lambda=%f\n", m, n, f, nnz, lambda);
  try {
    doALSWithCSR(cuda_stream, d_csrRowIndex, d_csrColIndex, d_csrVal, thetaTHost, XTHost, m, n, f, nnz, lambda, 1);
  }
  catch (thrust::system_error &e) {
    printf("CUDA error during doALSWithCSR: %s\n", e.what());
  }
  jclass floatArrayClass = env->FindClass("[F");
  jobjectArray output = env->NewObjectArray(m, floatArrayClass, 0);
  for (int i = 0; i < m; i++) {
    jfloatArray row = env->NewFloatArray(f);
    env->SetFloatArrayRegion(row, 0, f, XTHost + i * f);
    env->SetObjectArrayElement(output, i, row);
    env->DeleteLocalRef(row);
  }
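  // output is an m x f float[][]; row i is the solved factor vector for the
  // i-th row of the CSR ratings matrix.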
  cudaFreeHost(thetaTHost);
  cudaFreeHost(XTHost);
  //TODO: stream create and destroy expensive?
  checkCudaErrors(cudaStreamSynchronize(cuda_stream));
  checkCudaErrors(cudaStreamDestroy(cuda_stream));
  cudaCheckError();
  return output;
}

JNIEXPORT void JNICALL Java_org_apache_spark_ml_recommendation_CuMFJNIInterface_testjni
  (JNIEnv * env, jobject obj) {
  printf("*******in native code of testjni ...\n");
}