Commit 2f6cb87

initial commit
1 parent 26f330e commit 2f6cb87

File tree: 7 files changed (+2156, -0 lines changed)

als/README.md

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# CuMF: CUDA-Accelerated ALS on multiple GPUs

This folder contains:
(1) the CUDA kernel code implementing ALS (alternating least squares), and
(2) the JNI code to link to and accelerate the ALS.scala program in Spark MLlib.

## Technical details
By optimizing memory access and parallelism, cuMF is much faster and more cost-efficient than state-of-the-art CPU-based solutions.

More details can be found in:

1) This NVIDIA GTC 2016 talk
slides: <http://www.slideshare.net/tanwei/s6211-cumf-largescale-matrix-factorization-on-just-one-machine-with-gpus>
video: <http://on-demand.gputechconf.com/gtc/2016/video/S6211.html>

2) This HPDC 2016 paper: "Faster and Cheaper: Parallelizing Large-Scale Matrix Factorization on GPUs"
<http://arxiv.org/abs/1603.03820>

## Build
There are scripts to build the program locally, run it in local mode, and run it in distributed mode.
You should modify the first line or two of each script to point to your own installation of Java, Spark, Scala, and your data files.

To build, first set $CUDA_ROOT to your CUDA installation (e.g., /usr/local/cuda) and $JAVA_HOME to your JDK (not a JRE; the JDK is needed to build the JNI code).

Then run:

    build.sh
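For example, a typical build invocation might look like the following (the paths are placeholders for illustration; substitute your own JDK and CUDA locations):

```bash
# Placeholder paths -- adjust to your own installations
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64   # must be a JDK, not a JRE
export CUDA_ROOT=/usr/local/cuda
./build.sh    # compiles the CUDA + JNI code into libGPUALS.so
```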
## Run
To run, first set $SPARK_HOME and $JAVA_HOME.

To submit to a local Spark installation:
run runLocal.sh, specifying the mode (gpu or cpu), the number of cores, the lambda, and the rank. Prepare a data file and put its name after "--kryo" in the runLocal.sh script.

Note: the rank value has to be a multiple of 10 (e.g., 10, 50, 100, 200). For example:

    ./runLocal.sh gpu 12 0.058 100
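Putting it together, a complete local run might look like this (again, the paths are placeholders):

```bash
# Placeholder paths -- adjust to your own installations
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export SPARK_HOME=/opt/spark
# mode=gpu, 12 cores, lambda=0.058, rank=100 (the rank must be a multiple of 10)
./runLocal.sh gpu 12 0.058 100
```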

als/build.sh

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
#!/bin/bash
#build the jni cpp code and cuda code for ALS
#unamestr=`uname -m`
if [ -z "${JAVA_HOME}" ]; then
    echo "Please set JAVA_HOME!"
    exit 1
else
    echo "use existing JAVA_HOME " $JAVA_HOME
fi

if [ -z "${CUDA_ROOT}" ]; then
    echo "Please set CUDA_ROOT to the cuda installation, say /usr/local/cuda !"
    exit 1
else
    echo "use existing CUDA_ROOT " $CUDA_ROOT
fi

echo "compile the cuda & native code"
# builds libGPUALS.so; the -gencode flags target compute capability 3.5 (Kepler)
$CUDA_ROOT/bin/nvcc -shared -D_USE_GPU_ -I/usr/include -I$JAVA_HOME/include -I$JAVA_HOME/include/linux ../utilities.cu src/cuda/als.cu src/CuMFJNIInterface.cpp -o libGPUALS.so -Xcompiler "-fPIC" -m64 -use_fast_math -rdc=true -gencode arch=compute_35,code=sm_35 -gencode arch=compute_35,code=compute_35 -O3 -Xptxas -dlcm=ca -L${CUDA_ROOT}/lib64 -lcublas -lcusparse

#echo "build spark"
#SPARK_HOME=../../Spark-MLlib/
#cd $SPARK_HOME
#build/mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package

#echo "build spark distribution"
#cd $SPARK_HOME
#./dev/make-distribution.sh -Pnetlib-lgpl -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.2
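After build.sh finishes, a quick sanity check (a suggested step using standard tools, not part of the repository's scripts) is to confirm that the shared library was produced and that it links against cuBLAS and cuSPARSE:

```bash
ls -l libGPUALS.so                              # the JNI/CUDA library emitted by build.sh
ldd libGPUALS.so | grep -E 'cublas|cusparse'    # should list libcublas and libcusparse
# (if they show as "not found", add $CUDA_ROOT/lib64 to LD_LIBRARY_PATH at runtime)
```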

als/src/CuMFJNIInterface.cpp

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "org_apache_spark_ml_recommendation_CuMFJNIInterface.h"
#include "cuda/als.h"
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <thrust/system_error.h>  // for the thrust::system_error caught below (may also be pulled in via als.h)
#include "../../utilities.h"

JNIEXPORT jobjectArray JNICALL Java_org_apache_spark_ml_recommendation_CuMFJNIInterface_doALSWithCSR
  (JNIEnv * env, jobject obj, jint m, jint n, jint f, jint nnz, jdouble lambda, jobjectArray sortedSrcFactors, jintArray csrRow, jintArray csrCol, jfloatArray csrVal){
    //checkCudaErrors(cudaSetDevice(1));
    //use multiple GPUs
    //select a GPU for *this* specific dataset
    int whichGPU = get_gpu();
    checkCudaErrors(cudaSetDevice(whichGPU));
    cudaStream_t cuda_stream;
    cudaStreamCreate(&cuda_stream);
    /* check correctness
    int csrRowlen = env->GetArrayLength(csrRow);
    int csrCollen = env->GetArrayLength(csrCol);
    int csrVallen = env->GetArrayLength(csrVal);
    assert(csrRowlen == m + 1);
    assert(csrCollen == nnz);
    assert(csrVallen == nnz);
    */
    int* csrRowIndexHostPtr;
    int* csrColIndexHostPtr;
    float* csrValHostPtr;
    /*
    printf("csrRow of len %d: ", len);
    for (int i = 0; i < len; i++) {
        printf("%d ", body[i]);
    }
    printf("\n");
    */
    //calculate X from thetaT
    float* thetaTHost;
    cudacall(cudaMallocHost( (void** ) &thetaTHost, n * f * sizeof(thetaTHost[0])) );
    //to be returned
    float* XTHost;
    cudacall(cudaMallocHost( (void** ) &XTHost, m * f * sizeof(XTHost[0])) );

    int numSrcBlocks = env->GetArrayLength(sortedSrcFactors);
    //WARNING: ReleasePrimitiveArrayCritical and DeleteLocalRef are important;
    //otherwise the result is correct but performance is bad
    int index = 0;
    for(int i = 0; i < numSrcBlocks; i++){
        jobject factorsPerBlock = env->GetObjectArrayElement(sortedSrcFactors, i);
        int numFactors = env->GetArrayLength((jobjectArray)factorsPerBlock);
        for(int j = 0; j < numFactors; j++){
            jobject factor = env->GetObjectArrayElement((jobjectArray)factorsPerBlock, j);
            jfloat *factorfloat = (jfloat *) env->GetPrimitiveArrayCritical( (jfloatArray)factor, 0);
            memcpy(thetaTHost + index*f, factorfloat, sizeof(float)*f);
            index ++;
            env->ReleasePrimitiveArrayCritical((jfloatArray)factor, factorfloat, 0);
            env->DeleteLocalRef(factor);
        }
        env->DeleteLocalRef(factorsPerBlock);
    }
    // get a pointer to the raw input data, pinning them in memory
    csrRowIndexHostPtr = (jint*) env->GetPrimitiveArrayCritical(csrRow, 0);
    csrColIndexHostPtr = (jint*) env->GetPrimitiveArrayCritical(csrCol, 0);
    csrValHostPtr = (jfloat*) env->GetPrimitiveArrayCritical(csrVal, 0);

    /*
    printf("thetaTHost of len %d: \n", n*f);
    for (int i = 0; i < n*f; i++) {
        printf("%f ", thetaTHost[i]);
    }
    printf("\n");
    */
    int * d_csrRowIndex = 0;
    int * d_csrColIndex = 0;
    float * d_csrVal = 0;

    cudacall(cudaMalloc((void** ) &d_csrRowIndex, (m + 1) * sizeof(int)));
    cudacall(cudaMalloc((void** ) &d_csrColIndex, nnz * sizeof(int)));
    cudacall(cudaMalloc((void** ) &d_csrVal, nnz * sizeof(float)));
    cudacall(cudaMemcpyAsync(d_csrRowIndex, csrRowIndexHostPtr, (size_t) ((m + 1) * sizeof(int)), cudaMemcpyHostToDevice, cuda_stream));
    cudacall(cudaMemcpyAsync(d_csrColIndex, csrColIndexHostPtr, (size_t) (nnz * sizeof(int)), cudaMemcpyHostToDevice, cuda_stream));
    cudacall(cudaMemcpyAsync(d_csrVal, csrValHostPtr, (size_t) (nnz * sizeof(float)), cudaMemcpyHostToDevice, cuda_stream));
    cudaStreamSynchronize(cuda_stream);

    // un-pin the host arrays, as we're done with them
    env->ReleasePrimitiveArrayCritical(csrRow, csrRowIndexHostPtr, 0);
    env->ReleasePrimitiveArrayCritical(csrCol, csrColIndexHostPtr, 0);
    env->ReleasePrimitiveArrayCritical(csrVal, csrValHostPtr, 0);

    printf("\tdoALSWithCSR with m=%d,n=%d,f=%d,nnz=%d,lambda=%f.\n", m, n, f, nnz, lambda);
    try{
        doALSWithCSR(cuda_stream, d_csrRowIndex, d_csrColIndex, d_csrVal, thetaTHost, XTHost, m, n, f, nnz, lambda, 1);
    }
    catch (thrust::system_error &e) {
        printf("CUDA error during doALSWithCSR: %s", e.what());
    }
    jclass floatArrayClass = env->FindClass("[F");
    jobjectArray output = env->NewObjectArray(m, floatArrayClass, 0);
    for (int i = 0; i < m; i++) {
        jfloatArray col = env->NewFloatArray(f);
        env->SetFloatArrayRegion(col, 0, f, XTHost + i*f);
        env->SetObjectArrayElement(output, i, col);
        env->DeleteLocalRef(col);
    }
    cudaFreeHost(thetaTHost);
    cudaFreeHost(XTHost);
    //TODO: stream create and destroy expensive?
    checkCudaErrors(cudaStreamSynchronize(cuda_stream));
    checkCudaErrors(cudaStreamDestroy(cuda_stream));
    cudaCheckError();
    return output;
}

JNIEXPORT void JNICALL Java_org_apache_spark_ml_recommendation_CuMFJNIInterface_testjni
  (JNIEnv * env, jobject obj){
    printf("*******in native code of testjni ...\n");
}
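The header included at the top of this file, org_apache_spark_ml_recommendation_CuMFJNIInterface.h, is the JNI header for a Scala/Java class org.apache.spark.ml.recommendation.CuMFJNIInterface that declares doALSWithCSR and testjni as native methods. A minimal sketch of how such a header is typically regenerated with JDK 8 tooling (the classpath below is an assumption about the Spark MLlib build layout, not taken from this commit):

```bash
# Regenerate the JNI header from the compiled Scala/Java class.
# The classpath is hypothetical -- point it at wherever CuMFJNIInterface.class lives.
javah -classpath $SPARK_HOME/mllib/target/classes \
      -d als/src \
      org.apache.spark.ml.recommendation.CuMFJNIInterface
# On JDK 9+, javah was removed; `javac -h <output-dir>` on the source file emits the same header.
```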
