Commit 2f6cb87

initial commit
1 parent 26f330e commit 2f6cb87

File tree: 7 files changed (+2156, -0 lines changed)

als/README.md

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# CuMF: CUDA-Accelerated ALS on multiple GPUs

This folder contains:
(1) the CUDA kernel code implementing ALS (alternating least squares), and
(2) the JNI code to link to and accelerate the ALS.scala program in Spark MLlib.

## Technical details
By optimizing memory access and parallelism, cuMF is much faster and more cost-efficient than state-of-the-art CPU-based solutions.

More details can be found in:

1) This NVIDIA GTC 2016 talk
slides: <http://www.slideshare.net/tanwei/s6211-cumf-largescale-matrix-factorization-on-just-one-machine-with-gpus>
video: <http://on-demand.gputechconf.com/gtc/2016/video/S6211.html>

2) This HPDC 2016 paper: "Faster and Cheaper: Parallelizing Large-Scale Matrix Factorization on GPUs"
<http://arxiv.org/abs/1603.03820>

## Build
There are scripts to build the program locally, run it in local mode, and run it in distributed mode.
You should modify the first line or two of each script to point to your own installation of Java, Spark, Scala, and your data files.

To build, first set $CUDA_ROOT to your CUDA installation (e.g., /usr/local/cuda) and $JAVA_HOME to your JDK (not a JRE; the JDK is needed to build the JNI code).

Then run:

    build.sh
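For example, a typical build invocation might look like the following (the paths are placeholders for illustration; substitute your own JDK and CUDA locations):

```bash
# Placeholder paths -- adjust to your own installations
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64   # must be a JDK, not a JRE
export CUDA_ROOT=/usr/local/cuda
./build.sh    # compiles the CUDA + JNI code into libGPUALS.so
```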
## Run
To run, first set $SPARK_HOME and $JAVA_HOME.

To submit to a local Spark installation:
run runLocal.sh, specifying the mode (gpu or cpu), the number of cores, the lambda, and the rank. Prepare a data file and put its name after "--kryo" in the runLocal.sh script.

Note: the rank value has to be a multiple of 10 (e.g., 10, 50, 100, 200). For example:

    ./runLocal.sh gpu 12 0.058 100
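Putting it together, a complete local run might look like this (again, the paths are placeholders):

```bash
# Placeholder paths -- adjust to your own installations
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export SPARK_HOME=/opt/spark
# mode=gpu, 12 cores, lambda=0.058, rank=100 (the rank must be a multiple of 10)
./runLocal.sh gpu 12 0.058 100
```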

als/build.sh

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
#!/bin/bash
#build the jni cpp code and cuda code for ALS
#unamestr=`uname -m`
if [ -z "${JAVA_HOME}" ]; then
    echo "Please set JAVA_HOME!"
    exit 1
else
    echo "use existing JAVA_HOME " $JAVA_HOME
fi

if [ -z "${CUDA_ROOT}" ]; then
    echo "Please set CUDA_ROOT to the cuda installation, say /usr/local/cuda !"
    exit 1
else
    echo "use existing CUDA_ROOT " $CUDA_ROOT
fi

echo "compile the cuda & native code"
# builds libGPUALS.so; the -gencode flags target compute capability 3.5 (Kepler)
$CUDA_ROOT/bin/nvcc -shared -D_USE_GPU_ -I/usr/include -I$JAVA_HOME/include -I$JAVA_HOME/include/linux ../utilities.cu src/cuda/als.cu src/CuMFJNIInterface.cpp -o libGPUALS.so -Xcompiler "-fPIC" -m64 -use_fast_math -rdc=true -gencode arch=compute_35,code=sm_35 -gencode arch=compute_35,code=compute_35 -O3 -Xptxas -dlcm=ca -L${CUDA_ROOT}/lib64 -lcublas -lcusparse

#echo "build spark"
#SPARK_HOME=../../Spark-MLlib/
#cd $SPARK_HOME
#build/mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package

#echo "build spark distribution"
#cd $SPARK_HOME
#./dev/make-distribution.sh -Pnetlib-lgpl -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.2
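After build.sh finishes, a quick sanity check (a suggested step using standard tools, not part of the repository's scripts) is to confirm that the shared library was produced and that it links against cuBLAS and cuSPARSE:

```bash
ls -l libGPUALS.so                              # the JNI/CUDA library emitted by build.sh
ldd libGPUALS.so | grep -E 'cublas|cusparse'    # should list libcublas and libcusparse
# (if they show as "not found", add $CUDA_ROOT/lib64 to LD_LIBRARY_PATH at runtime)
```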

als/src/CuMFJNIInterface.cpp

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "org_apache_spark_ml_recommendation_CuMFJNIInterface.h"
#include "cuda/als.h"
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <thrust/system_error.h>  // for the thrust::system_error caught below (may also be pulled in via als.h)
#include "../../utilities.h"

JNIEXPORT jobjectArray JNICALL Java_org_apache_spark_ml_recommendation_CuMFJNIInterface_doALSWithCSR
  (JNIEnv * env, jobject obj, jint m, jint n, jint f, jint nnz, jdouble lambda, jobjectArray sortedSrcFactors, jintArray csrRow, jintArray csrCol, jfloatArray csrVal){
    //checkCudaErrors(cudaSetDevice(1));
    //use multiple GPUs
    //select a GPU for *this* specific dataset
    int whichGPU = get_gpu();
    checkCudaErrors(cudaSetDevice(whichGPU));
    cudaStream_t cuda_stream;
    cudaStreamCreate(&cuda_stream);
    /* check correctness
    int csrRowlen = env->GetArrayLength(csrRow);
    int csrCollen = env->GetArrayLength(csrCol);
    int csrVallen = env->GetArrayLength(csrVal);
    assert(csrRowlen == m + 1);
    assert(csrCollen == nnz);
    assert(csrVallen == nnz);
    */
    int* csrRowIndexHostPtr;
    int* csrColIndexHostPtr;
    float* csrValHostPtr;
    /*
    printf("csrRow of len %d: ", len);
    for (int i = 0; i < len; i++) {
        printf("%d ", body[i]);
    }
    printf("\n");
    */
    //calculate X from thetaT
    float* thetaTHost;
    cudacall(cudaMallocHost( (void** ) &thetaTHost, n * f * sizeof(thetaTHost[0])) );
    //to be returned
    float* XTHost;
    cudacall(cudaMallocHost( (void** ) &XTHost, m * f * sizeof(XTHost[0])) );

    int numSrcBlocks = env->GetArrayLength(sortedSrcFactors);
    //WARNING: ReleasePrimitiveArrayCritical and DeleteLocalRef are important;
    //otherwise the result is correct but performance is bad
    int index = 0;
    for(int i = 0; i < numSrcBlocks; i++){
        jobject factorsPerBlock = env->GetObjectArrayElement(sortedSrcFactors, i);
        int numFactors = env->GetArrayLength((jobjectArray)factorsPerBlock);
        for(int j = 0; j < numFactors; j++){
            jobject factor = env->GetObjectArrayElement((jobjectArray)factorsPerBlock, j);
            jfloat *factorfloat = (jfloat *) env->GetPrimitiveArrayCritical( (jfloatArray)factor, 0);
            memcpy(thetaTHost + index*f, factorfloat, sizeof(float)*f);
            index ++;
            env->ReleasePrimitiveArrayCritical((jfloatArray)factor, factorfloat, 0);
            env->DeleteLocalRef(factor);
        }
        env->DeleteLocalRef(factorsPerBlock);
    }
    // get a pointer to the raw input data, pinning them in memory
    csrRowIndexHostPtr = (jint*) env->GetPrimitiveArrayCritical(csrRow, 0);
    csrColIndexHostPtr = (jint*) env->GetPrimitiveArrayCritical(csrCol, 0);
    csrValHostPtr = (jfloat*) env->GetPrimitiveArrayCritical(csrVal, 0);

    /*
    printf("thetaTHost of len %d: \n", n*f);
    for (int i = 0; i < n*f; i++) {
        printf("%f ", thetaTHost[i]);
    }
    printf("\n");
    */
    int * d_csrRowIndex = 0;
    int * d_csrColIndex = 0;
    float * d_csrVal = 0;

    cudacall(cudaMalloc((void** ) &d_csrRowIndex, (m + 1) * sizeof(int)));
    cudacall(cudaMalloc((void** ) &d_csrColIndex, nnz * sizeof(int)));
    cudacall(cudaMalloc((void** ) &d_csrVal, nnz * sizeof(float)));
    cudacall(cudaMemcpyAsync(d_csrRowIndex, csrRowIndexHostPtr, (size_t) ((m + 1) * sizeof(int)), cudaMemcpyHostToDevice, cuda_stream));
    cudacall(cudaMemcpyAsync(d_csrColIndex, csrColIndexHostPtr, (size_t) (nnz * sizeof(int)), cudaMemcpyHostToDevice, cuda_stream));
    cudacall(cudaMemcpyAsync(d_csrVal, csrValHostPtr, (size_t) (nnz * sizeof(float)), cudaMemcpyHostToDevice, cuda_stream));
    cudaStreamSynchronize(cuda_stream);

    // un-pin the host arrays, as we're done with them
    env->ReleasePrimitiveArrayCritical(csrRow, csrRowIndexHostPtr, 0);
    env->ReleasePrimitiveArrayCritical(csrCol, csrColIndexHostPtr, 0);
    env->ReleasePrimitiveArrayCritical(csrVal, csrValHostPtr, 0);

    printf("\tdoALSWithCSR with m=%d,n=%d,f=%d,nnz=%d,lambda=%f.\n", m, n, f, nnz, lambda);
    try{
        doALSWithCSR(cuda_stream, d_csrRowIndex, d_csrColIndex, d_csrVal, thetaTHost, XTHost, m, n, f, nnz, lambda, 1);
    }
    catch (thrust::system_error &e) {
        printf("CUDA error during doALSWithCSR: %s", e.what());
    }
    jclass floatArrayClass = env->FindClass("[F");
    jobjectArray output = env->NewObjectArray(m, floatArrayClass, 0);
    for (int i = 0; i < m; i++) {
        jfloatArray col = env->NewFloatArray(f);
        env->SetFloatArrayRegion(col, 0, f, XTHost + i*f);
        env->SetObjectArrayElement(output, i, col);
        env->DeleteLocalRef(col);
    }
    cudaFreeHost(thetaTHost);
    cudaFreeHost(XTHost);
    //TODO: stream create and destroy expensive?
    checkCudaErrors(cudaStreamSynchronize(cuda_stream));
    checkCudaErrors(cudaStreamDestroy(cuda_stream));
    cudaCheckError();
    return output;
}

JNIEXPORT void JNICALL Java_org_apache_spark_ml_recommendation_CuMFJNIInterface_testjni
  (JNIEnv * env, jobject obj){
    printf("*******in native code of testjni ...\n");
}
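The header included at the top of this file, org_apache_spark_ml_recommendation_CuMFJNIInterface.h, is the JNI header for a Scala/Java class org.apache.spark.ml.recommendation.CuMFJNIInterface that declares doALSWithCSR and testjni as native methods. A minimal sketch of how such a header is typically regenerated with JDK 8 tooling (the classpath below is an assumption about the Spark MLlib build layout, not taken from this commit):

```bash
# Regenerate the JNI header from the compiled Scala/Java class.
# The classpath is hypothetical -- point it at wherever CuMFJNIInterface.class lives.
javah -classpath $SPARK_HOME/mllib/target/classes \
      -d als/src \
      org.apache.spark.ml.recommendation.CuMFJNIInterface
# On JDK 9+, javah was removed; `javac -h <output-dir>` on the source file emits the same header.
```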
