Merge pull request jegonzal#96 from dbickson/master

dbickson · dbickson · commit 343704a9f1d8 · 2013-11-17T04:31:33.000-08:00
cleaning svd output and improving and simplifying documentation
diff --git a/toolkits/collaborative_filtering/collaborative_filtering.dox b/toolkits/collaborative_filtering/collaborative_filtering.dox
@@ -696,9 +696,10 @@ cat A2
 
 Ceate a directory named A2, and inside it put the file A2.
 
+
 Here is an for running SVD (using one mpi node, one core)
 \verbatim
-bickson@thrust:~/graphlab2.1/graphlabapi/debug/toolkits/collaborative_filtering$ ./svd A2 --rows=3 --cols=4 --nsv=4 --nv=4 --max_iter=3 --ncpus=1 --quiet=1 --save_vectors=1
+bickson@thrust:~/graphlab2.1/graphlabapi/debug/toolkits/collaborative_filtering$ ./svd A2 --rows=3 --cols=4 --nsv=3 --nv=4 --max_iter=3 --quiet=1 --save_vectors=1 --predictions=out
 TCP Communication layer constructed.
 Loading graph.
 Loading graph. Finished in 0.004996
@@ -740,7 +741,7 @@ mpiexec -n XX ./svd [ rest of the command line aguments ]
 
 \verbatim
 --training  Input file directory.
---nv  Number of inner steps of each iterations. Typically the number should be greater than the number of singular values you look for. When nv is higher, accuracy will be higher, but running time slower.
+--nv  Buffer size of vectors. Typically the buffer size should be greater than the number of singular values you look for. When nv is higher, accuracy will be higher, but running time slower.
 --nsv Number of singular values requested. Should be typically less than --nv
 --ortho_repeats Number of repeats on the orthogonalization step. Default is 1 (no repeats). Increase this number for higher accuracy but slower execution. Maximal allowed values is 3.
 --max_iter  Number of allowed restarts. The minimum is 2= no restart. When max_iter is higher, the result will be more accurate, but the running slower. 
@@ -755,15 +756,9 @@ Note: for improving accuracy tol should be reduced. max_iter and nv should be in
 On default, the singular values will be written to an output file. When using --save_vectors=1 the singular vectors of the matrices U and V will be written into file as well.
 Here is an example of the output files created by the A2 example:
 \verbatim
--rw-r--r--   1 bickson  staff        224 Oct  8 13:12 singular_values
--rw-r--r--   1 bickson  staff        102 Oct  8 13:12 V.3_1_of_1
--rw-r--r--   1 bickson  staff         90 Oct  8 13:12 V.2_1_of_1
--rw-r--r--   1 bickson  staff         89 Oct  8 13:12 V.1_1_of_1
--rw-r--r--   1 bickson  staff         92 Oct  8 13:12 V.0_1_of_1
--rw-r--r--   1 bickson  staff         56 Oct  8 13:12 U.3_1_of_1
--rw-r--r--   1 bickson  staff         66 Oct  8 13:12 U.2_1_of_1
--rw-r--r--   1 bickson  staff         68 Oct  8 13:12 U.1_1_of_1
--rw-r--r--   1 bickson  staff         69 Oct  8 13:12 U.0_1_of_1
+-rw-r--r--   1 bickson  staff       136 Nov 17 14:19 out.singular_values
+-rw-r--r--   1 bickson  staff       353 Nov 17 14:19 out.V_1_of_1
+-rw-r--r--   1 bickson  staff       244 Nov 17 14:19 out.U_1_of_1
 \endverbatim
 <br>
 The singular_values file has a straightforward format:
@@ -775,20 +770,58 @@ The singular_values file has a straightforward format:
 1.69699593375e-64
 \endverbatim
 
-The singular vector file has the following format:
+Now let's compare GraphLab's output to Matlab execution:
 \verbatim
-> cat V.0_1_of_1
-0 -0.50467448230910661
-1 -0.46633647481132878
-2 -0.44142369037529217
-3 -0.57704339935747928
+>> A2
+ans =
+
+    0.8147    0.9134    0.2785    0.9649
+    0.9058    0.6324    0.5469    0.1576
+    0.1270    0.0975    0.9575    0.9706
+
+>>[u,d,v] = svd(A2)
+
+u =
+
+   -0.7019    0.2772    0.6561
+   -0.5018    0.4613   -0.7317
+   -0.5055   -0.8428   -0.1847
+
+
+d =
+
+    2.1610         0         0
+         0    0.9790         0
+         0         0    0.5542
+
+
+v =
+
+   -0.5047    0.5481   -0.2737
+   -0.4663    0.4726    0.2139
+   -0.4414   -0.4878   -0.7115
+   -0.5770   -0.4882    0.6108
 \endverbatim
 
-The first colum is the vector index, and the second column is the vector value.
-Note, that using multicore / distributed the vector columns may appear in a random order.
-In that case, you should sort them out, for example using the sort command:
+And here is GraphLab output:
 \verbatim
-sort -g -k 1,1 V.0_1_of_1 > V0
+#> cat out.U_1_of_1
+1 -0.70192004675202879 -0.27716662376092144 -0.6561150132717597 -5.4738221262688167e-48 
+2 -0.50180137502007927 -0.46130533561664677 0.73170538289640219 0 
+3 -0.50547366696553819 0.84283809240244056 0.18471686983009383 -1.3684555315672042e-48 
+
+#> cat out.V_1_of_1
+1 -0.50467448230910661 -0.54813128066725625 0.27370653410216472 -1.0321593257419978e-64 
+2 -0.46633647481132878 -0.47257174529014068 -0.21394855212543565 1.2159686437710111e-64 
+3 -0.44142369037529217 0.48778719783126245 0.71152275587696578 4.1893513895261429e-65 
+4 -0.57704339935747928 0.4881513027120063 -0.61077501430760717 -4.0044466804328635e-65 
+
+#> cat out.singular_values
+%%GraphLab SVD Solver library. This file contains the singular values.
+2.160971174556
+0.9790200922132
+0.5541592674291
+1.69699593375e-64
 \endverbatim
 
 \subsection SVD1 "Understanding the error measure"
diff --git a/toolkits/collaborative_filtering/svd.cpp b/toolkits/collaborative_filtering/svd.cpp
@@ -209,20 +209,21 @@ struct prediction_saver {
 struct linear_model_saver_U {
   typedef graph_type::vertex_type vertex_type;
   typedef graph_type::edge_type   edge_type;
-
-  int pos;
-  linear_model_saver_U(int pos): pos(pos) {}
-
+  /* save the linear model, using the format:
+     row_id/col_id factor1 factor2 ... factor_k \n
+     ==> where k is the number of converged singular values
+  */
   std::string save_vertex(const vertex_type& vertex) const {
-    if (vertex.id() < (uint)info.rows){
-      std::string ret;
-      if(use_ids)
-        ret = boost::lexical_cast<std::string>(vertex.id() + input_file_offset) + " ";
-      ret += boost::lexical_cast<std::string>(vertex.data().pvec[pos]) + "\n";
+    if (vertex.id() < rows){
+      std::string ret = boost::lexical_cast<std::string>(vertex.id()+input_file_offset) + " ";
+      for (uint i=0; i< nconv; i++)
+        ret += boost::lexical_cast<std::string>(vertex.data().pvec[i]) + " ";
+      ret += "\n";
       return ret;
     }
     else return "";
   }
+ 
   std::string save_edge(const edge_type& edge) const {
     return "";
   }
@@ -231,30 +232,25 @@ struct linear_model_saver_U {
 struct linear_model_saver_V {
   typedef graph_type::vertex_type vertex_type;
   typedef graph_type::edge_type   edge_type;
-
-  int pos;
-  linear_model_saver_V(int pos): pos(pos) {}
-
+  /* save the linear model, using the format:
+     col_id factor1 factor2 ... factor_nconv \n
+  */
   std::string save_vertex(const vertex_type& vertex) const {
-    if ((vertex.id() >= (uint)info.rows) || info.is_square()){
-      int rpos = pos;
-      if (info.is_square())
-          rpos += data_size;
-      std::string ret;
-      if(use_ids)
-        ret = boost::lexical_cast<std::string>(vertex.id()-rows+input_file_offset) + " ";
-      ret += boost::lexical_cast<std::string>(vertex.data().pvec[rpos]) + "\n";
+    if (vertex.id() >= rows){
+      std::string ret = boost::lexical_cast<std::string>(vertex.id()-rows+input_file_offset) + " ";
+      for (uint i=0; i< nconv; i++)
+        ret += boost::lexical_cast<std::string>(vertex.data().pvec[i]) + " ";
+      ret += "\n";
       return ret;
-    }
-    else return "";
   }
+  else return "";
+}
   std::string save_edge(const edge_type& edge) const {
     return "";
   }
 }; 
 
 
-
 /**
  * \brief The graph loader function is a line parser used for
  * distributed graph construction.
@@ -543,22 +539,21 @@ void lanczos(bipartite_graph_descriptor & info, timer & mytimer, vec & errest,
   END_TRACEPOINT(svd_error2);
 
   if (save_vectors){
-    BEGIN_TRACEPOINT(svd_vectors);
     if (nconv == 0)
       logstream(LOG_FATAL)<<"No converged vectors. Aborting the save operation" << std::endl;
+    if (predictions == "")
+      logstream(LOG_FATAL)<<"Please specify prediction output fie name using the --predictions=filename command"<<std::endl;
 
+    BEGIN_TRACEPOINT(svd_vectors);
     std::cout << "Saving singular value triplets to files: " << predictions << ".U.* and "<< predictions << ".V.*" <<std::endl;
     const bool gzip_output = false;
     const bool save_vertices = false;
     const bool save_edges = true;
     const size_t threads_per_machine = 1;
-    //save the linear model
-    for (int i=0; i < nsv; i++){
-      pgraph->save(predictions + "U." + boost::lexical_cast<std::string>(i), linear_model_saver_U(i),
+    pgraph->save(predictions + ".U", linear_model_saver_U(),
           gzip_output, save_edges, save_vertices, threads_per_machine);
-      pgraph->save(predictions + "V." + boost::lexical_cast<std::string>(i), linear_model_saver_V(i),
+      pgraph->save(predictions + ".V", linear_model_saver_V(),
           gzip_output, save_edges, save_vertices, threads_per_machine);
-    } 
     END_TRACEPOINT(svd_vectors);
   }
 
@@ -762,7 +757,7 @@ int main(int argc, char** argv) {
   vec errest;
   lanczos( info, timer, errest, vecfile);
 
-  write_output_vector(predictions + "singular_values", singular_values, false, "%GraphLab SVD Solver library. This file contains the singular values.");
+  write_output_vector(predictions + ".singular_values", singular_values, false, "%GraphLab SVD Solver library. This file contains the singular values.");
 
   const double runtime = timer.current_time();
   dc.cout() << "----------------------------------------------------------"