Skip to content

Commit 343704a

Browse files
committed
Merge pull request jegonzal#96 from dbickson/master
cleaning svd output and improving and simplifying documentation
2 parents 8ac750d + 9dccfec commit 343704a

File tree

2 files changed

+80
-52
lines changed

2 files changed

+80
-52
lines changed

toolkits/collaborative_filtering/collaborative_filtering.dox

Lines changed: 54 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -696,9 +696,10 @@ cat A2
696696

697697
Create a directory named A2, and inside it put the file A2.
698698

699+
699700
Here is an example of running SVD (using one MPI node, one core):
700701
\verbatim
701-
bickson@thrust:~/graphlab2.1/graphlabapi/debug/toolkits/collaborative_filtering$ ./svd A2 --rows=3 --cols=4 --nsv=4 --nv=4 --max_iter=3 --ncpus=1 --quiet=1 --save_vectors=1
702+
bickson@thrust:~/graphlab2.1/graphlabapi/debug/toolkits/collaborative_filtering$ ./svd A2 --rows=3 --cols=4 --nsv=3 --nv=4 --max_iter=3 --quiet=1 --save_vectors=1 --predictions=out
702703
TCP Communication layer constructed.
703704
Loading graph.
704705
Loading graph. Finished in 0.004996
@@ -740,7 +741,7 @@ mpiexec -n XX ./svd [ rest of the command line aguments ]
740741

741742
\verbatim
742743
--training Input file directory.
743-
--nv Number of inner steps of each iterations. Typically the number should be greater than the number of singular values you look for. When nv is higher, accuracy will be higher, but running time slower.
744+
--nv Buffer size of vectors. Typically the buffer size should be greater than the number of singular values you look for. When nv is higher, accuracy will be higher, but running time slower.
744745
--nsv Number of singular values requested. Should be typically less than --nv
745746
--ortho_repeats Number of repeats on the orthogonalization step. Default is 1 (no repeats). Increase this number for higher accuracy but slower execution. The maximal allowed value is 3.
746747
--max_iter Number of allowed restarts. The minimum is 2 (no restart). When max_iter is higher, the result will be more accurate, but the running time will be longer.
@@ -755,15 +756,9 @@ Note: for improving accuracy tol should be reduced. max_iter and nv should be in
755756
By default, the singular values will be written to an output file. When using --save_vectors=1 the singular vectors of the matrices U and V will be written to files as well.
756757
Here is an example of the output files created by the A2 example:
757758
\verbatim
758-
-rw-r--r-- 1 bickson staff 224 Oct 8 13:12 singular_values
759-
-rw-r--r-- 1 bickson staff 102 Oct 8 13:12 V.3_1_of_1
760-
-rw-r--r-- 1 bickson staff 90 Oct 8 13:12 V.2_1_of_1
761-
-rw-r--r-- 1 bickson staff 89 Oct 8 13:12 V.1_1_of_1
762-
-rw-r--r-- 1 bickson staff 92 Oct 8 13:12 V.0_1_of_1
763-
-rw-r--r-- 1 bickson staff 56 Oct 8 13:12 U.3_1_of_1
764-
-rw-r--r-- 1 bickson staff 66 Oct 8 13:12 U.2_1_of_1
765-
-rw-r--r-- 1 bickson staff 68 Oct 8 13:12 U.1_1_of_1
766-
-rw-r--r-- 1 bickson staff 69 Oct 8 13:12 U.0_1_of_1
759+
-rw-r--r-- 1 bickson staff 136 Nov 17 14:19 out.singular_values
760+
-rw-r--r-- 1 bickson staff 353 Nov 17 14:19 out.V_1_of_1
761+
-rw-r--r-- 1 bickson staff 244 Nov 17 14:19 out.U_1_of_1
767762
\endverbatim
768763
<br>
769764
The singular_values file has a straightforward format:
@@ -775,20 +770,58 @@ The singular_values file has a straightforward format:
775770
1.69699593375e-64
776771
\endverbatim
777772

778-
The singular vector file has the following format:
773+
Now let's compare GraphLab's output to the Matlab execution:
779774
\verbatim
780-
> cat V.0_1_of_1
781-
0 -0.50467448230910661
782-
1 -0.46633647481132878
783-
2 -0.44142369037529217
784-
3 -0.57704339935747928
775+
>> A2
776+
ans =
777+
778+
0.8147 0.9134 0.2785 0.9649
779+
0.9058 0.6324 0.5469 0.1576
780+
0.1270 0.0975 0.9575 0.9706
781+
782+
>>[u,d,v] = svd(A2)
783+
784+
u =
785+
786+
-0.7019 0.2772 0.6561
787+
-0.5018 0.4613 -0.7317
788+
-0.5055 -0.8428 -0.1847
789+
790+
791+
d =
792+
793+
2.1610 0 0
794+
0 0.9790 0
795+
0 0 0.5542
796+
797+
798+
v =
799+
800+
-0.5047 0.5481 -0.2737
801+
-0.4663 0.4726 0.2139
802+
-0.4414 -0.4878 -0.7115
803+
-0.5770 -0.4882 0.6108
785804
\endverbatim
786805

787-
The first colum is the vector index, and the second column is the vector value.
788-
Note, that using multicore / distributed the vector columns may appear in a random order.
789-
In that case, you should sort them out, for example using the sort command:
806+
And here is GraphLab output:
790807
\verbatim
791-
sort -g -k 1,1 V.0_1_of_1 > V0
808+
#> cat out.U_1_of_1
809+
1 -0.70192004675202879 -0.27716662376092144 -0.6561150132717597 -5.4738221262688167e-48
810+
2 -0.50180137502007927 -0.46130533561664677 0.73170538289640219 0
811+
3 -0.50547366696553819 0.84283809240244056 0.18471686983009383 -1.3684555315672042e-48
812+
813+
#> cat out.V_1_of_1
814+
1 -0.50467448230910661 -0.54813128066725625 0.27370653410216472 -1.0321593257419978e-64
815+
2 -0.46633647481132878 -0.47257174529014068 -0.21394855212543565 1.2159686437710111e-64
816+
3 -0.44142369037529217 0.48778719783126245 0.71152275587696578 4.1893513895261429e-65
817+
4 -0.57704339935747928 0.4881513027120063 -0.61077501430760717 -4.0044466804328635e-65
818+
819+
#> cat out.singular_values
820+
%%GraphLab SVD Solver library. This file contains the singular values.
821+
2.160971174556
822+
0.9790200922132
823+
0.5541592674291
824+
1.69699593375e-64
792825
\endverbatim
793826

794827
\subsection SVD1 "Understanding the error measure"

toolkits/collaborative_filtering/svd.cpp

Lines changed: 26 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -209,20 +209,21 @@ struct prediction_saver {
209209
struct linear_model_saver_U {
210210
typedef graph_type::vertex_type vertex_type;
211211
typedef graph_type::edge_type edge_type;
212-
213-
int pos;
214-
linear_model_saver_U(int pos): pos(pos) {}
215-
212+
/* save the linear model, using the format:
213+
row_id/col_id factor1 factor2 ... factor_k \n
214+
==> where k is the number of converged singular values
215+
*/
216216
std::string save_vertex(const vertex_type& vertex) const {
217-
if (vertex.id() < (uint)info.rows){
218-
std::string ret;
219-
if(use_ids)
220-
ret = boost::lexical_cast<std::string>(vertex.id() + input_file_offset) + " ";
221-
ret += boost::lexical_cast<std::string>(vertex.data().pvec[pos]) + "\n";
217+
if (vertex.id() < rows){
218+
std::string ret = boost::lexical_cast<std::string>(vertex.id()+input_file_offset) + " ";
219+
for (uint i=0; i< nconv; i++)
220+
ret += boost::lexical_cast<std::string>(vertex.data().pvec[i]) + " ";
221+
ret += "\n";
222222
return ret;
223223
}
224224
else return "";
225225
}
226+
226227
std::string save_edge(const edge_type& edge) const {
227228
return "";
228229
}
@@ -231,30 +232,25 @@ struct linear_model_saver_U {
231232
struct linear_model_saver_V {
232233
typedef graph_type::vertex_type vertex_type;
233234
typedef graph_type::edge_type edge_type;
234-
235-
int pos;
236-
linear_model_saver_V(int pos): pos(pos) {}
237-
235+
/* save the linear model, using the format:
236+
nodeid factor1 factor2 ... factorNLATENT \n
237+
*/
238238
std::string save_vertex(const vertex_type& vertex) const {
239-
if ((vertex.id() >= (uint)info.rows) || info.is_square()){
240-
int rpos = pos;
241-
if (info.is_square())
242-
rpos += data_size;
243-
std::string ret;
244-
if(use_ids)
245-
ret = boost::lexical_cast<std::string>(vertex.id()-rows+input_file_offset) + " ";
246-
ret += boost::lexical_cast<std::string>(vertex.data().pvec[rpos]) + "\n";
239+
if (vertex.id() >= rows){
240+
std::string ret = boost::lexical_cast<std::string>(vertex.id()-rows+input_file_offset) + " ";
241+
for (uint i=0; i< nconv; i++)
242+
ret += boost::lexical_cast<std::string>(vertex.data().pvec[i]) + " ";
243+
ret += "\n";
247244
return ret;
248-
}
249-
else return "";
250245
}
246+
else return "";
247+
}
251248
std::string save_edge(const edge_type& edge) const {
252249
return "";
253250
}
254251
};
255252

256253

257-
258254
/**
259255
* \brief The graph loader function is a line parser used for
260256
* distributed graph construction.
@@ -543,22 +539,21 @@ void lanczos(bipartite_graph_descriptor & info, timer & mytimer, vec & errest,
543539
END_TRACEPOINT(svd_error2);
544540

545541
if (save_vectors){
546-
BEGIN_TRACEPOINT(svd_vectors);
547542
if (nconv == 0)
548543
logstream(LOG_FATAL)<<"No converged vectors. Aborting the save operation" << std::endl;
544+
if (predictions == "")
545+
logstream(LOG_FATAL)<<"Please specify prediction output fie name using the --predictions=filename command"<<std::endl;
549546

547+
BEGIN_TRACEPOINT(svd_vectors);
550548
std::cout << "Saving singular value triplets to files: " << predictions << ".U.* and "<< predictions << ".V.*" <<std::endl;
551549
const bool gzip_output = false;
552550
const bool save_vertices = false;
553551
const bool save_edges = true;
554552
const size_t threads_per_machine = 1;
555-
//save the linear model
556-
for (int i=0; i < nsv; i++){
557-
pgraph->save(predictions + "U." + boost::lexical_cast<std::string>(i), linear_model_saver_U(i),
553+
pgraph->save(predictions + ".U", linear_model_saver_U(),
558554
gzip_output, save_edges, save_vertices, threads_per_machine);
559-
pgraph->save(predictions + "V." + boost::lexical_cast<std::string>(i), linear_model_saver_V(i),
555+
pgraph->save(predictions + ".V", linear_model_saver_V(),
560556
gzip_output, save_edges, save_vertices, threads_per_machine);
561-
}
562557
END_TRACEPOINT(svd_vectors);
563558
}
564559

@@ -762,7 +757,7 @@ int main(int argc, char** argv) {
762757
vec errest;
763758
lanczos( info, timer, errest, vecfile);
764759

765-
write_output_vector(predictions + "singular_values", singular_values, false, "%GraphLab SVD Solver library. This file contains the singular values.");
760+
write_output_vector(predictions + ".singular_values", singular_values, false, "%GraphLab SVD Solver library. This file contains the singular values.");
766761

767762
const double runtime = timer.current_time();
768763
dc.cout() << "----------------------------------------------------------"

0 commit comments

Comments
 (0)