1717import com .google .common .primitives .Doubles ;
1818import com .medallia .word2vec .thrift .Word2VecModelThrift ;
1919import com .medallia .word2vec .util .Common ;
20- import org .apache .log4j .Logger ;
20+ import com .medallia .word2vec .util .ProfilingTimer ;
21+ import com .medallia .word2vec .util .AC ;
22+
2123
2224/**
2325 * Represents the Word2Vec model, containing vectors for each word
3133 * @see #forSearch()
3234 */
3335public class Word2VecModel {
34- static Logger logger = Logger .getLogger (Word2VecModel .class );
35-
3636 final List <String > vocab ;
3737 final int layerSize ;
3838 final double [] vectors ;
@@ -80,23 +80,35 @@ public static Word2VecModel fromTextFile(File file) throws IOException {
8080 }
8181
8282 /**
83- * Forwards to {@link #fromBinFile(File, ByteOrder)} with the default
84- * ByteOrder.LITTLE_ENDIAN
83+ * Forwards to {@link #fromBinFile(File, ByteOrder, ProfilingTimer)} with the default
84+ * {@code ByteOrder.LITTLE_ENDIAN} byte order and {@code ProfilingTimer.NONE} as the timer.
8585 */
8686 public static Word2VecModel fromBinFile (File file )
8787 throws IOException {
88- return fromBinFile (file , ByteOrder .LITTLE_ENDIAN );
88+ return fromBinFile (file , ByteOrder .LITTLE_ENDIAN , ProfilingTimer .NONE );
89+ }
90+
91+ /**
92+ * Forwards to {@link #fromBinFile(File, ByteOrder, ProfilingTimer)} with the given byte order and {@code ProfilingTimer.NONE} as the timer.
93+ */
94+ public static Word2VecModel fromBinFile (File file , ByteOrder byteOrder )
95+ throws IOException {
96+ return fromBinFile (file , byteOrder , ProfilingTimer .NONE );
8997 }
9098
9199 /**
92100 * @return {@link Word2VecModel} created from the binary representation output
93101 * by the open source C version of word2vec using the given byte order; loading progress is reported to the given {@code timer}.
94102 */
95- public static Word2VecModel fromBinFile (File file , ByteOrder byteOrder )
103+ public static Word2VecModel fromBinFile (File file , ByteOrder byteOrder , ProfilingTimer timer )
96104 throws IOException {
97105
98- try (FileInputStream fis = new FileInputStream (file )) {
106+ try (
107+ final FileInputStream fis = new FileInputStream (file );
108+ final AC ac = timer .start ("Loading vectors from bin file" )
109+ ) {
99110 final FileChannel channel = fis .getChannel ();
111+ timer .start ("Reading gigabyte #1" );
100112 MappedByteBuffer buffer =
101113 channel .map (
102114 FileChannel .MapMode .READ_ONLY ,
@@ -122,8 +134,10 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
122134
123135 final int vocabSize = Integer .parseInt (firstLine .substring (0 , index ));
124136 final int layerSize = Integer .parseInt (firstLine .substring (index + 1 ));
125- logger .info (
126- String .format ("Loading %d vectors with dimensionality %d" , vocabSize , layerSize ));
137+ timer .appendToLog (String .format (
138+ "Loading %d vectors with dimensionality %d" ,
139+ vocabSize ,
140+ layerSize ));
127141
128142 List <String > vocabs = new ArrayList <String >(vocabSize );
129143 double vectors [] = new double [vocabSize * layerSize ];
@@ -156,7 +170,7 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
156170 final long now = System .currentTimeMillis ();
157171 if (now - lastLogMessage > 1000 ) {
158172 final double percentage = ((double ) (lineno + 1 ) / (double ) vocabSize ) * 100.0 ;
159- logger . info (
173+ timer . appendToLog (
160174 String .format ("Loaded %d/%d vectors (%f%%)" , lineno + 1 , vocabSize , percentage ));
161175 lastLogMessage = now ;
162176 }
@@ -165,12 +179,11 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
165179 if (buffer .position () > ONE_GB ) {
166180 final int newPosition = (int ) (buffer .position () - ONE_GB );
167181 final long size = Math .min (channel .size () - ONE_GB * bufferCount , Integer .MAX_VALUE );
168- logger .debug (
169- String .format (
170- "Remapping for GB number %d. Start: %d, size: %d" ,
171- bufferCount ,
172- ONE_GB * bufferCount ,
173- size ));
182+ timer .endAndStart (
183+ "Reading gigabyte #%d. Start: %d, size: %d" ,
184+ bufferCount ,
185+ ONE_GB * bufferCount ,
186+ size );
174187 buffer = channel .map (
175188 FileChannel .MapMode .READ_ONLY ,
176189 ONE_GB * bufferCount ,
@@ -180,6 +193,7 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
180193 bufferCount += 1 ;
181194 }
182195 }
196+ timer .end ();
183197
184198 return new Word2VecModel (vocabs , layerSize , vectors );
185199 }
0 commit comments