2727
/**
 * Represents the Word2Vec model, containing vectors for each word.
 * <p>
 * Instances of this class are obtained via:
 * <ul>
 * <li> {@link #trainer()}
 * <li> {@link #fromThrift(Word2VecModelThrift)}
 * </ul>
 *
 * @see #forSearch()
 */
3939public class Word2VecModel {
@@ -42,13 +42,13 @@ public class Word2VecModel {
4242 final List <String > vocab ;
4343 final int layerSize ;
4444 final double [] vectors ;
45-
45+
/**
 * Builds a model from its vocabulary, layer size and vector data.
 *
 * @param vocab words of the model; copied into an immutable list
 * @param layerSize value stored as this model's layer size
 * @param vectors vector data; the array reference is stored as-is, without copying
 */
Word2VecModel(Iterable<String> vocab, int layerSize, double[] vectors) {
  this.layerSize = layerSize;
  this.vectors = vectors;
  this.vocab = ImmutableList.copyOf(vocab);
}
51-
51+
5252 /** @return Vocabulary */
5353 public Iterable <String > getVocab () {
5454 return vocab ;
@@ -58,15 +58,15 @@ public Iterable<String> getVocab() {
5858 public Searcher forSearch () {
5959 return new SearcherImpl (this );
6060 }
61-
61+
6262 /** @return Serializable thrift representation */
6363 public Word2VecModelThrift toThrift () {
6464 return new Word2VecModelThrift ()
65- .setVocab (vocab )
66- .setLayerSize (layerSize )
67- .setVectors (Doubles .asList (vectors ));
65+ .setVocab (vocab )
66+ .setLayerSize (layerSize )
67+ .setVectors (Doubles .asList (vectors ));
6868 }
69-
69+
7070 /** @return {@link Word2VecModel} created from a thrift representation */
7171 public static Word2VecModel fromThrift (Word2VecModelThrift thrift ) {
7272 return new Word2VecModel (
@@ -85,22 +85,22 @@ public static Word2VecModel fromTextFile(File file) throws IOException {
8585 }
8686
8787 /**
88- * Forwards to {@link #fromBinFile(File, ByteOrder)} with the default
89- * ByteOrder.LITTLE_ENDIAN
90- */
91- public static Word2VecModel fromBinFile (File file )
92- throws IOException {
93- return fromBinFile (file , ByteOrder .LITTLE_ENDIAN );
94- }
95-
96- /**
97- * @return {@link Word2VecModel} created from the binary representation output
98- * by the open source C version of word2vec using the given byte order.
99- */
100- public static Word2VecModel fromBinFile (File file , ByteOrder byteOrder )
101- throws IOException {
102-
103- try (FileInputStream fis = new FileInputStream (file );) {
88+ * Forwards to {@link #fromBinFile(File, ByteOrder)} with the default
89+ * ByteOrder.LITTLE_ENDIAN
90+ */
91+ public static Word2VecModel fromBinFile (File file )
92+ throws IOException {
93+ return fromBinFile (file , ByteOrder .LITTLE_ENDIAN );
94+ }
95+
96+ /**
97+ * @return {@link Word2VecModel} created from the binary representation output
98+ * by the open source C version of word2vec using the given byte order.
99+ */
100+ public static Word2VecModel fromBinFile (File file , ByteOrder byteOrder )
101+ throws IOException {
102+
103+ try (FileInputStream fis = new FileInputStream (file );) {
104104 final FileChannel channel = fis .getChannel ();
105105 final long oneGB = 1024 * 1024 * 1024 ;
106106 MappedByteBuffer buffer =
@@ -110,66 +110,66 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
110110 Math .min (channel .size (), Integer .MAX_VALUE ));
111111 buffer .order (byteOrder );
112112 int bufferCount = 1 ;
113- // Java's NIO only allows memory-mapping up to 2GB. To work around this problem, we re-map
114- // every gigabyte. To calculate offsets correctly, we have to keep track how many gigabytes
115- // we've already skipped. That's what this is for.
116-
117- StringBuilder sb = new StringBuilder ();
118- char c = (char )buffer .get ();
119- while (c != '\n' ) {
120- sb .append (c );
121- c = (char )buffer .get ();
122- }
123- String firstLine = sb .toString ();
124- int index = firstLine .indexOf (' ' );
125- Preconditions .checkState (index != -1 ,
126- "Expected a space in the first line of file '%s': '%s'" ,
127- file .getAbsolutePath (), firstLine );
113+ // Java's NIO only allows memory-mapping up to 2GB. To work around this problem, we re-map
114+ // every gigabyte. To calculate offsets correctly, we have to keep track how many gigabytes
115+ // we've already skipped. That's what this is for.
116+
117+ StringBuilder sb = new StringBuilder ();
118+ char c = (char ) buffer .get ();
119+ while (c != '\n' ) {
120+ sb .append (c );
121+ c = (char ) buffer .get ();
122+ }
123+ String firstLine = sb .toString ();
124+ int index = firstLine .indexOf (' ' );
125+ Preconditions .checkState (index != -1 ,
126+ "Expected a space in the first line of file '%s': '%s'" ,
127+ file .getAbsolutePath (), firstLine );
128128
129129 final int vocabSize = Integer .parseInt (firstLine .substring (0 , index ));
130- final int layerSize = Integer .parseInt (firstLine .substring (index + 1 ));
130+ final int layerSize = Integer .parseInt (firstLine .substring (index + 1 ));
131131 logger .info (
132132 String .format ("Loading %d vectors with dimensionality %d" , vocabSize , layerSize ));
133133
134- List <String > vocabs = new ArrayList <String >(vocabSize );
135- double vectors [] = new double [vocabSize * layerSize ];
134+ List <String > vocabs = new ArrayList <String >(vocabSize );
135+ double vectors [] = new double [vocabSize * layerSize ];
136136
137137 long lastLogMessage = System .currentTimeMillis ();
138138 final float [] floats = new float [layerSize ];
139- for (int lineno = 0 ; lineno < vocabSize ; lineno ++) {
139+ for (int lineno = 0 ; lineno < vocabSize ; lineno ++) {
140140 // read vocab
141141 sb .setLength (0 );
142- c = (char )buffer .get ();
143- while (c != ' ' ) {
144- // ignore newlines in front of words (some binary files have newline,
145- // some don't)
146- if (c != '\n' ) {
147- sb .append (c );
148- }
149- c = (char )buffer .get ();
150- }
151- vocabs .add (sb .toString ());
142+ c = (char ) buffer .get ();
143+ while (c != ' ' ) {
144+ // ignore newlines in front of words (some binary files have newline,
145+ // some don't)
146+ if (c != '\n' ) {
147+ sb .append (c );
148+ }
149+ c = (char ) buffer .get ();
150+ }
151+ vocabs .add (sb .toString ());
152152
153153 // read vector
154154 final FloatBuffer floatBuffer = buffer .asFloatBuffer ();
155155 floatBuffer .get (floats );
156- for (int i = 0 ; i < floats .length ; ++i ) {
156+ for (int i = 0 ; i < floats .length ; ++i ) {
157157 vectors [lineno * layerSize + i ] = floats [i ];
158158 }
159159 buffer .position (buffer .position () + 4 * layerSize );
160160
161161 // print log
162162 final long now = System .currentTimeMillis ();
163- if (now - lastLogMessage > 1000 ) {
164- final double percentage = ((double )(lineno + 1 ) / (double )vocabSize ) * 100.0 ;
163+ if (now - lastLogMessage > 1000 ) {
164+ final double percentage = ((double ) (lineno + 1 ) / (double ) vocabSize ) * 100.0 ;
165165 logger .info (
166166 String .format ("Loaded %d/%d vectors (%f%%)" , lineno + 1 , vocabSize , percentage ));
167167 lastLogMessage = now ;
168168 }
169169
170170 // remap file
171- if (buffer .position () > oneGB ) {
172- final int newPosition = (int )(buffer .position () - oneGB );
171+ if (buffer .position () > oneGB ) {
172+ final int newPosition = (int ) (buffer .position () - oneGB );
173173 final long size = Math .min (channel .size () - oneGB * bufferCount , Integer .MAX_VALUE );
174174 logger .debug (
175175 String .format (
@@ -185,13 +185,13 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
185185 buffer .position (newPosition );
186186 bufferCount += 1 ;
187187 }
188- }
188+ }
189189
190- return new Word2VecModel (vocabs , layerSize , vectors );
191- }
192- }
190+ return new Word2VecModel (vocabs , layerSize , vectors );
191+ }
192+ }
193193
194- /**
194+ /**
195195 * @return {@link Word2VecModel} from the lines of the file in the text output format of the
196196 * Word2Vec C open source project.
197197 */
@@ -208,7 +208,7 @@ static Word2VecModel fromTextFile(String filename, List<String> lines) throws IO
208208 filename ,
209209 vocabSize ,
210210 lines .size () - 1
211- );
211+ );
212212
213213 for (int n = 1 ; n < lines .size (); n ++) {
214214 String [] values = lines .get (n ).split (" " );
@@ -222,7 +222,7 @@ static Word2VecModel fromTextFile(String filename, List<String> lines) throws IO
222222 n ,
223223 layerSize ,
224224 values .length - 1
225- );
225+ );
226226
227227 for (int d = 1 ; d < values .length ; d ++) {
228228 vectors .add (Double .parseDouble (values [d ]));
@@ -235,7 +235,7 @@ static Word2VecModel fromTextFile(String filename, List<String> lines) throws IO
235235 .setVectors (vectors );
236236 return fromThrift (thrift );
237237 }
238-
238+
239239 /** @return {@link Word2VecTrainerBuilder} for training a model */
240240 public static Word2VecTrainerBuilder trainer () {
241241 return new Word2VecTrainerBuilder ();
0 commit comments