Skip to content

Commit fa4f8d5

Browse files
committed
Switch to all tabs
1 parent b33a7e7 commit fa4f8d5

File tree

1 file changed

+67
-67
lines changed

1 file changed

+67
-67
lines changed

src/main/java/com/medallia/word2vec/Word2VecModel.java

Lines changed: 67 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@
2727

2828
/**
2929
* Represents the Word2Vec model, containing vectors for each word
30-
* <p>
30+
* <p/>
3131
* Instances of this class are obtained via:
3232
* <ul>
3333
* <li> {@link #trainer()}
3434
* <li> {@link #fromThrift(Word2VecModelThrift)}
3535
* </ul>
36-
*
36+
*
3737
* @see #forSearch()
3838
*/
3939
public class Word2VecModel {
@@ -42,13 +42,13 @@ public class Word2VecModel {
4242
final List<String> vocab;
4343
final int layerSize;
4444
final double[] vectors;
45-
45+
4646
Word2VecModel(Iterable<String> vocab, int layerSize, double[] vectors) {
4747
this.vocab = ImmutableList.copyOf(vocab);
4848
this.layerSize = layerSize;
4949
this.vectors = vectors;
5050
}
51-
51+
5252
/** @return Vocabulary */
5353
public Iterable<String> getVocab() {
5454
return vocab;
@@ -58,15 +58,15 @@ public Iterable<String> getVocab() {
5858
public Searcher forSearch() {
5959
return new SearcherImpl(this);
6060
}
61-
61+
6262
/** @return Serializable thrift representation */
6363
public Word2VecModelThrift toThrift() {
6464
return new Word2VecModelThrift()
65-
.setVocab(vocab)
66-
.setLayerSize(layerSize)
67-
.setVectors(Doubles.asList(vectors));
65+
.setVocab(vocab)
66+
.setLayerSize(layerSize)
67+
.setVectors(Doubles.asList(vectors));
6868
}
69-
69+
7070
/** @return {@link Word2VecModel} created from a thrift representation */
7171
public static Word2VecModel fromThrift(Word2VecModelThrift thrift) {
7272
return new Word2VecModel(
@@ -85,22 +85,22 @@ public static Word2VecModel fromTextFile(File file) throws IOException {
8585
}
8686

8787
/**
88-
* Forwards to {@link #fromBinFile(File, ByteOrder)} with the default
89-
* ByteOrder.LITTLE_ENDIAN
90-
*/
91-
public static Word2VecModel fromBinFile(File file)
92-
throws IOException {
93-
return fromBinFile(file, ByteOrder.LITTLE_ENDIAN);
94-
}
95-
96-
/**
97-
* @return {@link Word2VecModel} created from the binary representation output
98-
* by the open source C version of word2vec using the given byte order.
99-
*/
100-
public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
101-
throws IOException {
102-
103-
try (FileInputStream fis = new FileInputStream(file);) {
88+
* Forwards to {@link #fromBinFile(File, ByteOrder)} with the default
89+
* ByteOrder.LITTLE_ENDIAN
90+
*/
91+
public static Word2VecModel fromBinFile(File file)
92+
throws IOException {
93+
return fromBinFile(file, ByteOrder.LITTLE_ENDIAN);
94+
}
95+
96+
/**
97+
* @return {@link Word2VecModel} created from the binary representation output
98+
* by the open source C version of word2vec using the given byte order.
99+
*/
100+
public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
101+
throws IOException {
102+
103+
try (FileInputStream fis = new FileInputStream(file);) {
104104
final FileChannel channel = fis.getChannel();
105105
final long oneGB = 1024 * 1024 * 1024;
106106
MappedByteBuffer buffer =
@@ -110,66 +110,66 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
110110
Math.min(channel.size(), Integer.MAX_VALUE));
111111
buffer.order(byteOrder);
112112
int bufferCount = 1;
113-
// Java's NIO only allows memory-mapping up to 2GB. To work around this problem, we re-map
114-
// every gigabyte. To calculate offsets correctly, we have to keep track of how many gigabytes
115-
// we've already skipped. That's what this is for.
116-
117-
StringBuilder sb = new StringBuilder();
118-
char c = (char)buffer.get();
119-
while (c != '\n') {
120-
sb.append(c);
121-
c = (char)buffer.get();
122-
}
123-
String firstLine = sb.toString();
124-
int index = firstLine.indexOf(' ');
125-
Preconditions.checkState(index != -1,
126-
"Expected a space in the first line of file '%s': '%s'",
127-
file.getAbsolutePath(), firstLine);
113+
// Java's NIO only allows memory-mapping up to 2GB. To work around this problem, we re-map
114+
// every gigabyte. To calculate offsets correctly, we have to keep track of how many gigabytes
115+
// we've already skipped. That's what this is for.
116+
117+
StringBuilder sb = new StringBuilder();
118+
char c = (char) buffer.get();
119+
while (c != '\n') {
120+
sb.append(c);
121+
c = (char) buffer.get();
122+
}
123+
String firstLine = sb.toString();
124+
int index = firstLine.indexOf(' ');
125+
Preconditions.checkState(index != -1,
126+
"Expected a space in the first line of file '%s': '%s'",
127+
file.getAbsolutePath(), firstLine);
128128

129129
final int vocabSize = Integer.parseInt(firstLine.substring(0, index));
130-
final int layerSize = Integer.parseInt(firstLine.substring(index + 1));
130+
final int layerSize = Integer.parseInt(firstLine.substring(index + 1));
131131
logger.info(
132132
String.format("Loading %d vectors with dimensionality %d", vocabSize, layerSize));
133133

134-
List<String> vocabs = new ArrayList<String>(vocabSize);
135-
double vectors[] = new double[vocabSize * layerSize];
134+
List<String> vocabs = new ArrayList<String>(vocabSize);
135+
double vectors[] = new double[vocabSize * layerSize];
136136

137137
long lastLogMessage = System.currentTimeMillis();
138138
final float[] floats = new float[layerSize];
139-
for (int lineno = 0; lineno < vocabSize; lineno++) {
139+
for (int lineno = 0; lineno < vocabSize; lineno++) {
140140
// read vocab
141141
sb.setLength(0);
142-
c = (char)buffer.get();
143-
while (c != ' ') {
144-
// ignore newlines in front of words (some binary files have newline,
145-
// some don't)
146-
if (c != '\n') {
147-
sb.append(c);
148-
}
149-
c = (char)buffer.get();
150-
}
151-
vocabs.add(sb.toString());
142+
c = (char) buffer.get();
143+
while (c != ' ') {
144+
// ignore newlines in front of words (some binary files have newline,
145+
// some don't)
146+
if (c != '\n') {
147+
sb.append(c);
148+
}
149+
c = (char) buffer.get();
150+
}
151+
vocabs.add(sb.toString());
152152

153153
// read vector
154154
final FloatBuffer floatBuffer = buffer.asFloatBuffer();
155155
floatBuffer.get(floats);
156-
for(int i = 0; i < floats.length; ++i) {
156+
for (int i = 0; i < floats.length; ++i) {
157157
vectors[lineno * layerSize + i] = floats[i];
158158
}
159159
buffer.position(buffer.position() + 4 * layerSize);
160160

161161
// print log
162162
final long now = System.currentTimeMillis();
163-
if(now - lastLogMessage > 1000) {
164-
final double percentage = ((double)(lineno + 1) / (double)vocabSize) * 100.0;
163+
if (now - lastLogMessage > 1000) {
164+
final double percentage = ((double) (lineno + 1) / (double) vocabSize) * 100.0;
165165
logger.info(
166166
String.format("Loaded %d/%d vectors (%f%%)", lineno + 1, vocabSize, percentage));
167167
lastLogMessage = now;
168168
}
169169

170170
// remap file
171-
if(buffer.position() > oneGB) {
172-
final int newPosition = (int)(buffer.position() - oneGB);
171+
if (buffer.position() > oneGB) {
172+
final int newPosition = (int) (buffer.position() - oneGB);
173173
final long size = Math.min(channel.size() - oneGB * bufferCount, Integer.MAX_VALUE);
174174
logger.debug(
175175
String.format(
@@ -185,13 +185,13 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
185185
buffer.position(newPosition);
186186
bufferCount += 1;
187187
}
188-
}
188+
}
189189

190-
return new Word2VecModel(vocabs, layerSize, vectors);
191-
}
192-
}
190+
return new Word2VecModel(vocabs, layerSize, vectors);
191+
}
192+
}
193193

194-
/**
194+
/**
195195
* @return {@link Word2VecModel} from the lines of the file in the text output format of the
196196
* Word2Vec C open source project.
197197
*/
@@ -208,7 +208,7 @@ static Word2VecModel fromTextFile(String filename, List<String> lines) throws IO
208208
filename,
209209
vocabSize,
210210
lines.size() - 1
211-
);
211+
);
212212

213213
for (int n = 1; n < lines.size(); n++) {
214214
String[] values = lines.get(n).split(" ");
@@ -222,7 +222,7 @@ static Word2VecModel fromTextFile(String filename, List<String> lines) throws IO
222222
n,
223223
layerSize,
224224
values.length - 1
225-
);
225+
);
226226

227227
for (int d = 1; d < values.length; d++) {
228228
vectors.add(Double.parseDouble(values[d]));
@@ -235,7 +235,7 @@ static Word2VecModel fromTextFile(String filename, List<String> lines) throws IO
235235
.setVectors(vectors);
236236
return fromThrift(thrift);
237237
}
238-
238+
239239
/** @return {@link Word2VecTrainerBuilder} for training a model */
240240
public static Word2VecTrainerBuilder trainer() {
241241
return new Word2VecTrainerBuilder();

0 commit comments

Comments
 (0)