 package com.medallia.word2vec;

-import java.io.DataInput;
-import java.io.DataInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
 import java.util.List;

-import org.apache.commons.io.input.SwappedDataInputStream;
-
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
 import com.google.common.primitives.Doubles;
 import com.medallia.word2vec.thrift.Word2VecModelThrift;
 import com.medallia.word2vec.util.Common;
+import com.medallia.word2vec.util.ProfilingTimer;
+import com.medallia.word2vec.util.AC;
+

 /**
  * Represents the Word2Vec model, containing vectors for each word
- * <p>
+ * <p/>
  * Instances of this class are obtained via:
  * <ul>
  * <li> {@link #trainer()}
  * <li> {@link #fromThrift(Word2VecModelThrift)}
  * </ul>
- *
+ *
  * @see {@link #forSearch()}
  */
 public class Word2VecModel {
   final List<String> vocab;
   final int layerSize;
   final double[] vectors;
-
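+  // Chunk size used when memory-mapping the model file in fromBinFile; the mapped window is
+  // re-mapped one gigabyte at a time.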
+  private final static long ONE_GB = 1024 * 1024 * 1024;
+
   Word2VecModel(Iterable<String> vocab, int layerSize, double[] vectors) {
     this.vocab = ImmutableList.copyOf(vocab);
     this.layerSize = layerSize;
     this.vectors = vectors;
   }
-
+
   /** @return Vocabulary */
   public Iterable<String> getVocab() {
     return vocab;
@@ -49,15 +53,15 @@ public Iterable<String> getVocab() {
   public Searcher forSearch() {
     return new SearcherImpl(this);
   }
-
+
   /** @return Serializable thrift representation */
   public Word2VecModelThrift toThrift() {
     return new Word2VecModelThrift()
-      .setVocab(vocab)
-      .setLayerSize(layerSize)
-      .setVectors(Doubles.asList(vectors));
+        .setVocab(vocab)
+        .setLayerSize(layerSize)
+        .setVectors(Doubles.asList(vectors));
   }
-
+
   /** @return {@link Word2VecModel} created from a thrift representation */
   public static Word2VecModel fromThrift(Word2VecModelThrift thrift) {
     return new Word2VecModel(
@@ -76,69 +80,126 @@ public static Word2VecModel fromTextFile(File file) throws IOException {
   }

   /**
-   * Forwards to {@link #fromBinFile(File, ByteOrder)} with the default
-   * ByteOrder.LITTLE_ENDIAN
-   */
-  public static Word2VecModel fromBinFile(File file)
-      throws IOException {
-    return fromBinFile(file, ByteOrder.LITTLE_ENDIAN);
-  }
-
-  /**
-   * @return {@link Word2VecModel} created from the binary representation output
-   * by the open source C version of word2vec using the given byte order.
-   */
-  public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
-      throws IOException {
-
-    try (FileInputStream fis = new FileInputStream(file);) {
-      DataInput in = (byteOrder == ByteOrder.BIG_ENDIAN) ?
-          new DataInputStream(fis) : new SwappedDataInputStream(fis);
-
-      StringBuilder sb = new StringBuilder();
-      char c = (char) in.readByte();
-      while (c != '\n') {
-        sb.append(c);
-        c = (char) in.readByte();
-      }
-      String firstLine = sb.toString();
-      int index = firstLine.indexOf(' ');
-      Preconditions.checkState(index != -1,
-          "Expected a space in the first line of file '%s': '%s'",
-          file.getAbsolutePath(), firstLine);
-
-      int vocabSize = Integer.parseInt(firstLine.substring(0, index));
-      int layerSize = Integer.parseInt(firstLine.substring(index + 1));
-
-      List<String> vocabs = Lists.newArrayList();
-      List<Double> vectors = Lists.newArrayList();
-
-      for (int lineno = 0; lineno < vocabSize; lineno++) {
-        sb = new StringBuilder();
-        c = (char) in.readByte();
-        while (c != ' ') {
-          // ignore newlines in front of words (some binary files have newline,
-          // some don't)
-          if (c != '\n') {
-            sb.append(c);
-          }
-          c = (char) in.readByte();
-        }
-        vocabs.add(sb.toString());
-
-        for (int i = 0; i < layerSize; i++) {
-          vectors.add((double) in.readFloat());
-        }
-      }
-
-      return fromThrift(new Word2VecModelThrift()
-          .setLayerSize(layerSize)
-          .setVocab(vocabs)
-          .setVectors(vectors));
-    }
-  }
-
-  /**
+   * Forwards to {@link #fromBinFile(File, ByteOrder, ProfilingTimer)} with the default
+   * ByteOrder.LITTLE_ENDIAN and no ProfilingTimer
+   */
+  public static Word2VecModel fromBinFile(File file)
+      throws IOException {
+    return fromBinFile(file, ByteOrder.LITTLE_ENDIAN, ProfilingTimer.NONE);
+  }
+
+  /**
+   * Forwards to {@link #fromBinFile(File, ByteOrder, ProfilingTimer)} with no ProfilingTimer
+   */
+  public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder)
+      throws IOException {
+    return fromBinFile(file, byteOrder, ProfilingTimer.NONE);
+  }
+
+  /**
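+   * A minimal usage sketch (the file name below is only an example):
+   * <pre>{@code
+   * Word2VecModel model = Word2VecModel.fromBinFile(
+   *     new File("vectors.bin"), ByteOrder.LITTLE_ENDIAN, ProfilingTimer.NONE);
+   * Searcher searcher = model.forSearch();
+   * }</pre>
+   * <p/>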
+   * @return {@link Word2VecModel} created from the binary representation output
+   * by the open source C version of word2vec using the given byte order.
+   */
+  public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder, ProfilingTimer timer)
+      throws IOException {
+
+    try (
+        final FileInputStream fis = new FileInputStream(file);
+        final AC ac = timer.start("Loading vectors from bin file")
+    ) {
+      final FileChannel channel = fis.getChannel();
+      timer.start("Reading gigabyte #1");
+      MappedByteBuffer buffer =
+          channel.map(
+              FileChannel.MapMode.READ_ONLY,
+              0,
+              Math.min(channel.size(), Integer.MAX_VALUE));
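+      // Each mapped window is capped at Integer.MAX_VALUE bytes because ByteBuffer positions are
+      // int-indexed; a freshly mapped buffer defaults to big-endian, so the requested byte order
+      // has to be applied explicitly after every map() call.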
+      buffer.order(byteOrder);
+      int bufferCount = 1;
+      // Java's NIO only allows memory-mapping up to 2GB. To work around this problem, we re-map
+      // every gigabyte. To calculate offsets correctly, we have to keep track of how many
+      // gigabytes we've already skipped. That's what this is for.
+
+      StringBuilder sb = new StringBuilder();
+      char c = (char) buffer.get();
+      while (c != '\n') {
+        sb.append(c);
+        c = (char) buffer.get();
+      }
+      String firstLine = sb.toString();
+      int index = firstLine.indexOf(' ');
+      Preconditions.checkState(index != -1,
+          "Expected a space in the first line of file '%s': '%s'",
+          file.getAbsolutePath(), firstLine);
+
+      final int vocabSize = Integer.parseInt(firstLine.substring(0, index));
+      final int layerSize = Integer.parseInt(firstLine.substring(index + 1));
+      timer.appendToLog(String.format(
+          "Loading %d vectors with dimensionality %d",
+          vocabSize,
+          layerSize));
+
+      List<String> vocabs = new ArrayList<String>(vocabSize);
+      double vectors[] = new double[vocabSize * layerSize];
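+      // Vectors are stored in a single flat array; the vector for word i occupies
+      // indices [i * layerSize, (i + 1) * layerSize).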
+
+      long lastLogMessage = System.currentTimeMillis();
+      final float[] floats = new float[layerSize];
+      for (int lineno = 0; lineno < vocabSize; lineno++) {
+        // read vocab
+        sb.setLength(0);
+        c = (char) buffer.get();
+        while (c != ' ') {
+          // ignore newlines in front of words (some binary files have newline,
+          // some don't)
+          if (c != '\n') {
+            sb.append(c);
+          }
+          c = (char) buffer.get();
+        }
+        vocabs.add(sb.toString());
+
+        // read vector
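+        // asFloatBuffer() creates a float view starting at the current position; bulk-reading
+        // from the view does not advance the underlying buffer, so the position is bumped
+        // manually by 4 bytes per float below.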
+        final FloatBuffer floatBuffer = buffer.asFloatBuffer();
+        floatBuffer.get(floats);
+        for (int i = 0; i < floats.length; ++i) {
+          vectors[lineno * layerSize + i] = floats[i];
+        }
+        buffer.position(buffer.position() + 4 * layerSize);
+
+        // print log
+        final long now = System.currentTimeMillis();
+        if (now - lastLogMessage > 1000) {
+          final double percentage = ((double) (lineno + 1) / (double) vocabSize) * 100.0;
+          timer.appendToLog(
+              String.format("Loaded %d/%d vectors (%f%%)", lineno + 1, vocabSize, percentage));
+          lastLogMessage = now;
+        }
+
+        // remap file
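+        // Once more than a gigabyte of the current window has been consumed, map a new window
+        // that starts ONE_GB further into the file and rebase the read position into it, so no
+        // single mapping ever has to cover more than Integer.MAX_VALUE bytes.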
+        if (buffer.position() > ONE_GB) {
+          final int newPosition = (int) (buffer.position() - ONE_GB);
+          final long size = Math.min(channel.size() - ONE_GB * bufferCount, Integer.MAX_VALUE);
+          timer.endAndStart(
+              "Reading gigabyte #%d. Start: %d, size: %d",
+              bufferCount,
+              ONE_GB * bufferCount,
+              size);
+          buffer = channel.map(
+              FileChannel.MapMode.READ_ONLY,
+              ONE_GB * bufferCount,
+              size);
+          buffer.order(byteOrder);
+          buffer.position(newPosition);
+          bufferCount += 1;
+        }
+      }
+      timer.end();
+
+      return new Word2VecModel(vocabs, layerSize, vectors);
+    }
+  }
+
+  /**
   * @return {@link Word2VecModel} from the lines of the file in the text output format of the
   * Word2Vec C open source project.
   */
@@ -155,7 +216,7 @@ static Word2VecModel fromTextFile(String filename, List<String> lines) throws IO
         filename,
         vocabSize,
         lines.size() - 1
-    );
+        );

     for (int n = 1; n < lines.size(); n++) {
       String[] values = lines.get(n).split(" ");
@@ -169,7 +230,7 @@ static Word2VecModel fromTextFile(String filename, List<String> lines) throws IO
           n,
           layerSize,
           values.length - 1
-      );
+          );

       for (int d = 1; d < values.length; d++) {
         vectors.add(Double.parseDouble(values[d]));
@@ -182,7 +243,7 @@ static Word2VecModel fromTextFile(String filename, List<String> lines) throws IO
         .setVectors(vectors);
     return fromThrift(thrift);
   }
-
+
   /** @return {@link Word2VecTrainerBuilder} for training a model */
   public static Word2VecTrainerBuilder trainer() {
     return new Word2VecTrainerBuilder();