train: Example of running a training loop in Java.

asimshankar · asimshankar · commit c27c760b2469 · 2017-12-13T18:45:06.000-08:00
diff --git a/train/README.md b/train/README.md
@@ -0,0 +1,52 @@
+# Training [TensorFlow](https://www.tensorflow.org) models in Java
+
+Python is the primary language in which TensorFlow models are typically
+developed and trained. TensorFlow does have [bindings for other programming
+languages](https://www.tensorflow.org/api_docs/). These bindings have the
+low-level primitives that are required to build a more complete API but
+lack much of the higher-level API richness of the Python bindings, particularly
+for defining the model structure.
+
+This example demonstrates taking a model (a TensorFlow graph) created by a Python
+program and running the training loop in Java (and saving the trained weights
+to disk).
+
+## The model
+
+The model is a trivial one, trying to learn the function: `f(x) = W * x + b`,
+where `W` and `b` are model parameters. The training data is constructed so that
+the "true" value of `W` is 3 and that of `b` is 2, i.e., `f(x) = 3 * x + 2`.
+
+Thus, over time, the predicted value for an input of 1, 2, and 3 should tend
+towards 5, 8, and 11.
+
+## Quickstart
+
+1.  Run the training loop program in Java using:
+
+    ```
+    mvn compile exec:java -q -Dexec.args="graph.pb /tmp/checkpoint"
+    ```
+
+    Where `graph.pb` is the serialized TensorFlow graph and `/tmp/checkpoint`
+    is the directory from which trained weights (the checkpoint) should be
+    loaded (if available) and saved to (after training).
+
+## Generating the graph
+
+The `graph.pb` file, which contains the model definition, and the names of the
+tensors in it were generated by running `python model.py`.
+
+
+## Noteworthy
+
+-   The Python APIs for TensorFlow include other conveniences for training (such
+    as `MonitoredSession` and `tf.train.Estimator`), which make it easier to
+    configure checkpointing, evaluation loops etc. The examples here aren't that
+    sophisticated and are focused on basic model training only.
+-   In this example, we use placeholders and feed dictionaries to feed input,
+    but you probably want to use the
+    [`tf.data`](https://www.tensorflow.org/programmers_guide/datasets) API to
+    construct an input pipeline for providing training data to the model.
+-   Not demonstrated here, but summaries for TensorBoard can also be produced by
+    executing the summary operations.
diff --git a/train/graph.pb b/train/graph.pb
diff --git a/train/model.py b/train/model.py
@@ -0,0 +1,29 @@
+import tensorflow as tf
+
+# Batch of input and target output (1x1 matrices)
+x = tf.placeholder(tf.float32, shape=[None, 1, 1], name='input')
+y = tf.placeholder(tf.float32, shape=[None, 1, 1], name='target')
+
+# Trivial linear model
+y_ = tf.identity(tf.layers.dense(x, 1), name='output')
+
+# Optimize loss
+loss = tf.reduce_mean(tf.square(y_ - y), name='loss')
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+train_op = optimizer.minimize(loss, name='train')
+
+init = tf.global_variables_initializer()
+
+# tf.train.Saver.__init__ adds operations to the graph to save
+# and restore variables.
+saver_def = tf.train.Saver().as_saver_def()
+
+print('Run this operation to initialize variables     : ', init.name)
+print('Run this operation for a train step            : ', train_op.name)
+print('Feed this tensor to set the checkpoint filename: ', saver_def.filename_tensor_name)
+print('Run this operation to save a checkpoint        : ', saver_def.save_tensor_name)
+print('Run this operation to restore a checkpoint     : ', saver_def.restore_op_name)
+
+# Write the graph out to a file.
+with open('graph.pb', 'w') as f:
+  f.write(tf.get_default_graph().as_graph_def().SerializeToString())
diff --git a/train/pom.xml b/train/pom.xml
@@ -0,0 +1,25 @@
+<project>
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>org.myorg</groupId>
+    <artifactId>train</artifactId>
+    <version>1.0-SNAPSHOT</version>
+    <properties>
+      <exec.mainClass>Train</exec.mainClass>
+      <!-- The sample code requires at least JDK 1.7. -->
+      <!-- The maven compiler plugin defaults to a lower version -->
+      <maven.compiler.source>1.7</maven.compiler.source>
+      <maven.compiler.target>1.7</maven.compiler.target>
+    </properties>
+    <dependencies>
+      <dependency>
+        <groupId>org.tensorflow</groupId>
+        <artifactId>tensorflow</artifactId>
+        <version>1.4.0</version>
+      </dependency>
+      <dependency>
+        <groupId>org.tensorflow</groupId>
+        <artifactId>proto</artifactId>
+        <version>1.4.0</version>
+      </dependency>
+    </dependencies>
+</project>
diff --git a/train/src/main/java/Train.java b/train/src/main/java/Train.java
@@ -0,0 +1,91 @@
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Random;
+import org.tensorflow.Graph;
+import org.tensorflow.Session;
+import org.tensorflow.Tensor;
+import org.tensorflow.Tensors;
+
+public class Train {
+
+  public static void main(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println("Require two arguments: <graph_def_filename> <directory_for_checkpoints>");
+      System.exit(1);
+    }
+    final byte[] graphDef = Files.readAllBytes(Paths.get(args[0]));
+    final String checkpointDir = args[1];
+    final boolean checkpointExists = Files.exists(Paths.get(checkpointDir));
+
+    // These names of tensors/operations in the graph (string arguments to feed(), fetch(), and
+    // addTarget()) would have been printed out by model.py
+    try (Graph graph = new Graph();
+        Session sess = new Session(graph);
+        Tensor<String> checkpointPrefix =
+            Tensors.create(Paths.get(checkpointDir, "checkpoint").toString())) {
+      graph.importGraphDef(graphDef);
+
+      // Initialize or restore.
+      if (checkpointExists) {
+        System.out.println("Restoring variables from checkpoint");
+        sess.runner().feed("save/Const", checkpointPrefix).addTarget("save/restore_all").run();
+      } else {
+        System.out.println("Initializing variables");
+        sess.runner().addTarget("init").run();
+      }
+
+      System.out.println("Generating initial predictions");
+      printPredictionsOnTestSet(sess);
+
+      System.out.println("Training for a few steps");
+      final int BATCH_SIZE = 10;
+      float inputs[][][] = new float[BATCH_SIZE][1][1];
+      float targets[][][] = new float[BATCH_SIZE][1][1];
+      for (int i = 0; i < 200; ++i) {
+        fillNextBatchForTraining(inputs, targets);
+        try (Tensor<Float> inputBatch = Tensors.create(inputs);
+            Tensor<Float> targetBatch = Tensors.create(targets)) {
+          sess.runner()
+              .feed("input", inputBatch)
+              .feed("target", targetBatch)
+              .addTarget("train")
+              .run();
+        }
+      }
+
+      System.out.println("Updated predictions");
+      printPredictionsOnTestSet(sess);
+
+      System.out.println("Saving checkpoint");
+      sess.runner().feed("save/Const", checkpointPrefix).addTarget("save/control_dependency").run();
+    }
+  }
+
+  public static void printPredictionsOnTestSet(Session sess) {
+    final float[][][] inputBatch = new float[][][] {{{1.0f}}, {{2.0f}}, {{3.0f}}};
+    try (Tensor<Float> input = Tensors.create(inputBatch);
+        Tensor<Float> output =
+            sess.runner().feed("input", input).fetch("output").run().get(0).expect(Float.class)) {
+      final long shape[] = output.shape();
+      final int batchSize = (int) shape[0];
+      final int rows = (int) shape[1];
+      final int cols = (int) shape[2];
+      float[][][] predictions = output.copyTo(new float[batchSize][rows][cols]);
+      for (int i = 0; i < batchSize; ++i) {
+        System.out.print("\t x = ");
+        System.out.print(Arrays.deepToString(inputBatch[i]));
+        System.out.print(", predicted y = ");
+        System.out.println(Arrays.deepToString(predictions[i]));
+      }
+    }
+  }
+
+  public static void fillNextBatchForTraining(float[][][] inputs, float[][][] targets) {
+    final Random r = new Random();
+    for (int i = 0; i < inputs.length; ++i) {
+      inputs[i][0][0] = r.nextFloat();
+      targets[i][0][0] = inputs[i][0][0] * 3.0f + 2.0f;
+    }
+  }
+}