From a934e0e9c2aba1e5e5f093d7375369e61d6b93bd Mon Sep 17 00:00:00 2001
From: Raffi Khatchadourian <khatchad@hunter.cuny.edu>
Date: Mon, 5 Feb 2024 17:40:13 -0500
Subject: [PATCH] Fix #140 (#144)

* Fix https://github.com/wala/ML/issues/140.

* Add test cases for https://github.com/wala/ML/issues/140.
---
 .../python/ml/test/TestTensorflow2Model.java  |  65 +++++
 .../ml/client/PythonTensorAnalysisEngine.java |  82 +++++-
 .../data/tensorboard_example.py               | 248 ++++++++++++++++++
 .../data/tf2_test_dataset11.py                |  18 ++
 .../data/tf2_test_dataset12.py                |  23 ++
 .../data/tf2_test_dataset13.py                |  22 ++
 .../data/tf2_test_dataset14.py                |  21 ++
 .../data/tf2_test_dataset15.py                |  21 ++
 8 files changed, 499 insertions(+), 1 deletion(-)
 create mode 100644 com.ibm.wala.cast.python.test/data/tensorboard_example.py
 create mode 100644 com.ibm.wala.cast.python.test/data/tf2_test_dataset11.py
 create mode 100644 com.ibm.wala.cast.python.test/data/tf2_test_dataset12.py
 create mode 100644 com.ibm.wala.cast.python.test/data/tf2_test_dataset13.py
 create mode 100644 com.ibm.wala.cast.python.test/data/tf2_test_dataset14.py
 create mode 100644 com.ibm.wala.cast.python.test/data/tf2_test_dataset15.py

diff --git a/com.ibm.wala.cast.python.ml.test/source/com/ibm/wala/cast/python/ml/test/TestTensorflow2Model.java b/com.ibm.wala.cast.python.ml.test/source/com/ibm/wala/cast/python/ml/test/TestTensorflow2Model.java
index 7ff5b1db6..72771249e 100644
--- a/com.ibm.wala.cast.python.ml.test/source/com/ibm/wala/cast/python/ml/test/TestTensorflow2Model.java
+++ b/com.ibm.wala.cast.python.ml.test/source/com/ibm/wala/cast/python/ml/test/TestTensorflow2Model.java
@@ -866,6 +866,71 @@ public void testDataset10()
     test("tf2_test_dataset10.py", "add", 2, 2, 2, 3);
   }
 
+  /**
+   * Test enumerating a dataset in top-level code (https://github.com/wala/ML/issues/140). The
+   * first element of the tuple returned isn't a tensor.
+   */
+  @Test
+  public void testDataset11()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tf2_test_dataset11.py", "f", 0, 0);
+    test("tf2_test_dataset11.py", "g", 1, 1, 2);
+  }
+
+  /**
+   * Test enumerating a global dataset in a function (https://github.com/wala/ML/issues/140). The
+   * first element of the tuple returned isn't a tensor.
+   */
+  @Test
+  public void testDataset12()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tf2_test_dataset12.py", "f", 0, 0);
+    test("tf2_test_dataset12.py", "g", 1, 1, 2);
+  }
+
+  /**
+   * Test enumerating a local dataset in a function (https://github.com/wala/ML/issues/140). The
+   * first element of the tuple returned isn't a tensor.
+   */
+  @Test
+  public void testDataset13()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tf2_test_dataset13.py", "f", 0, 0);
+    test("tf2_test_dataset13.py", "g", 1, 1, 2);
+  }
+
+  /**
+   * Test enumerating a dataset passed as an argument (https://github.com/wala/ML/issues/140). The
+   * first element of the tuple returned isn't a tensor.
+   */
+  @Test
+  public void testDataset14()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tf2_test_dataset14.py", "f", 0, 0);
+    test("tf2_test_dataset14.py", "g", 1, 1, 2);
+  }
+
+  /**
+   * Test iterating over an enumerated dataset passed as an argument
+   * (https://github.com/wala/ML/issues/140). The first element of each tuple isn't a tensor.
+   */
+  @Test
+  public void testDataset15()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tf2_test_dataset15.py", "f", 0, 0);
+    test("tf2_test_dataset15.py", "g", 1, 1, 2);
+  }
+
+  /**
+   * Test a function called with the step number obtained from enumerating a dataset
+   * (https://github.com/wala/ML/issues/140). The step number isn't a tensor.
+   */
+  @Test
+  public void testTensorboardExample()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tensorboard_example.py", "summarize_weights", 0, 12);
+  }
+
   @Test
   public void testTensorList()
       throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
diff --git a/com.ibm.wala.cast.python.ml/source/com/ibm/wala/cast/python/ml/client/PythonTensorAnalysisEngine.java b/com.ibm.wala.cast.python.ml/source/com/ibm/wala/cast/python/ml/client/PythonTensorAnalysisEngine.java
index 58fc006ba..702126a7f 100644
--- a/com.ibm.wala.cast.python.ml/source/com/ibm/wala/cast/python/ml/client/PythonTensorAnalysisEngine.java
+++ b/com.ibm.wala.cast.python.ml/source/com/ibm/wala/cast/python/ml/client/PythonTensorAnalysisEngine.java
@@ -9,6 +9,7 @@
 import com.ibm.wala.cast.python.client.PythonAnalysisEngine;
 import com.ibm.wala.cast.python.ml.analysis.TensorTypeAnalysis;
 import com.ibm.wala.cast.python.ml.types.TensorType;
+import com.ibm.wala.cast.python.ssa.PythonInvokeInstruction;
 import com.ibm.wala.cast.python.ssa.PythonPropertyRead;
 import com.ibm.wala.cast.python.types.PythonTypes;
 import com.ibm.wala.cast.types.AstMethodReference;
@@ -19,6 +20,8 @@
 import com.ibm.wala.ipa.callgraph.CGNode;
 import com.ibm.wala.ipa.callgraph.CallGraph;
 import com.ibm.wala.ipa.callgraph.propagation.AllocationSiteInNode;
+import com.ibm.wala.ipa.callgraph.propagation.ConcreteTypeKey;
+import com.ibm.wala.ipa.callgraph.propagation.ConstantKey;
 import com.ibm.wala.ipa.callgraph.propagation.InstanceKey;
 import com.ibm.wala.ipa.callgraph.propagation.LocalPointerKey;
 import com.ibm.wala.ipa.callgraph.propagation.PointerAnalysis;
@@ -88,6 +91,12 @@ public class PythonTensorAnalysisEngine extends PythonAnalysisEngine<TensorTypeA
               TypeName.string2TypeName("Ltensorflow/functions/set_shape")),
           AstMethodReference.fnSelector);
 
+  private static final MethodReference ENUMERATE =
+      MethodReference.findOrCreate(
+          TypeReference.findOrCreate(
+              PythonTypes.pythonLoader, TypeName.string2TypeName("Lwala/builtin/enumerate")),
+          AstMethodReference.fnSelector);
+
   private final Map<PointerKey, AnalysisError> errorLog = HashMapFactory.make();
 
   private static Set<PointsToSetVariable> getDataflowSources(
@@ -275,7 +284,7 @@ private static boolean processInstructionInterprocedurally(
         IClass concreteType = asin.getConcreteType();
         TypeReference reference = concreteType.getReference();
 
-        if (reference.equals(DATASET)) {
+        if (reference.equals(DATASET) && isDatasetTensorElement(src, use, node, pointerAnalysis)) {
           sources.add(src);
           logger.info("Added dataflow source from tensor dataset: " + src + ".");
           return true;
@@ -286,6 +295,77 @@ private static boolean processInstructionInterprocedurally(
     return false;
   }
 
+  /**
+   * Returns true iff the given {@link PointsToSetVariable} refers to a tensor dataset element of
+   * the dataset defined by the given value number in the given {@link CGNode}.
+   *
+   * <p>Only the enumeration number, i.e., index 0 of a tuple read from the result of a call to
+   * enumerate(), is rejected. Anything that can't be matched against that shape is conservatively
+   * treated as a dataset element instead of failing on a cast.
+   *
+   * @param src The {@link PointsToSetVariable} to consider.
+   * @param val The value in the given {@link CGNode} representing the tensor dataset.
+   * @param node The {@link CGNode} containing the given {@link PointsToSetVariable} and value.
+   * @param pointerAnalysis The {@link PointerAnalysis} that includes points-to information for the
+   *     given {@link CGNode}.
+   * @return True iff src refers to a tensor dataset element defined by the dataset represented by
+   *     val in node.
+   */
+  private static boolean isDatasetTensorElement(
+      PointsToSetVariable src, int val, CGNode node, PointerAnalysis<InstanceKey> pointerAnalysis) {
+    SSAInstruction def = node.getDU().getDef(val);
+    PointerKey srcKey = src.getPointerKey();
+
+    // Only a local read from the result of a call can be enumerate()'s enumeration number.
+    if (!(def instanceof PythonInvokeInstruction) || !(srcKey instanceof LocalPointerKey)) {
+      return true;
+    }
+
+    // Check whether we are calling enumerate(), as that returns an iterator over tuples.
+    int invocationUse = ((PythonInvokeInstruction) def).getUse(0);
+    PointerKey invocationUsePointerKey =
+        pointerAnalysis.getHeapModel().getPointerKeyForLocal(node, invocationUse);
+    boolean callsEnumerate = false;
+
+    for (InstanceKey functionInstance : pointerAnalysis.getPointsToSet(invocationUsePointerKey)) {
+      if (functionInstance instanceof ConcreteTypeKey) {
+        TypeReference calleeType = ((ConcreteTypeKey) functionInstance).getType().getReference();
+        callsEnumerate |= calleeType.equals(ENUMERATE.getDeclaringClass());
+      }
+    }
+
+    // Each tuple consists of the enumeration number and the dataset element. The enumeration
+    // number is read as index 0 of a tuple, so src must be defined by a property read.
+    SSAInstruction srcDef = node.getDU().getDef(((LocalPointerKey) srcKey).getValueNumber());
+
+    if (!callsEnumerate || !(srcDef instanceof PythonPropertyRead)) {
+      return true;
+    }
+
+    PythonPropertyRead srcRead = (PythonPropertyRead) srcDef;
+    SSAInstruction objRefDef = node.getDU().getDef(srcRead.getObjectRef());
+
+    // The object being read must itself be read from the dataset, i.e., the enumerate() result.
+    if (!(objRefDef instanceof PythonPropertyRead)
+        || ((PythonPropertyRead) objRefDef).getObjectRef() != val) {
+      return true;
+    }
+
+    // What does the member reference point to?
+    PointerKey memberRefPointerKey =
+        pointerAnalysis.getHeapModel().getPointerKeyForLocal(node, srcRead.getMemberRef());
+
+    for (InstanceKey memberInstance : pointerAnalysis.getPointsToSet(memberRefPointerKey)) {
+      // Index 0 is the first tuple element. Non-constant keys and null constants never match.
+      if (memberInstance instanceof ConstantKey
+          && Integer.valueOf(0).equals(((ConstantKey<?>) memberInstance).getValue())) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   /**
    * True iff the given {@link SSAInstruction} constitutes individual elements.
    *
diff --git a/com.ibm.wala.cast.python.test/data/tensorboard_example.py b/com.ibm.wala.cast.python.test/data/tensorboard_example.py
new file mode 100644
index 000000000..408ae591f
--- /dev/null
+++ b/com.ibm.wala.cast.python.test/data/tensorboard_example.py
@@ -0,0 +1,248 @@
+# %%
+"""
+## Tensorboard
+Graph, Loss, Accuracy & Weights visualization using Tensorboard and TensorFlow v2. This example is using the MNIST database of handwritten digits (http://yann.lecun.com/exdb/mnist/).
+
+- Author: Aymeric Damien
+- Project: https://github.com/aymericdamien/TensorFlow-Examples/
+"""
+
+# %%
+
+import tensorflow as tf
+import numpy as np
+
+from scripts.utils import write_csv
+import timeit
+
+# %%
+# Path to save logs into.
+logs_path = "/tmp/tensorflow_logs/example/"
+
+# MNIST dataset parameters.
+num_classes = 10  # total classes (0-9 digits).
+num_features = 784  # data features (img shape: 28*28).
+
+# Training parameters.
+learning_rate = 0.001
+training_steps = 3000
+batch_size = 256
+display_step = 100
+
+# Network parameters.
+n_hidden_1 = 128  # 1st layer number of neurons.
+n_hidden_2 = 256  # 2nd layer number of neurons.
+
+# %%
+# Prepare MNIST data.
+from tensorflow.keras.datasets import mnist
+
+(x_train, y_train), (x_test, y_test) = mnist.load_data()
+# Convert to float32.
+x_train, x_test = np.array(x_train, np.float32), np.array(x_test, np.float32)
+# Flatten images to 1-D vector of 784 features (28*28).
+x_train, x_test = x_train.reshape([-1, num_features]), x_test.reshape(
+    [-1, num_features]
+)
+# Normalize images value from [0, 255] to [0, 1].
+x_train, x_test = x_train / 255.0, x_test / 255.0
+
+# %%
+# Use tf.data API to shuffle and batch data.
+train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)
+
+start_time = timeit.default_timer()
+skipped_time = 0
+
+# %%
+# Store layers weight & bias
+
+# A random value generator to initialize weights.
+random_normal = tf.initializers.RandomNormal()
+
+weights = {
+    "h1_weights": tf.Variable(
+        random_normal([num_features, n_hidden_1]), name="h1_weights"
+    ),
+    "h2_weights": tf.Variable(
+        random_normal([n_hidden_1, n_hidden_2]), name="h2_weights"
+    ),
+    "logits_weights": tf.Variable(
+        random_normal([n_hidden_2, num_classes]), name="logits_weights"
+    ),
+}
+biases = {
+    "h1_bias": tf.Variable(tf.zeros([n_hidden_1]), name="h1_bias"),
+    "h2_bias": tf.Variable(tf.zeros([n_hidden_2]), name="h2_bias"),
+    "logits_bias": tf.Variable(tf.zeros([num_classes]), name="logits_bias"),
+}
+
+# %%
+# Construct the model, encapsulating all ops into scopes to make
+# Tensorboard's Graph visualization more convenient.
+
+
+# The computation graph to be traced.
+@tf.function
+def neural_net(x):
+    with tf.name_scope("Model"):
+        with tf.name_scope("HiddenLayer1"):
+            # Hidden fully connected layer with 128 neurons.
+            layer_1 = tf.add(tf.matmul(x, weights["h1_weights"]), biases["h1_bias"])
+            # Apply sigmoid to layer_1 output for non-linearity.
+            layer_1 = tf.nn.sigmoid(layer_1)
+        with tf.name_scope("HiddenLayer2"):
+            # Hidden fully connected layer with 256 neurons.
+            layer_2 = tf.add(
+                tf.matmul(layer_1, weights["h2_weights"]), biases["h2_bias"]
+            )
+            # Apply sigmoid to layer_2 output for non-linearity.
+            layer_2 = tf.nn.sigmoid(layer_2)
+        with tf.name_scope("LogitsLayer"):
+            # Output fully connected layer with a neuron for each class.
+            out_layer = (
+                tf.matmul(layer_2, weights["logits_weights"]) + biases["logits_bias"]
+            )
+            # Apply softmax to normalize the logits to a probability distribution.
+            out_layer = tf.nn.softmax(out_layer)
+    return out_layer
+
+
+# %%
+# Cross-Entropy loss function.
+def cross_entropy(y_pred, y_true):
+    with tf.name_scope("CrossEntropyLoss"):
+        # Encode label to a one hot vector.
+        y_true = tf.one_hot(y_true, depth=num_classes)
+        # Clip prediction values to avoid log(0) error.
+        y_pred = tf.clip_by_value(y_pred, 1e-9, 1.0)
+        # Compute cross-entropy.
+        return tf.reduce_mean(-tf.reduce_sum(y_true * tf.math.log(y_pred)))
+
+
+# Accuracy metric.
+def accuracy(y_pred, y_true):
+    with tf.name_scope("Accuracy"):
+        # Predicted class is the index of highest score in prediction vector (i.e. argmax).
+        correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
+        return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1)
+
+
+# Stochastic gradient descent optimizer.
+with tf.name_scope("Optimizer"):
+    optimizer = tf.optimizers.SGD(learning_rate)
+
+
+# %%
+# Optimization process.
+def run_optimization(x, y):
+    # Wrap computation inside a GradientTape for automatic differentiation.
+    with tf.GradientTape() as g:
+        pred = neural_net(x)
+        loss = cross_entropy(pred, y)
+
+    # Variables to update, i.e. trainable variables.
+    trainable_variables = list(weights.values()) + list(biases.values())
+
+    # Compute gradients.
+    gradients = g.gradient(loss, trainable_variables)
+
+    # Update weights/biases following gradients.
+    optimizer.apply_gradients(list(zip(gradients, trainable_variables)))
+
+
+# %%
+# Visualize weights & biases as histogram in Tensorboard.
+def summarize_weights(step):
+    for w in weights:
+        tf.summary.histogram(w.replace("_", "/"), weights[w], step=step)
+    for b in biases:
+        tf.summary.histogram(b.replace("_", "/"), biases[b], step=step)
+
+
+# %%
+# Create a Summary Writer to log the metrics to Tensorboard.
+summary_writer = tf.summary.create_file_writer(logs_path)
+
+total_loss = 0
+loss_count = 0
+
+total_accuracy = 0
+accuracy_count = 0
+
+# %%
+# Run training for the given number of steps.
+for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):
+
+    # Start to trace the computation graph. The computation graph remains
+    # the same at each step, so we just need to export it once.
+    if step == 1:
+        tf.summary.trace_on(graph=True, profiler=True)
+
+    # Run the optimization (computation graph).
+    run_optimization(batch_x, batch_y)
+
+    # Export the computation graph to tensorboard after the first
+    # computation step was performed.
+    if step == 1:
+        with summary_writer.as_default():
+            tf.summary.trace_export(name="trace", step=0, profiler_outdir=logs_path)
+
+    if step % display_step == 0:
+        pred = neural_net(batch_x)
+        loss = cross_entropy(pred, batch_y)
+        total_loss += loss
+        loss_count += 1
+        acc = accuracy(pred, batch_y)
+        total_accuracy += acc
+        accuracy_count += 1
+        print_time = timeit.default_timer()
+        print("step: %i, loss: %f, accuracy: %f" % (step, loss, acc))
+        skipped_time += timeit.default_timer() - print_time
+
+        # Write loss/acc metrics & weights to Tensorboard every few steps,
+        # to avoid storing too much data.
+        with summary_writer.as_default():
+            tf.summary.scalar("loss", loss, step=step)
+            tf.summary.scalar("accuracy", acc, step=step)
+            summarize_weights(step)
+
+time = timeit.default_timer() - start_time - skipped_time
+avg_loss = float(total_loss) / float(loss_count)
+avg_accuracy = float(total_accuracy) / float(accuracy_count)
+
+write_csv(__file__, training_steps, float(avg_accuracy), float(avg_loss), time)
+
+# %%
+"""
+### Run Tensorboard
+
+To run tensorboard, run the following command in your terminal:
+```
+tensorboard --logdir=/tmp/tensorflow_logs
+```
+
+And then connect your web browser to: [http://localhost:6006](http://localhost:6006)
+
+"""
+
+# %%
+"""
+![tensorboard1](../../../resources/img/tf2/tensorboard1.png)
+"""
+
+# %%
+"""
+![tensorboard2](../../../resources/img/tf2/tensorboard2.png)
+"""
+
+# %%
+"""
+![tensorboard3](../../../resources/img/tf2/tensorboard3.png)
+"""
+
+# %%
+"""
+![tensorboard4](../../../resources/img/tf2/tensorboard4.png)
+"""
diff --git a/com.ibm.wala.cast.python.test/data/tf2_test_dataset11.py b/com.ibm.wala.cast.python.test/data/tf2_test_dataset11.py
new file mode 100644
index 000000000..8b1610b65
--- /dev/null
+++ b/com.ibm.wala.cast.python.test/data/tf2_test_dataset11.py
@@ -0,0 +1,18 @@
+# Test enumerating a dataset in top-level code. The first tuple element isn't a tensor.
+
+import tensorflow as tf
+
+
+def f(a):
+    pass
+
+
+def g(a):
+    pass
+
+
+dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
+
+for step, element in enumerate(dataset, 1):
+    f(step)
+    g(element)
diff --git a/com.ibm.wala.cast.python.test/data/tf2_test_dataset12.py b/com.ibm.wala.cast.python.test/data/tf2_test_dataset12.py
new file mode 100644
index 000000000..272924b89
--- /dev/null
+++ b/com.ibm.wala.cast.python.test/data/tf2_test_dataset12.py
@@ -0,0 +1,23 @@
+# Test enumerating a global dataset in a function. The first tuple element isn't a tensor.
+
+import tensorflow as tf
+
+
+def f(a):
+    pass
+
+
+def g(a):
+    pass
+
+
+dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
+
+
+def h():
+    for step, element in enumerate(dataset, 1):
+        f(step)
+        g(element)
+
+
+h()
diff --git a/com.ibm.wala.cast.python.test/data/tf2_test_dataset13.py b/com.ibm.wala.cast.python.test/data/tf2_test_dataset13.py
new file mode 100644
index 000000000..8af364489
--- /dev/null
+++ b/com.ibm.wala.cast.python.test/data/tf2_test_dataset13.py
@@ -0,0 +1,22 @@
+# Test enumerating a local dataset in a function. The first tuple element isn't a tensor.
+
+import tensorflow as tf
+
+
+def f(a):
+    pass
+
+
+def g(a):
+    pass
+
+
+def h():
+    dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
+
+    for step, element in enumerate(dataset, 1):
+        f(step)
+        g(element)
+
+
+h()
diff --git a/com.ibm.wala.cast.python.test/data/tf2_test_dataset14.py b/com.ibm.wala.cast.python.test/data/tf2_test_dataset14.py
new file mode 100644
index 000000000..18a67bac3
--- /dev/null
+++ b/com.ibm.wala.cast.python.test/data/tf2_test_dataset14.py
@@ -0,0 +1,21 @@
+# Test enumerating a dataset passed as an argument. The first tuple element isn't a tensor.
+
+import tensorflow as tf
+
+
+def f(a):
+    pass
+
+
+def g(a):
+    pass
+
+
+def h(ds):
+    for step, element in enumerate(ds, 1):
+        f(step)
+        g(element)
+
+
+dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
+h(dataset)
diff --git a/com.ibm.wala.cast.python.test/data/tf2_test_dataset15.py b/com.ibm.wala.cast.python.test/data/tf2_test_dataset15.py
new file mode 100644
index 000000000..85b88b94c
--- /dev/null
+++ b/com.ibm.wala.cast.python.test/data/tf2_test_dataset15.py
@@ -0,0 +1,21 @@
+# Test iterating over an enumerated dataset passed as an argument. The first tuple element isn't a tensor.
+
+import tensorflow as tf
+
+
+def f(a):
+    pass
+
+
+def g(a):
+    pass
+
+
+def h(eds):
+    for step, element in eds:
+        f(step)
+        g(element)
+
+
+dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
+h(enumerate(dataset, 1))