From a934e0e9c2aba1e5e5f093d7375369e61d6b93bd Mon Sep 17 00:00:00 2001 From: Raffi Khatchadourian Date: Mon, 5 Feb 2024 17:40:13 -0500 Subject: [PATCH] Fix #140 (#144) * Fix https://github.com/wala/ML/issues/140. * Add test cases for https://github.com/wala/ML/issues/140. --- .../python/ml/test/TestTensorflow2Model.java | 65 +++++ .../ml/client/PythonTensorAnalysisEngine.java | 82 +++++- .../data/tensorboard_example.py | 248 ++++++++++++++++++ .../data/tf2_test_dataset11.py | 18 ++ .../data/tf2_test_dataset12.py | 23 ++ .../data/tf2_test_dataset13.py | 22 ++ .../data/tf2_test_dataset14.py | 21 ++ .../data/tf2_test_dataset15.py | 21 ++ 8 files changed, 499 insertions(+), 1 deletion(-) create mode 100644 com.ibm.wala.cast.python.test/data/tensorboard_example.py create mode 100644 com.ibm.wala.cast.python.test/data/tf2_test_dataset11.py create mode 100644 com.ibm.wala.cast.python.test/data/tf2_test_dataset12.py create mode 100644 com.ibm.wala.cast.python.test/data/tf2_test_dataset13.py create mode 100644 com.ibm.wala.cast.python.test/data/tf2_test_dataset14.py create mode 100644 com.ibm.wala.cast.python.test/data/tf2_test_dataset15.py diff --git a/com.ibm.wala.cast.python.ml.test/source/com/ibm/wala/cast/python/ml/test/TestTensorflow2Model.java b/com.ibm.wala.cast.python.ml.test/source/com/ibm/wala/cast/python/ml/test/TestTensorflow2Model.java index 7ff5b1db6..72771249e 100644 --- a/com.ibm.wala.cast.python.ml.test/source/com/ibm/wala/cast/python/ml/test/TestTensorflow2Model.java +++ b/com.ibm.wala.cast.python.ml.test/source/com/ibm/wala/cast/python/ml/test/TestTensorflow2Model.java @@ -866,6 +866,71 @@ public void testDataset10() test("tf2_test_dataset10.py", "add", 2, 2, 2, 3); } + /** + * Test enumerating a dataset (https://github.com/wala/ML/issues/140). The first element of the + * tuple returned isn't a tensor. 
+   */
+  @Test
+  public void testDataset11()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tf2_test_dataset11.py", "f", 0, 0);
+    test("tf2_test_dataset11.py", "g", 1, 1, 2);
+  }
+
+  /**
+   * Test enumerating a dataset (https://github.com/wala/ML/issues/140). The first element of the
+   * tuple returned isn't a tensor. The dataset is a global enumerated within a function.
+   */
+  @Test
+  public void testDataset12()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tf2_test_dataset12.py", "f", 0, 0);
+    test("tf2_test_dataset12.py", "g", 1, 1, 2);
+  }
+
+  /**
+   * Test enumerating a dataset (https://github.com/wala/ML/issues/140). The first element of the
+   * tuple returned isn't a tensor. The dataset is created within the enumerating function.
+   */
+  @Test
+  public void testDataset13()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tf2_test_dataset13.py", "f", 0, 0);
+    test("tf2_test_dataset13.py", "g", 1, 1, 2);
+  }
+
+  /**
+   * Test enumerating a dataset (https://github.com/wala/ML/issues/140). The first element of the
+   * tuple returned isn't a tensor. The dataset is passed to the enumerating function.
+   */
+  @Test
+  public void testDataset14()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tf2_test_dataset14.py", "f", 0, 0);
+    test("tf2_test_dataset14.py", "g", 1, 1, 2);
+  }
+
+  /**
+   * Test enumerating a dataset (https://github.com/wala/ML/issues/140). The first element of the
+   * tuple returned isn't a tensor. The result of enumerate() is passed to the iterating function.
+   */
+  @Test
+  public void testDataset15()
+      throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException {
+    test("tf2_test_dataset15.py", "f", 0, 0);
+    test("tf2_test_dataset15.py", "g", 1, 1, 2);
+  }
+
+  /**
+   * Test enumerating a dataset (https://github.com/wala/ML/issues/140). The first element of the
+   * tuple returned isn't a tensor. The dataset is enumerated in a TensorBoard training script.
+ */ + @Test + public void testTensorboardExample() + throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException { + test("tensorboard_example.py", "summarize_weights", 0, 12); + } + @Test public void testTensorList() throws ClassHierarchyException, IllegalArgumentException, CancelException, IOException { diff --git a/com.ibm.wala.cast.python.ml/source/com/ibm/wala/cast/python/ml/client/PythonTensorAnalysisEngine.java b/com.ibm.wala.cast.python.ml/source/com/ibm/wala/cast/python/ml/client/PythonTensorAnalysisEngine.java index 58fc006ba..702126a7f 100644 --- a/com.ibm.wala.cast.python.ml/source/com/ibm/wala/cast/python/ml/client/PythonTensorAnalysisEngine.java +++ b/com.ibm.wala.cast.python.ml/source/com/ibm/wala/cast/python/ml/client/PythonTensorAnalysisEngine.java @@ -9,6 +9,7 @@ import com.ibm.wala.cast.python.client.PythonAnalysisEngine; import com.ibm.wala.cast.python.ml.analysis.TensorTypeAnalysis; import com.ibm.wala.cast.python.ml.types.TensorType; +import com.ibm.wala.cast.python.ssa.PythonInvokeInstruction; import com.ibm.wala.cast.python.ssa.PythonPropertyRead; import com.ibm.wala.cast.python.types.PythonTypes; import com.ibm.wala.cast.types.AstMethodReference; @@ -19,6 +20,8 @@ import com.ibm.wala.ipa.callgraph.CGNode; import com.ibm.wala.ipa.callgraph.CallGraph; import com.ibm.wala.ipa.callgraph.propagation.AllocationSiteInNode; +import com.ibm.wala.ipa.callgraph.propagation.ConcreteTypeKey; +import com.ibm.wala.ipa.callgraph.propagation.ConstantKey; import com.ibm.wala.ipa.callgraph.propagation.InstanceKey; import com.ibm.wala.ipa.callgraph.propagation.LocalPointerKey; import com.ibm.wala.ipa.callgraph.propagation.PointerAnalysis; @@ -88,6 +91,12 @@ public class PythonTensorAnalysisEngine extends PythonAnalysisEngine errorLog = HashMapFactory.make(); private static Set getDataflowSources( @@ -275,7 +284,7 @@ private static boolean processInstructionInterprocedurally( IClass concreteType = asin.getConcreteType(); 
TypeReference reference = concreteType.getReference(); - if (reference.equals(DATASET)) { + if (reference.equals(DATASET) && isDatasetTensorElement(src, use, node, pointerAnalysis)) { sources.add(src); logger.info("Added dataflow source from tensor dataset: " + src + "."); return true;
@@ -286,6 +295,86 @@ private static boolean processInstructionInterprocedurally(
     return false;
   }
 
+  /**
+   * Returns true iff the given {@link PointsToSetVariable} refers to a tensor dataset element of
+   * the dataset defined by the given value number in the given {@link CGNode}.
+   *
+   * <p>The source is rejected only when the value is defined by a call to enumerate() and the
+   * source reads the first member (the enumeration number) of a tuple read from that value. A
+   * source that is not a local defined by a property read is accepted as is.
+   *
+   * @param src The {@link PointsToSetVariable} to consider.
+   * @param val The value in the given {@link CGNode} representing the tensor dataset.
+   * @param node The {@link CGNode} containing the given {@link PointsToSetVariable} and value.
+   * @param pointerAnalysis The {@link PointerAnalysis} that includes points-to information for the
+   *     given {@link CGNode}.
+   * @return True iff src refers to a tensor dataset element defined by the dataset represented by
+   *     val in node.
+   */
+  private static boolean isDatasetTensorElement(
+      PointsToSetVariable src, int val, CGNode node, PointerAnalysis pointerAnalysis) {
+    SSAInstruction def = node.getDU().getDef(val);
+
+    // A value that isn't defined by an invocation isn't treated as the result of enumerate().
+    if (!(def instanceof PythonInvokeInstruction)) return true;
+
+    // Check whether we are calling enumerate(), as that returns a tuple.
+    // Get the invoked function.
+    int invocationUse = ((PythonInvokeInstruction) def).getUse(0);
+
+    PointerKey invocationUsePointerKey =
+        pointerAnalysis.getHeapModel().getPointerKeyForLocal(node, invocationUse);
+
+    for (InstanceKey functionInstance : pointerAnalysis.getPointsToSet(invocationUsePointerKey)) {
+      if (!(functionInstance instanceof ConcreteTypeKey)) continue;
+
+      TypeReference typeReference = ((ConcreteTypeKey) functionInstance).getType().getReference();
+      if (!typeReference.equals(ENUMERATE.getDeclaringClass())) continue;
+
+      // It's a call to enumerate(), where the returned value is an iterator over tuples. Each
+      // tuple consists of the enumeration number and the dataset element. Check that we are not
+      // looking at the enumeration number.
+
+      // The check only understands a source that is a local defined by a property read. Any other
+      // kind of source is kept as a dataset element rather than failing on a bad cast.
+      PointerKey srcPointerKey = src.getPointerKey();
+      if (!(srcPointerKey instanceof LocalPointerKey)) return true;
+
+      SSAInstruction srcInstruction =
+          node.getDU().getDef(((LocalPointerKey) srcPointerKey).getValueNumber());
+      if (!(srcInstruction instanceof PythonPropertyRead)) return true;
+
+      PythonPropertyRead srcDef = (PythonPropertyRead) srcInstruction;
+
+      // What does the member reference point to?
+      PointerKey memberRefPointerKey =
+          pointerAnalysis.getHeapModel().getPointerKeyForLocal(node, srcDef.getMemberRef());
+
+      for (InstanceKey memberInstance : pointerAnalysis.getPointsToSet(memberRefPointerKey)) {
+        // Only a constant member reference is compared against the first tuple position.
+        if (!(memberInstance instanceof ConstantKey)) continue;
+
+        Object value = ((ConstantKey) memberInstance).getValue();
+
+        // If it's the first tuple element. The comparison starts from the literal so that a
+        // null value can't cause a failure.
+        if (Integer.valueOf(0).equals(value)) {
+          // Now that we know it's the first tuple element, we now need to know whether it's the
+          // first tuple, i.e., the one returned by enumerate. To do that, we examine the object
+          // being referenced on the RHS.
+          SSAInstruction objRefDef = node.getDU().getDef(srcDef.getObjectRef());
+
+          // If the object being read is that of the dataset, we know that this is the first tuple
+          // read of the result of enumerate() on the dataset.
+          if (objRefDef instanceof PythonPropertyRead
+              && ((PythonPropertyRead) objRefDef).getObjectRef() == val) return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
   /**
    * True iff the given {@link SSAInstruction} constitutes individual elements.
* diff --git a/com.ibm.wala.cast.python.test/data/tensorboard_example.py b/com.ibm.wala.cast.python.test/data/tensorboard_example.py new file mode 100644 index 000000000..408ae591f --- /dev/null +++ b/com.ibm.wala.cast.python.test/data/tensorboard_example.py @@ -0,0 +1,248 @@ +# %% +""" +## Tensorboard +Graph, Loss, Accuracy & Weights visualization using Tensorboard and TensorFlow v2. This example is using the MNIST database of handwritten digits (http://yann.lecun.com/exdb/mnist/). + +- Author: Aymeric Damien +- Project: https://github.com/aymericdamien/TensorFlow-Examples/ +""" + +# %% + +import tensorflow as tf +import numpy as np + +from scripts.utils import write_csv +import timeit + +# %% +# Path to save logs into. +logs_path = "/tmp/tensorflow_logs/example/" + +# MNIST dataset parameters. +num_classes = 10 # total classes (0-9 digits). +num_features = 784 # data features (img shape: 28*28). + +# Training parameters. +learning_rate = 0.001 +training_steps = 3000 +batch_size = 256 +display_step = 100 + +# Network parameters. +n_hidden_1 = 128 # 1st layer number of neurons. +n_hidden_2 = 256 # 2nd layer number of neurons. + +# %% +# Prepare MNIST data. +from tensorflow.keras.datasets import mnist + +(x_train, y_train), (x_test, y_test) = mnist.load_data() +# Convert to float32. +x_train, x_test = np.array(x_train, np.float32), np.array(x_test, np.float32) +# Flatten images to 1-D vector of 784 features (28*28). +x_train, x_test = x_train.reshape([-1, num_features]), x_test.reshape( + [-1, num_features] +) +# Normalize images value from [0, 255] to [0, 1]. +x_train, x_test = x_train / 255.0, x_test / 255.0 + +# %% +# Use tf.data API to shuffle and batch data. +train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1) + +start_time = timeit.default_timer() +skipped_time = 0 + +# %% +# Store layers weight & bias + +# A random value generator to initialize weights. 
+random_normal = tf.initializers.RandomNormal() + +weights = { + "h1_weights": tf.Variable( + random_normal([num_features, n_hidden_1]), name="h1_weights" + ), + "h2_weights": tf.Variable( + random_normal([n_hidden_1, n_hidden_2]), name="h2_weights" + ), + "logits_weights": tf.Variable( + random_normal([n_hidden_2, num_classes]), name="logits_weights" + ), +} +biases = { + "h1_bias": tf.Variable(tf.zeros([n_hidden_1]), name="h1_bias"), + "h2_bias": tf.Variable(tf.zeros([n_hidden_2]), name="h2_bias"), + "logits_bias": tf.Variable(tf.zeros([num_classes]), name="logits_bias"), +} + +# %% +# Construct the model, encapsulating all ops into scopes to make +# Tensorboard's Graph visualization more convenient. + + +# The computation graph to be traced. +@tf.function +def neural_net(x): + with tf.name_scope("Model"): + with tf.name_scope("HiddenLayer1"): + # Hidden fully connected layer with 128 neurons. + layer_1 = tf.add(tf.matmul(x, weights["h1_weights"]), biases["h1_bias"]) + # Apply sigmoid to layer_1 output for non-linearity. + layer_1 = tf.nn.sigmoid(layer_1) + with tf.name_scope("HiddenLayer2"): + # Hidden fully connected layer with 256 neurons. + layer_2 = tf.add( + tf.matmul(layer_1, weights["h2_weights"]), biases["h2_bias"] + ) + # Apply sigmoid to layer_2 output for non-linearity. + layer_2 = tf.nn.sigmoid(layer_2) + with tf.name_scope("LogitsLayer"): + # Output fully connected layer with a neuron for each class. + out_layer = ( + tf.matmul(layer_2, weights["logits_weights"]) + biases["logits_bias"] + ) + # Apply softmax to normalize the logits to a probability distribution. + out_layer = tf.nn.softmax(out_layer) + return out_layer + + +# %% +# Cross-Entropy loss function. +def cross_entropy(y_pred, y_true): + with tf.name_scope("CrossEntropyLoss"): + # Encode label to a one hot vector. + y_true = tf.one_hot(y_true, depth=num_classes) + # Clip prediction values to avoid log(0) error. + y_pred = tf.clip_by_value(y_pred, 1e-9, 1.0) + # Compute cross-entropy.
+ return tf.reduce_mean(-tf.reduce_sum(y_true * tf.math.log(y_pred))) + + +# Accuracy metric. +def accuracy(y_pred, y_true): + with tf.name_scope("Accuracy"): + # Predicted class is the index of highest score in prediction vector (i.e. argmax). + correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64)) + return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1) + + +# Stochastic gradient descent optimizer. +with tf.name_scope("Optimizer"): + optimizer = tf.optimizers.SGD(learning_rate) + + +# %% +# Optimization process. +def run_optimization(x, y): + # Wrap computation inside a GradientTape for automatic differentiation. + with tf.GradientTape() as g: + pred = neural_net(x) + loss = cross_entropy(pred, y) + + # Variables to update, i.e. trainable variables. + trainable_variables = list(weights.values()) + list(biases.values()) + + # Compute gradients. + gradients = g.gradient(loss, trainable_variables) + + # Update weights/biases following gradients. + optimizer.apply_gradients(list(zip(gradients, trainable_variables))) + + +# %% +# Visualize weights & biases as histogram in Tensorboard. +def summarize_weights(step): + for w in weights: + tf.summary.histogram(w.replace("_", "/"), weights[w], step=step) + for b in biases: + tf.summary.histogram(b.replace("_", "/"), biases[b], step=step) + + +# %% +# Create a Summary Writer to log the metrics to Tensorboard. +summary_writer = tf.summary.create_file_writer(logs_path) + +total_loss = 0 +loss_count = 0 + +total_accuracy = 0 +accuracy_count = 0 + +# %% +# Run training for the given number of steps. +for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1): + + # Start to trace the computation graph. The computation graph remains + # the same at each step, so we just need to export it once. + if step == 1: + tf.summary.trace_on(graph=True, profiler=True) + + # Run the optimization (computation graph).
+ run_optimization(batch_x, batch_y) + + # Export the computation graph to tensorboard after the first + # computation step was performed. + if step == 1: + with summary_writer.as_default(): + tf.summary.trace_export(name="trace", step=0, profiler_outdir=logs_path) + + if step % display_step == 0: + pred = neural_net(batch_x) + loss = cross_entropy(pred, batch_y) + total_loss += loss + loss_count += 1 + acc = accuracy(pred, batch_y) + total_accuracy += acc + accuracy_count += 1 + print_time = timeit.default_timer() + print("step: %i, loss: %f, accuracy: %f" % (step, loss, acc)) + skipped_time += timeit.default_timer() - print_time + + # Write loss/acc metrics & weights to Tensorboard every few steps, + # to avoid storing too much data. + with summary_writer.as_default(): + tf.summary.scalar("loss", loss, step=step) + tf.summary.scalar("accuracy", acc, step=step) + summarize_weights(step) + +time = timeit.default_timer() - start_time - skipped_time +avg_loss = float(total_loss) / float(loss_count) +avg_accuracy = float(total_accuracy) / float(accuracy_count) + +write_csv(__file__, training_steps, float(avg_accuracy), float(avg_loss), time) + +# %% +""" +### Run Tensorboard + +To run tensorboard, run the following command in your terminal: +``` +tensorboard --logdir=/tmp/tensorflow_logs +``` + +And then connect your web browser to: [http://localhost:6006](http://localhost:6006) + +""" + +# %% +""" +![tensorboard1](../../../resources/img/tf2/tensorboard1.png) +""" + +# %% +""" +![tensorboard2](../../../resources/img/tf2/tensorboard2.png) +""" + +# %% +""" +![tensorboard3](../../../resources/img/tf2/tensorboard3.png) +""" + +# %% +""" +![tensorboard4](../../../resources/img/tf2/tensorboard4.png) +""" diff --git a/com.ibm.wala.cast.python.test/data/tf2_test_dataset11.py b/com.ibm.wala.cast.python.test/data/tf2_test_dataset11.py new file mode 100644 index 000000000..8b1610b65 --- /dev/null +++ b/com.ibm.wala.cast.python.test/data/tf2_test_dataset11.py @@ -0,0 +1,18 @@ +# 
Test enumerate. The first element of the tuple returned isn't a tensor. + +import tensorflow as tf + + +def f(a): + pass + + +def g(a): + pass + + +dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3]) + +for step, element in enumerate(dataset, 1): + f(step) + g(element) diff --git a/com.ibm.wala.cast.python.test/data/tf2_test_dataset12.py b/com.ibm.wala.cast.python.test/data/tf2_test_dataset12.py new file mode 100644 index 000000000..272924b89 --- /dev/null +++ b/com.ibm.wala.cast.python.test/data/tf2_test_dataset12.py @@ -0,0 +1,23 @@ +# Enumerate a global dataset in a function. The first tuple element isn't a tensor. + +import tensorflow as tf + + +def f(a): + pass + + +def g(a): + pass + + +dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3]) + + +def h(): + for step, element in enumerate(dataset, 1): + f(step) + g(element) + + +h() diff --git a/com.ibm.wala.cast.python.test/data/tf2_test_dataset13.py b/com.ibm.wala.cast.python.test/data/tf2_test_dataset13.py new file mode 100644 index 000000000..8af364489 --- /dev/null +++ b/com.ibm.wala.cast.python.test/data/tf2_test_dataset13.py @@ -0,0 +1,22 @@ +# Enumerate a local dataset in a function. The first tuple element isn't a tensor. + +import tensorflow as tf + + +def f(a): + pass + + +def g(a): + pass + + +def h(): + dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3]) + + for step, element in enumerate(dataset, 1): + f(step) + g(element) + + +h() diff --git a/com.ibm.wala.cast.python.test/data/tf2_test_dataset14.py b/com.ibm.wala.cast.python.test/data/tf2_test_dataset14.py new file mode 100644 index 000000000..18a67bac3 --- /dev/null +++ b/com.ibm.wala.cast.python.test/data/tf2_test_dataset14.py @@ -0,0 +1,21 @@ +# Enumerate a dataset passed as an argument. The first tuple element isn't a tensor.
+ +import tensorflow as tf + + +def f(a): + pass + + +def g(a): + pass + + +def h(ds): + for step, element in enumerate(ds, 1): + f(step) + g(element) + + +dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3]) +h(dataset) diff --git a/com.ibm.wala.cast.python.test/data/tf2_test_dataset15.py b/com.ibm.wala.cast.python.test/data/tf2_test_dataset15.py new file mode 100644 index 000000000..85b88b94c --- /dev/null +++ b/com.ibm.wala.cast.python.test/data/tf2_test_dataset15.py @@ -0,0 +1,21 @@ +# Loop over a passed-in enumerate(). The first tuple element isn't a tensor. + +import tensorflow as tf + + +def f(a): + pass + + +def g(a): + pass + + +def h(eds): + for step, element in eds: + f(step) + g(element) + + +dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3]) +h(enumerate(dataset, 1))