Move confusion matrix from %%ml to library. (#159)

qimingj · qimingj · commit e117138bacfb · 2017-02-21T17:33:15.000-08:00
* Move confusion matrix from %%ml to library.

This is part of the effort to move the %%ml magic functionality into the library to provide a consistent (Python-only) experience.

* Add a comment.
diff --git a/datalab/mlalpha/_confusion_matrix.py b/datalab/mlalpha/_confusion_matrix.py
@@ -11,57 +11,100 @@
 # the License.
 
 
-from plotly.offline import iplot
+import google.cloud.ml as ml
+import numpy as np
+import json
+import matplotlib.pyplot as plt
+import pandas as pd
+from sklearn.metrics import confusion_matrix
+
+import datalab.bigquery as bq
+import datalab.data as data
 
 
 class ConfusionMatrix(object):
   """Represents a confusion matrix."""
 
-  def __init__(self, predicted_labels, true_labels, counts):
-    """Initializes an instance of a ComfusionMatrix. the length of predicted_values,
-       true_values, count must be the same.
+  def __init__(self, cm, labels):
+    """
+    Args:
+      cm: a 2-dimensional matrix with row index being target, column index being predicted,
+          and values being count.
+      labels: the labels whose order matches the row/column indexes.
+    """
+    self._cm = cm
+    self._labels = labels
 
+  @staticmethod
+  def from_csv(input_csv, headers=None, schema_file=None):
+    """Create a ConfusionMatrix from a csv file.
     Args:
-      predicted_labels: a list of predicted labels.
-      true_labels: a list of true labels.
-      counts: a list of count for each (predicted, true) combination.
+      input_csv: Path to a Csv file (with no header). Can be local or GCS path.
+      headers: Csv headers. If present, it must include 'target' and 'predicted'.
+      schema_file: Path to a JSON file containing BigQuery schema. Used if "headers" is None.
+          If present, it must include 'target' and 'predicted' columns.
+    Returns:
+      A ConfusionMatrix that can be plotted.
+    Raises:
+      ValueError if both headers and schema_file are None, or it does not include 'target'
+          or 'predicted' columns.
+    """
 
-    Raises: Exception if predicted_labels, true_labels, and counts are not of the same size
+    if headers is not None:
+      names = headers
+    elif schema_file is not None:
+      with ml.util._file.open_local_or_gcs(schema_file, mode='r') as f:
+        schema = json.load(f)
+      names = [x['name'] for x in schema]
+    else:
+      raise ValueError('Either headers or schema_file is needed')
+    with ml.util._file.open_local_or_gcs(input_csv, mode='r') as f:
+      df = pd.read_csv(f, names=names)
+    if 'target' not in df or 'predicted' not in df:
+      raise ValueError('Cannot find "target" or "predicted" column')
+
+    labels = sorted(set(df['target']) | set(df['predicted']))
+    cm = confusion_matrix(df['target'], df['predicted'], labels=labels)
+    return ConfusionMatrix(cm, labels)
+
+  @staticmethod
+  def from_bigquery(sql):
+    """Create a ConfusionMatrix from a BigQuery table or query.
+    Args:
+      sql: Can be one of:
+          A SQL query string.
+          A SQL Query module defined with '%%sql --name [module_name]'.
+          A Bigquery table.
+      The query results or table must include "target", "predicted" columns.
+    Returns:
+      A ConfusionMatrix that can be plotted.
+    Raises:
+      ValueError if query results or table does not include 'target' or 'predicted' columns.
     """
-    if len(predicted_labels) != len(true_labels) or len(true_labels) != len(counts):
-      raise Exception('The input predicted_labels, true_labels, counts need to be same size.')
-    self._all_labels = list(set(predicted_labels) | set(true_labels))
-    data = []
-    for value in self._all_labels:
-      predicts_for_current_true_label = \
-          {p: c for p, t, c in zip(predicted_labels, true_labels, counts) if t == value}
-      # sort by all_values and fill in zeros if needed
-      predicts_for_current_true_label = [predicts_for_current_true_label.get(v, 0)
-          for v in self._all_labels]
-      data.append(predicts_for_current_true_label)
-    self._data = data
+
+    query, _ = data.SqlModule.get_sql_statement_with_environment(sql, {})
+    sql = ('SELECT target, predicted, count(*) as count FROM (%s) group by target, predicted'
+        % query.sql)
+    df = bq.Query(sql).results().to_dataframe()
+    labels = sorted(set(df['target']) | set(df['predicted']))
+    labels_count = len(labels)
+    df['target'] = [labels.index(x) for x in df['target']]
+    df['predicted'] = [labels.index(x) for x in df['predicted']]
+    cm = [[0]*labels_count for i in range(labels_count)]
+    for index, row in df.iterrows():
+      cm[row['target']][row['predicted']] = row['count']
+    return ConfusionMatrix(cm, labels)
 
   def plot(self):
     """Plot the confusion matrix."""
-    figure_data = \
-    {
-      "data": [
-        {
-          "x": self._all_labels,
-          "y": self._all_labels,
-          "z": self._data,
-          "colorscale": "YlGnBu",
-          "type": "heatmap"
-        }
-      ],
-      "layout": {
-        "title": "Confusion Matrix",
-        "xaxis": {
-          "title": "Predicted value",
-        },
-        "yaxis": {
-          "title": "True Value",
-        }
-      }
-    }
-    iplot(figure_data)
+
+    plt.imshow(self._cm, interpolation='nearest', cmap=plt.cm.Blues)
+    plt.title('Confusion matrix')
+    plt.colorbar()
+    tick_marks = np.arange(len(self._labels))
+    plt.xticks(tick_marks, self._labels, rotation=45)
+    plt.yticks(tick_marks, self._labels)
+    plt.tight_layout()
+    plt.ylabel('True label')
+    plt.xlabel('Predicted label')
+  
diff --git a/datalab/mlalpha/commands/_ml.py b/datalab/mlalpha/commands/_ml.py
@@ -18,11 +18,9 @@
 
 import collections
 import google.cloud.ml as cloudml
-import matplotlib.pyplot as plt
 import numpy as np
 import os
 import pandas as pd
-from sklearn.metrics import confusion_matrix
 import yaml
 
 
@@ -93,23 +91,6 @@ def ml(line, cell=None):
                                     required=True)
   batch_predict_parser.set_defaults(func=_batch_predict)
 
-  confusion_matrix_parser = parser.subcommand('confusion_matrix',
-                                              'Plot confusion matrix. The source is provided ' +
-                                              'in one of "csv", "bqtable", and "sql" params.')
-  confusion_matrix_parser.add_argument('--csv',
-                                       help='GCS or local path of CSV file which contains ' +
-                                            '"target", "predicted" columns at least. The CSV ' +
-                                            'either comes with a schema file in the same dir, ' +
-                                            'or specify "headers: name1, name2..." in cell.')
-  confusion_matrix_parser.add_argument('--bqtable',
-                                       help='name of the BigQuery table in the form of ' + 
-                                            'dataset.table.')
-  confusion_matrix_parser.add_argument('--sql',
-                                       help='name of the sql module defined in previous cell ' + 
-                                            'which should return "target", "predicted", ' +
-                                            'and "count" columns at least in results.')
-  confusion_matrix_parser.set_defaults(func=_confusion_matrix)
-
   namespace = datalab.utils.commands.notebook_environment()
   return datalab.utils.commands.handle_magic_line(line, cell, parser, namespace=namespace)
 
@@ -181,66 +162,3 @@ def _predict(args, cell):
 
 def _batch_predict(args, cell):
   return _run_package(args, cell, 'batch_predict')
-
-
-def _plot_confusion_matrix(cm, labels):
-  plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
-  plt.title('Confusion matrix')
-  plt.colorbar()
-  tick_marks = np.arange(len(labels))
-  plt.xticks(tick_marks, labels, rotation=45)
-  plt.yticks(tick_marks, labels)
-  plt.tight_layout()
-  plt.ylabel('True label')
-  plt.xlabel('Predicted label')
-
-
-def _confusion_matrix_from_csv(input_csv, cell):
-  schema_file = input_csv + '.schema.yaml'
-  headers = None
-  if cell is not None:
-    env = datalab.utils.commands.notebook_environment()
-    config = datalab.utils.commands.parse_config(cell, env)
-    headers_str = config.get('headers', None)
-    if headers_str is not None:
-      headers = [x.strip() for x in headers_str.split(',')]
-  if headers is not None:
-    with cloudml.util._file.open_local_or_gcs(input_csv, mode='r') as f:
-      df = pd.read_csv(f, names=headers)
-  elif cloudml.util._file.file_exists(schema_file):
-    df = datalab.mlalpha.csv_to_dataframe(input_csv, schema_file)
-  else:
-    raise Exception('headers is missing from cell, ' +
-                    'and there is no schema file in the same dir as csv')
-  labels = sorted(set(df['target']) | set(df['predicted']))
-  cm = confusion_matrix(df['target'], df['predicted'], labels=labels)
-  return cm, labels
-
-
-def _confusion_matrix_from_query(sql_module_name, bq_table):
-  if sql_module_name is not None:
-    item = datalab.utils.commands.get_notebook_item(sql_module_name)
-    query, _ = datalab.data.SqlModule.get_sql_statement_with_environment(item, {})
-  else:
-    query = ('select target, predicted, count(*) as count from %s group by target, predicted'
-             % bq_table)
-  dfbq = datalab.bigquery.Query(query).results().to_dataframe()
-  labels = sorted(set(dfbq['target']) | set(dfbq['predicted']))
-  labels_count = len(labels)
-  dfbq['target'] = [labels.index(x) for x in dfbq['target']]
-  dfbq['predicted'] = [labels.index(x) for x in dfbq['predicted']]
-  cm = [[0]*labels_count for i in range(labels_count)]
-  for index, row in dfbq.iterrows():
-    cm[row['target']][row['predicted']] = row['count']
-  return cm, labels
-
-
-def _confusion_matrix(args, cell):
-  if args['csv'] is not None:
-    #TODO: Maybe add cloud run for large CSVs with federated table.
-    cm, labels = _confusion_matrix_from_csv(args['csv'], cell)
-  elif args['sql'] is not None or args['bqtable'] is not None:
-    cm, labels = _confusion_matrix_from_query(args['sql'], args['bqtable'])
-  else:
-    raise Exception('One of "csv", "bqtable", and "sql" param is needed.')
-  _plot_confusion_matrix(cm, labels)