Move job, models, and feature_slice_view plotting to API. (#167)

qimingj · qimingj · commit 6b61f15ecf72 · 2017-02-13T14:14:14.000-08:00
* Move job, models, and feature_slice_view plotting to API.

* Follow up on CR comments.
diff --git a/datalab/mlalpha/__init__.py b/datalab/mlalpha/__init__.py
@@ -19,7 +19,7 @@
 from ._metadata import Metadata
 from ._local_predictor import LocalPredictor
 from ._cloud_predictor import CloudPredictor
-from ._job import Jobs
+from ._job import Jobs, Job
 from ._summary import Summary
 from ._tensorboard import TensorBoard
 from ._dataset import CsvDataSet, BigQueryDataSet
@@ -28,6 +28,7 @@
 from ._confusion_matrix import ConfusionMatrix
 from ._analysis import csv_to_dataframe
 from ._package_runner import PackageRunner
+from ._feature_slice_view import FeatureSliceView
 
 from plotly.offline import init_notebook_mode
 
diff --git a/datalab/mlalpha/_cloud_models.py b/datalab/mlalpha/_cloud_models.py
@@ -15,6 +15,7 @@
 from googleapiclient import discovery
 import os
 import time
+import yaml
 
 import datalab.context
 import datalab.storage
@@ -29,38 +30,31 @@
 class CloudModels(object):
   """Represents a list of Cloud ML models for a project."""
 
-  def __init__(self, project_id=None, credentials=None, api=None):
+  def __init__(self, project_id=None):
     """Initializes an instance of a CloudML Model list that is iteratable
            ("for model in CloudModels()").
 
     Args:
       project_id: project_id of the models. If not provided, default project_id will be used.
-      credentials: credentials used to talk to CloudML service. If not provided, default credentials
-          will be used.
-      api: an optional CloudML API client.
     """
     if project_id is None:
       project_id = datalab.context.Context.default().project_id
     self._project_id = project_id
-    if credentials is None:
-      credentials = datalab.context.Context.default().credentials
-    self._credentials = credentials
-    if api is None:
-      api = discovery.build('ml', 'v1beta1', credentials=self._credentials,
-                            discoveryServiceUrl=_CLOUDML_DISCOVERY_URL)
-    self._api = api
+    self._credentials = datalab.context.Context.default().credentials
+    self._api = discovery.build('ml', 'v1alpha3', credentials=self._credentials,
+                                discoveryServiceUrl=_CLOUDML_DISCOVERY_URL)
 
   def _retrieve_models(self, page_token, page_size):
-    list_info = self._api.projects().models().list(parent='projects/' + self._project_id,
-                                                   pageToken=page_token, pageSize=page_size).execute()
+    list_info = self._api.projects().models().list(
+        parent='projects/' + self._project_id, pageToken=page_token, pageSize=page_size).execute()
     models = list_info.get('models', [])
     page_token = list_info.get('nextPageToken', None)
     return models, page_token
 
   def __iter__(self):
     return iter(datalab.utils.Iterator(self._retrieve_models))
 
-  def get(self, model_name):
+  def get_model_details(self, model_name):
     """Get details of a model.
 
     Args:
@@ -95,11 +89,42 @@ def delete(self, model_name):
       full_name = ('projects/%s/models/%s' % (self._project_id, model_name))
     return self._api.projects().models().delete(name=full_name).execute()
 
+  def list(self, count=10):
+    """Display models of the current project as a table of name and default version.
+
+    Args:
+      count: upper limit of the number of models to list.
+    Raises:
+      Exception if it is called in a non-IPython environment.
+    """
+    import IPython
+    def _to_row(model):
+      row = {'name': model['name']}
+      if 'defaultVersion' in model:
+        # Keep only the last "/"-separated segment of the version's full name.
+        row['defaultVersion'] = model['defaultVersion']['name'].split('/')[-1]
+      return row
+
+    # Pairing with range(count) ends the loop once count models were taken, or earlier when
+    # iteration on self is exhausted. "self" is iterable (see __iter__() method).
+    rows = [_to_row(model) for _, model in zip(range(count), self)]
+    IPython.display.display(
+        datalab.utils.commands.render_dictionary(rows, ['name', 'defaultVersion']))
+    
+  def describe(self, model_name):
+    """Print the details of the given model as block-style YAML.
+
+    Args:
+      model_name: the name of the model to print details on.
+    """
+    details = self.get_model_details(model_name)
+    print yaml.safe_dump(details, default_flow_style=False)
+    
 
 class CloudModelVersions(object):
   """Represents a list of versions for a Cloud ML model."""
 
-  def __init__(self, model_name, project_id=None, credentials=None, api=None):
+  def __init__(self, model_name, project_id=None):
     """Initializes an instance of a CloudML model version list that is iteratable
         ("for version in CloudModelVersions()").
 
@@ -108,20 +133,12 @@ def __init__(self, model_name, project_id=None, credentials=None, api=None):
           ("projects/[project_id]/models/[model_name]") or just [model_name].
       project_id: project_id of the models. If not provided and model_name is not a full name
           (not including project_id), default project_id will be used.
-      credentials: credentials used to talk to CloudML service. If not provided, default
-          credentials will be used.
-      api: an optional CloudML API client.
     """
     if project_id is None:
-      project_id = datalab.context.Context.default().project_id
-    self._project_id = project_id
-    if credentials is None:
-      credentials = datalab.context.Context.default().credentials
-    self._credentials = credentials
-    if api is None:
-      api = discovery.build('ml', 'v1alpha3', credentials=self._credentials,
-                            discoveryServiceUrl=_CLOUDML_DISCOVERY_URL)
-    self._api = api
+      self._project_id = datalab.context.Context.default().project_id
+    self._credentials = datalab.context.Context.default().credentials
+    self._api = discovery.build('ml', 'v1alpha3', credentials=self._credentials,
+                                discoveryServiceUrl=_CLOUDML_DISCOVERY_URL)
     if not model_name.startswith('projects/'):
       model_name = ('projects/%s/models/%s' % (self._project_id, model_name))
     self._full_model_name = model_name
@@ -138,7 +155,7 @@ def _retrieve_versions(self, page_token, page_size):
   def __iter__(self):
     return iter(datalab.utils.Iterator(self._retrieve_versions))
 
-  def get(self, version_name):
+  def get_version_details(self, version_name):
     """Get details of a version.
 
     Args:
@@ -205,3 +222,28 @@ def delete(self, version_name):
     name = ('%s/versions/%s' % (self._full_model_name, version_name))
     response = self._api.projects().models().versions().delete(name=name).execute()
     self._wait_for_long_running_operation(response)
+    
+  def describe(self, version_name):
+    """Print the details of one version of this model as block-style YAML.
+
+    Args:
+      version_name: the name of the version in short form, such as "v1".
+    """
+    details = self.get_version_details(version_name)
+    version_yaml = yaml.safe_dump(details, default_flow_style=False)
+    print version_yaml
+
+  def list(self):
+    """List versions under the current model in a table view, one row per version.
+
+    Raises:
+      Exception if it is called in a non-IPython environment.
+    """
+    import IPython
+    # "self" is iterable (see __iter__() method). The name column shows only the last
+    # "/"-separated segment of the version's full name, the same way CloudModels.list() does.
+    data = [{'name': version['name'].split('/')[-1],
+             'deploymentUri': version['deploymentUri'], 'createTime': version['createTime']}
+            for version in self]
+    IPython.display.display(
+        datalab.utils.commands.render_dictionary(data, ['name', 'deploymentUri', 'createTime']))
diff --git a/datalab/mlalpha/_feature_slice_view.py b/datalab/mlalpha/_feature_slice_view.py
@@ -0,0 +1,87 @@
+# Copyright 2017 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+import json
+import pandas as pd
+from types import ModuleType
+
+import datalab.data
+import datalab.utils
+
+
+class FeatureSliceView(object):
+  """Represents a feature slice view."""
+
+  def _get_lantern_format(self, df):
+    """Converts a DataFrame into the list of records the feature slice view browser expects:
+          {"metricValues": {"count": 12, "accuracy": 1.0}, "feature": "species:Iris-setosa"}
+          {"metricValues": {"count": 11, "accuracy": 0.72}, "feature": "species:Iris-versicolor"}
+          ...
+    Raises:
+      ValueError: if "feature", "count" or a metrics column is missing, or df has no rows.
+    """
+    if ('count' not in df) or ('feature' not in df):
+      raise ValueError('No "count" or "feature" found in data.')
+    if len(df.columns) < 3:
+      raise ValueError('Need at least one metrics column.')
+    if len(df) == 0:
+      raise ValueError('Data is empty')
+
+    data = []
+    for _, row in df.iterrows():
+      metric_values = dict(row)
+      feature = metric_values.pop('feature')
+      data.append({'feature': feature, 'metricValues': metric_values})
+    return data
+
+  def plot(self, data):
+    """ Plots a feature slice view on given data.
+
+    Args:
+      data: Can be one of:
+            A string of sql query.
+            A sql query module defined by "%%sql --module module_name".
+            A pandas DataFrame.
+          Regardless of data type, it must include the following columns:
+            "feature": identifies a slice of features. For example: "petal_length:4.0-4.2".
+            "count": number of instances in that slice of features.
+          All other columns are viewed as metrics for its feature slice. At least one is required.
+    Raises:
+      TypeError if data is of an unsupported type; ValueError if it lacks columns or rows.
+    """
+    import IPython
+    if isinstance(data, (ModuleType, basestring)):
+      from datalab.bigquery import Query  # this module never imports datalab.bigquery itself
+      item, _ = datalab.data.SqlModule.get_sql_statement_with_environment(data, {})
+      df = Query(item).results().to_dataframe()
+      data = self._get_lantern_format(df)
+    elif isinstance(data, pd.core.frame.DataFrame):
+      data = self._get_lantern_format(data)
+    else:
+      raise TypeError('data needs to be a sql query, or a pandas DataFrame.')
+    HTML_TEMPLATE = """<link rel="import" href="/nbextensions/gcpdatalab/extern/lantern-browser.html" >
+        <lantern-browser id="{html_id}"></lantern-browser>
+        <script>
+        var browser = document.querySelector('#{html_id}');
+        browser.metrics = {metrics};
+        browser.data = {data};
+        browser.sourceType = 'colab';
+        browser.weightedExamplesColumn = 'count';
+        browser.calibrationPlotUriFn = function(s) {{ return '/' + s; }}
+        </script>"""
+    # Render the metric names and the records as list/dict literals for the <script> above.
+    metrics_str = str([str(name) for name in data[0]['metricValues']])
+    data_str = str([{str(k): json.dumps(v) for k, v in elem.items()} for elem in data])
+    html_id = 'l' + datalab.utils.commands.Html.next_id()
+    html = HTML_TEMPLATE.format(html_id=html_id, metrics=metrics_str, data=data_str)
+    IPython.display.display(IPython.display.HTML(html))
+
diff --git a/datalab/mlalpha/_job.py b/datalab/mlalpha/_job.py
@@ -12,10 +12,20 @@
 
 """Implements Cloud ML Operation wrapper."""
 
+
 import datalab.utils
 import datalab.context
 from googleapiclient import discovery
+import yaml
+
+# TODO(qimingj) Remove once the API is public since it will no longer be needed
+_CLOUDML_DISCOVERY_URL = 'https://storage.googleapis.com/cloud-ml/discovery/' \
+                         'ml_v1beta1_discovery.json'
 
+import datalab.utils
+import datalab.context
+from googleapiclient import discovery
+import yaml
 
 # TODO(qimingj) Remove once the API is public since it will no longer be needed
 _CLOUDML_DISCOVERY_URL = 'https://storage.googleapis.com/cloud-ml/discovery/' \
@@ -54,26 +64,26 @@ def refresh(self):
     """ Refresh the job info. """
     self._info = self._api.projects().jobs().get(name=self._name).execute()
 
+  def describe(self):
+    """Print the job info currently held in self._info as block-style YAML."""
+    print yaml.safe_dump(self._info, default_flow_style=False)
+
 
 class Jobs(object):
   """Represents a list of Cloud ML jobs for a project."""
 
-  def __init__(self, filter=None, context=None, api=None):
+  def __init__(self, filter=None):
     """Initializes an instance of a CloudML Job list that is iteratable ("for job in jobs()").
 
     Args:
-      filter: filter string for retrieving jobs. Currently only "done=true|false" is supported.
+      filter: filter string for retrieving jobs, such as "state=FAILED"
       context: an optional Context object providing project_id and credentials.
       api: an optional CloudML API client.
     """
     self._filter = filter
-    if context is None:
-      context = datalab.context.Context.default()
-    self._context = context
-    if api is None:
-      api = discovery.build('ml', 'v1beta1', credentials=self._context.credentials,
-                            discoveryServiceUrl=_CLOUDML_DISCOVERY_URL)
-    self._api = api
+    self._context = datalab.context.Context.default()
+    self._api = discovery.build('ml', 'v1beta1', credentials=self._context.credentials,
+                                discoveryServiceUrl=_CLOUDML_DISCOVERY_URL)
 
   def _retrieve_jobs(self, page_token, page_size):
     list_info = self._api.projects().jobs().list(parent='projects/' + self._context.project_id,
@@ -86,10 +96,10 @@ def _retrieve_jobs(self, page_token, page_size):
   def __iter__(self):
     return iter(datalab.utils.Iterator(self._retrieve_jobs))
 
-  def get_job_by_name(self, name):
-    """ get a CloudML job by its name.
-    Args:
-      name: the name of the job. See "Job" class constructor.
-    """
-    return Job(name, self._context, self._api)
-
+  def list(self, count=10):
+    """Display up to count jobs as a table of Id, State and createTime (needs IPython)."""
+    import IPython
+    rows = [{'Id': job['jobId'], 'State': job.get('state', 'UNKNOWN'),
+             'createTime': job['createTime']} for _, job in zip(range(count), self)]
+    IPython.display.display(
+        datalab.utils.commands.render_dictionary(rows, ['Id', 'State', 'createTime']))
diff --git a/datalab/mlalpha/commands/_mlalpha.py b/datalab/mlalpha/commands/_mlalpha.py
@@ -618,7 +618,7 @@ def _model(args, _):
     return
   elif len(parts) == 2:
     versions = datalab.mlalpha.CloudModelVersions(parts[0], project_id=args['project'])
-    version_yaml = yaml.safe_dump(versions.get(parts[1]))
+    version_yaml = yaml.safe_dump(versions.get_version_details(parts[1]))
     return datalab.utils.commands.render_text(version_yaml, preformatted=True)
   else:
     raise Exception('Too many "." in name. Use "model" or "model.version".')