ENH Add "targets_to_predict" param to ModelPipeline.predict

Stephen Hoover · Stephen Hoover · commit 482fc9e50384 · 2018-03-07T12:35:35.000-06:00
CivisML v2.2 will add the ability for users to subset model predictions, as a way to save time and space. Add a `targets_to_predict` parameter to `ModelPipeline.predict` to expose this.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 - Executors in ``futures`` (and the joblib backend, which uses them) will now
   add "CIVIS_PARENT_JOB_ID" and "CIVIS_PARENT_RUN_ID" environment variables
   to the child jobs they create (#236)
+- Added a new parameter ``targets_to_predict`` to ``civis.ml.ModelPipeline.predict``.
+  This allows users to select a subset of a model's outputs for scoring (#241).
 
 ### Changed
 - Moved "Optional Dependencies" doc section to top of ML docs, and
diff --git a/civis/ml/_model.py b/civis/ml/_model.py
@@ -738,7 +738,7 @@ def __init__(self, model, dependent_variable,
                  etl=None):
         self.model = model
         self._input_model = model  # In case we need to modify the input
-        if isinstance(dependent_variable, str):
+        if isinstance(dependent_variable, six.string_types):
             # Standardize the dependent variable as a list.
             dependent_variable = [dependent_variable]
         self.dependent_variable = dependent_variable
@@ -1133,7 +1133,8 @@ def predict(self, df=None, csv_path=None,
                 manifest=None, file_id=None, sql_where=None, sql_limit=None,
                 primary_key=SENTINEL, output_table=None, output_db=None,
                 if_exists='fail', n_jobs=None, polling_interval=None,
-                cpu=None, memory=None, disk_space=None):
+                cpu=None, memory=None, disk_space=None,
+                targets_to_predict=None):
         """Make predictions on a trained model
 
         Provide input through one of
@@ -1219,6 +1220,13 @@ def predict(self, df=None, csv_path=None,
             RAM requested by the user for a single job.
         disk_space : float, optional
             disk space requested by the user for a single job.
+        targets_to_predict : str or list of str, optional
+            If this is a multi-output model, you may list a subset of
+            targets for which you wish to generate predictions.
+            This list must be a subset of the original `dependent_variable`
+            input. Ignoring some of the model's outputs will let predictions
+            complete faster and use less disk space. The default is to
+            produce scores for all targets.
 
         Returns
         -------
@@ -1265,6 +1273,12 @@ def predict(self, df=None, csv_path=None,
             predict_args['LIMITSQL'] = sql_limit
         if n_jobs:
             predict_args['N_JOBS'] = n_jobs
+        if targets_to_predict:
+            if isinstance(targets_to_predict, six.string_types):
+                targets_to_predict = [targets_to_predict]
+            if self.predict_template_id > 10600:
+                # This feature was added in v2.2.
+                predict_args['TARGET_COLUMN'] = ' '.join(targets_to_predict)
         if self.predict_template_id >= 9969:
             if cpu:
                 predict_args['CPU'] = cpu