ENH Add "dvs_to_predict" param to ModelPipeline.predict (#241)

Stephen Hoover · web-flow · commit 664f6b3efb1c · 2018-03-07T15:07:39.000-06:00
CivisML v2.2 will add the ability for users to subset model predictions, as a way to save time and space. Add the corresponding `dvs_to_predict` parameter to `ModelPipeline.predict`.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 - Executors in ``futures`` (and the joblib backend, which uses them) will now
   add "CIVIS_PARENT_JOB_ID" and "CIVIS_PARENT_RUN_ID" environment variables
   to the child jobs they create (#236)
+- Added a new parameter ``dvs_to_predict`` to ``civis.ml.ModelPipeline.predict``.
+  This allows users to select a subset of a model's outputs for scoring (#241).
 
 ### Changed
 - Moved "Optional Dependencies" doc section to top of ML docs, and
diff --git a/civis/ml/_model.py b/civis/ml/_model.py
@@ -738,7 +738,7 @@ def __init__(self, model, dependent_variable,
                  etl=None):
         self.model = model
         self._input_model = model  # In case we need to modify the input
-        if isinstance(dependent_variable, str):
+        if isinstance(dependent_variable, six.string_types):
             # Standardize the dependent variable as a list.
             dependent_variable = [dependent_variable]
         self.dependent_variable = dependent_variable
@@ -1133,7 +1133,8 @@ def predict(self, df=None, csv_path=None,
                 manifest=None, file_id=None, sql_where=None, sql_limit=None,
                 primary_key=SENTINEL, output_table=None, output_db=None,
                 if_exists='fail', n_jobs=None, polling_interval=None,
-                cpu=None, memory=None, disk_space=None):
+                cpu=None, memory=None, disk_space=None,
+                dvs_to_predict=None):
         """Make predictions on a trained model
 
         Provide input through one of
@@ -1219,6 +1220,15 @@ def predict(self, df=None, csv_path=None,
             RAM requested by the user for a single job.
         disk_space : float, optional
             disk space requested by the user for a single job.
+        dvs_to_predict : str or list of str, optional
+            If this is a multi-output model, you may list a subset of
+            dependent variables for which you wish to generate predictions.
+            This list must be a subset of the original `dependent_variable`
+            input. The scores for the returned subset will be identical to
+            the scores which those outputs would have had if all outputs
+            were written, but ignoring some of the model's outputs will
+            let predictions complete faster and use less disk space.
+            The default is to produce scores for all DVs.
 
         Returns
         -------
@@ -1265,6 +1275,12 @@ def predict(self, df=None, csv_path=None,
             predict_args['LIMITSQL'] = sql_limit
         if n_jobs:
             predict_args['N_JOBS'] = n_jobs
+        if dvs_to_predict:
+            if isinstance(dvs_to_predict, six.string_types):
+                dvs_to_predict = [dvs_to_predict]
+            if self.predict_template_id > 10583:
+                # This feature was added in v2.2; 10583 is the v2.1 template
+                predict_args['TARGET_COLUMN'] = ' '.join(dvs_to_predict)
         if self.predict_template_id >= 9969:
             if cpu:
                 predict_args['CPU'] = cpu