Input redesign update examples #162

Merged 4 commits on Jun 13, 2019
2 changes: 1 addition & 1 deletion examples/mnist/implementations/models/dnn.py
@@ -4,7 +4,7 @@
 def create_estimator(run_config, model_config):
     feature_columns = [
         tf.feature_column.numeric_column(
-            "image_pixels", shape=model_config["hparams"]["input_shape"]
+            model_config["input"], shape=model_config["hparams"]["input_shape"]
         )
     ]
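
For context, a rough sketch of the model_config dict this estimator now receives, assuming the @-references in models.yaml resolve to plain values at runtime (the exact runtime structure is an assumption, not shown in this PR):

# Hypothetical model_config; field values come from
# examples/mnist/resources/models.yaml later in this diff.
model_config = {
    "input": "image_pixels",  # resolved from `input: @image_pixels`
    "hparams": {
        "input_shape": [784],
        "output_shape": [10],
        "learning_rate": 0.01,
        "hidden_units": [100, 200],
    },
}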
6 changes: 2 additions & 4 deletions examples/mnist/implementations/transformers/decode_and_normalize.py
@@ -5,10 +5,8 @@
 import math


-def transform_python(sample, args):
-    image = sample["image"]
-
-    decoded = base64.b64decode(image)
+def transform_python(input):
+    decoded = base64.b64decode(input)
     decoded_image = np.asarray(Image.open(BytesIO(decoded)), dtype=np.uint8)

     # reimplementing tf.per_image_standardization
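
A hypothetical round-trip through the new single-argument transformer, assuming input is the base64-encoded image string carried by the @image column ("digit.png" is a made-up file name):

import base64

# Encode an image file the way the @image raw column would store it.
with open("digit.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode()

# The transformer now takes the column value directly, instead of a
# sample dict plus a separate args dict.
pixels = transform_python(encoded)  # decoded and standardized pixel values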
6 changes: 3 additions & 3 deletions examples/mnist/resources/apis.yaml
@@ -1,17 +1,17 @@
 - kind: api
   name: dnn-classifier
-  model_name: dnn
+  model: @dnn
   compute:
     replicas: 1

 - kind: api
   name: conv-classifier
-  model_name: conv
+  model: @conv
   compute:
     replicas: 1

 - kind: api
   name: t2t-classifier
-  model_name: t2t
+  model: @t2t
   compute:
     replicas: 1
25 changes: 25 additions & 0 deletions examples/mnist/resources/data.yaml
@@ -0,0 +1,25 @@
+- kind: environment
+  name: dev
+  data:
+    type: csv
+    path: s3a://cortex-examples/mnist.csv
+    csv_config:
+      header: true
+    schema: [@image, @label]
+
+- kind: raw_column
+  name: image
+  type: STRING_COLUMN
+  required: true
+
+- kind: raw_column
+  name: label
+  type: INT_COLUMN
+  required: true
+  min: 0
+  max: 9
+
+- kind: transformed_column
+  name: image_pixels
+  transformer_path: implementations/transformers/decode_and_normalize.py
+  input: @image
10 changes: 0 additions & 10 deletions examples/mnist/resources/environments.yaml

This file was deleted.

27 changes: 10 additions & 17 deletions examples/mnist/resources/models.yaml
@@ -1,26 +1,21 @@
 - kind: model
   name: dnn
-  path: implementations/models/dnn.py
-  type: classification
-  target_column: label
-  feature_columns:
-    - image_pixels
+  estimator_path: implementations/models/dnn.py
+  target_column: @label
+  input: @image_pixels
   hparams:
-    learning_rate: 0.01
     input_shape: [784]
     output_shape: [10]
+    learning_rate: 0.01
     hidden_units: [100, 200]
   data_partition_ratio:
     training: 0.7
     evaluation: 0.3

 - kind: model
   name: conv
-  path: implementations/models/custom.py
-  type: classification
-  target_column: label
-  feature_columns:
-    - image_pixels
+  estimator_path: implementations/models/custom.py
+  target_column: @label
+  input: @image_pixels
   hparams:
     layer_type: conv
     learning_rate: 0.01
@@ -38,11 +33,9 @@

 - kind: model
   name: t2t
-  path: implementations/models/t2t.py
-  type: classification
-  target_column: label
-  feature_columns:
-    - image_pixels
+  estimator_path: implementations/models/t2t.py
+  target_column: @label
+  input: @image_pixels
   prediction_key: outputs
   hparams:
     input_shape: [28, 28, 1]
11 changes: 0 additions & 11 deletions examples/mnist/resources/raw_columns.yaml

This file was deleted.

6 changes: 0 additions & 6 deletions examples/mnist/resources/transformed_columns.yaml

This file was deleted.

22 changes: 7 additions & 15 deletions examples/movie-ratings/implementations/models/basic_embedding.py
@@ -2,26 +2,18 @@


 def create_estimator(run_config, model_config):
-    user_id_index = model_config["aggregates"]["user_id_index"]
-    movie_id_index = model_config["aggregates"]["movie_id_index"]
-
-    feature_columns = [
-        tf.feature_column.embedding_column(
-            tf.feature_column.categorical_column_with_identity(
-                "user_id_indexed", len(user_id_index)
-            ),
-            model_config["hparams"]["embedding_size"],
-        ),
-        tf.feature_column.embedding_column(
+    embedding_feature_columns = []
+    for feature_col_data in model_config["input"]["embedding_columns"]:
+        embedding_col = tf.feature_column.embedding_column(
             tf.feature_column.categorical_column_with_identity(
-                "movie_id_indexed", len(movie_id_index)
+                feature_col_data["col"], len(feature_col_data["vocab"]["index"])
             ),
             model_config["hparams"]["embedding_size"],
-        ),
-    ]
+        )
+        embedding_feature_columns.append(embedding_col)

     return tf.estimator.DNNRegressor(
-        feature_columns=feature_columns,
+        feature_columns=embedding_feature_columns,
         hidden_units=model_config["hparams"]["hidden_units"],
         config=run_config,
     )
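
For reference, a sketch of the model_config["input"] structure this loop expects, inferred from the "col" and "vocab"/"index" keys read above; the concrete index values are made up:

# Hypothetical resolved input, matching `input.embedding_columns` in
# examples/movie-ratings/resources/models.yaml later in this diff.
model_config = {
    "input": {
        "embedding_columns": [
            {"col": "user_id_indexed", "vocab": {"index": ["u1", "u2", "u3"]}},
            {"col": "movie_id_indexed", "vocab": {"index": ["m1", "m2"]}},
        ]
    },
    "hparams": {"embedding_size": 20, "hidden_units": [10, 10]},
}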
2 changes: 1 addition & 1 deletion examples/movie-ratings/resources/apis.yaml
@@ -1,5 +1,5 @@
 - kind: api
   name: ratings
-  model_name: basic_embedding
+  model: @basic_embedding
   compute:
     replicas: 1
44 changes: 44 additions & 0 deletions examples/movie-ratings/resources/data.yaml
@@ -0,0 +1,44 @@
+- kind: environment
+  name: dev
+  data:
+    type: csv
+    path: s3a://cortex-examples/movie-ratings.csv
+    csv_config:
+      header: true
+    schema: [@user_id, @movie_id, @rating, @timestamp]
+
+- kind: raw_column
+  name: user_id
+  type: STRING_COLUMN
+
+- kind: raw_column
+  name: movie_id
+  type: STRING_COLUMN
+
+- kind: raw_column
+  name: rating
+  type: FLOAT_COLUMN
+
+- kind: aggregate
+  name: user_id_index
+  aggregator: cortex.index_string
+  input: @user_id
+
+- kind: transformed_column
+  name: user_id_indexed
+  transformer: cortex.index_string
+  input:
+    col: @user_id
+    indexes: @user_id_index
+
+- kind: aggregate
+  name: movie_id_index
+  aggregator: cortex.index_string
+  input: @movie_id
+
+- kind: transformed_column
+  name: movie_id_indexed
+  transformer: cortex.index_string
+  input:
+    col: @movie_id
+    indexes: @movie_id_index
20 changes: 0 additions & 20 deletions examples/movie-ratings/resources/environments.yaml

This file was deleted.

16 changes: 10 additions & 6 deletions examples/movie-ratings/resources/models.yaml
@@ -1,12 +1,16 @@
 - kind: model
   name: basic_embedding
-  type: regression
-  target_column: rating
-  feature_columns: [user_id_indexed, movie_id_indexed]
-  aggregates: [user_id_index, movie_id_index]
+  estimator_path: implementations/models/basic_embedding.py
+  target_column: @rating
+  input:
+    embedding_columns:
+      - col: @user_id_indexed
+        vocab: @user_id_index
+      - col: @movie_id_indexed
+        vocab: @movie_id_index
   hparams:
-    embedding_size: 10
-    hidden_units: [128]
+    embedding_size: 20
+    hidden_units: [10, 10]
   data_partition_ratio:
     training: 0.8
     evaluation: 0.2
31 changes: 0 additions & 31 deletions examples/movie-ratings/resources/transformed_columns.yaml

This file was deleted.

4 changes: 2 additions & 2 deletions examples/reviews/implementations/aggregators/max_length.py
@@ -1,9 +1,9 @@
-def aggregate_spark(data, columns, args):
+def aggregate_spark(data, input):
     from pyspark.ml.feature import RegexTokenizer
     import pyspark.sql.functions as F
     from pyspark.sql.types import IntegerType

-    regexTokenizer = RegexTokenizer(inputCol=columns["col"], outputCol="token_list", pattern="\\W")
+    regexTokenizer = RegexTokenizer(inputCol=input, outputCol="token_list", pattern="\\W")
     regexTokenized = regexTokenizer.transform(data)

     max_review_length_row = (
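
A hypothetical local check of the new two-argument signature; the "review" column name and sample rows are assumptions, not taken from this diff:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
data = spark.createDataFrame([("this movie was great",), ("not my thing",)], ["review"])

# `input` is now the input column name itself, not a columns dict.
result = aggregate_spark(data, "review")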
6 changes: 3 additions & 3 deletions examples/reviews/implementations/aggregators/vocab.py
@@ -1,16 +1,16 @@
-def aggregate_spark(data, columns, args):
+def aggregate_spark(data, input):
     import pyspark.sql.functions as F
     from pyspark.ml.feature import RegexTokenizer

-    regexTokenizer = RegexTokenizer(inputCol=columns["col"], outputCol="token_list", pattern="\\W")
+    regexTokenizer = RegexTokenizer(inputCol=input["col"], outputCol="token_list", pattern="\\W")
     regexTokenized = regexTokenizer.transform(data)

     vocab_rows = (
         regexTokenized.select(F.explode(F.col("token_list")).alias("word"))
         .groupBy("word")
         .count()
         .orderBy(F.col("count").desc())
-        .limit(args["vocab_size"])
+        .limit(input["vocab_size"])
         .select("word")
         .collect()
     )
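
Here the column name and the former args travel together in one input dict; a hypothetical call (the "col" and "vocab_size" keys are the ones the code reads, but the values are made up):

vocab = aggregate_spark(data, {"col": "review", "vocab_size": 10000})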
2 changes: 1 addition & 1 deletion examples/reviews/implementations/models/sentiment_dnn.py
@@ -4,7 +4,7 @@

 def create_estimator(run_config, model_config):
     hparams = model_config["hparams"]
-    vocab_size = len(model_config["aggregates"]["reviews_vocab"])
+    vocab_size = len(model_config["input"]["vocab"])

     def model_fn(features, labels, mode, params):
         embedding_input = features["embedding_input"]
@@ -2,7 +2,7 @@


 def create_estimator(run_config, model_config):
-    vocab_size = len(model_config["aggregates"]["reviews_vocab"])
+    vocab_size = len(model_config["input"]["vocab"])
     feature_column = tf.feature_column.categorical_column_with_identity(
         "embedding_input", vocab_size
     )
4 changes: 2 additions & 2 deletions examples/reviews/implementations/models/transformer.py
@@ -13,7 +13,7 @@ def create_estimator(run_config, model_config):
     hparams = trainer_lib.create_hparams("transformer_base_single_gpu")

     # SentimentIMDBCortex subclasses SentimentIMDB
-    problem = SentimentIMDBCortex(list(model_config["aggregates"]["reviews_vocab"]))
+    problem = SentimentIMDBCortex(list(model_config["input"]["vocab"]))
     hparams.problem = problem
     hparams.problem_hparams = problem.get_hparams(hparams)

@@ -39,7 +39,7 @@ def create_estimator(run_config, model_config):


 def transform_tensorflow(features, labels, model_config):
-    max_length = model_config["aggregates"]["max_review_length"]
+    max_length = model_config["input"]["max_review_length"]

     features["inputs"] = tf.expand_dims(tf.reshape(features["embedding_input"], [max_length]), -1)
     features["targets"] = tf.expand_dims(tf.expand_dims(labels, -1), -1)
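
Putting the reviews changes together, a sketch of the model_config["input"] these models now read; the keys match the code above, while the concrete values are placeholders:

# Hypothetical resolved input for the reviews models; the vocab and
# max_review_length presumably come from the vocab.py and max_length.py
# aggregators shown above.
model_config = {
    "input": {
        "vocab": {"the": 0, "movie": 1, "great": 2},  # placeholder vocab
        "max_review_length": 200,                     # placeholder length
    }
}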