Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: update titanic #278

Merged
merged 6 commits into from
Feb 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions examples/titanic_example/assets/dataset/titanic_opener.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@


class TitanicOpener(tools.Opener):

def get_data(self, folders):
# find csv files
paths = []
for folder in folders:
paths += [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".csv")]
paths = [
os.path.join(folder, f)
for folder in folders
for f in os.listdir(folder)
if f.endswith(".csv")
]

# load data
data = pd.DataFrame()
for path in paths:
data = pd.concat([data, pd.read_csv(path)])
data = pd.concat([pd.read_csv(path) for path in paths])

return data

Expand All @@ -39,4 +39,4 @@ def fake_data(self, n_samples=None):
"Cabin": ["".join(random.sample(string.ascii_letters, 3)) for k in range(N_SAMPLES)],
"Embarked": [random.choice(["C", "S", "Q"]) for k in range(N_SAMPLES)],
}
return pd.DataFrame(data)
return pd.DataFrame(data)
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
FROM ghcr.io/substra/substra-tools:0.20.0-nvidiacuda11.8.0-base-ubuntu22.04-python3.9

# install dependencies
RUN pip3 install pandas numpy 'scikit-learn==0.24.2' pillow scipy keras
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Surprising to see Keras here 😅

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same 😅
It's not exactly lightweight either ^^

RUN pip3 install pandas numpy 'scikit-learn==1.1.1'

# add your function script to docker image
ADD titanic_function_rf.py .
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

@tools.register
def train(inputs, outputs, task_properties):

X = inputs["datasamples"].drop(columns="Survived")
y = inputs["datasamples"].Survived
X = _normalize_X(X)
Expand Down Expand Up @@ -64,8 +63,7 @@ def save_model(model, path):


def save_predictions(y_pred, path):
    """Write *y_pred* (a pandas DataFrame/Series) to *path* as CSV, without the index.

    Args:
        y_pred: predictions to serialize; anything exposing ``to_csv``.
        path: destination file path.
    """
    # pandas accepts a path directly — no need to open a file handle ourselves,
    # and writing once avoids the duplicated write left over from the diff render
    y_pred.to_csv(path, index=False)


def _normalize_X(X):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
FROM ghcr.io/substra/substra-tools:0.20.0-nvidiacuda11.8.0-base-ubuntu22.04-python3.9

# install dependencies
RUN pip3 install pandas numpy 'scikit-learn==0.24.2' pillow scipy keras
RUN pip3 install pandas numpy 'scikit-learn==1.1.1'

# add your function script to docker image
ADD titanic_function_rf.py .
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

@tools.register
def score(inputs, outputs, task_properties):

y_true = inputs["datasamples"].Survived.values
y_pred = load_predictions(inputs["predictions"])

Expand Down
2 changes: 2 additions & 0 deletions examples/titanic_example/assets/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
matplotlib==3.6.3
scikit-learn==1.1.1
pandas==1.5.3
substra
substratools
27 changes: 12 additions & 15 deletions examples/titanic_example/run_titanic.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,12 @@

import substra
from substra.sdk.schemas import (
FunctionSpec,
FunctionInputSpec,
FunctionOutputSpec,
AssetKind,
DataSampleSpec,
DatasetSpec,
FunctionSpec,
FunctionInputSpec,
FunctionOutputSpec,
Permissions,
TaskSpec,
ComputeTaskOutputSpec,
Expand All @@ -61,7 +61,7 @@
# Instantiating the Substra Client
# ================================
#
# The client allows us to interact with the Substra platform. Setting the debug argument to ``True`` allows us to work locally by emulating a platform.
# The client allows us to interact with the Substra platform.
#
# By setting the argument ``backend_type`` to:
#
Expand Down Expand Up @@ -106,7 +106,7 @@
#
# A dataset represents the data in Substra. It is made up of an opener, which is a script used to load the
# data from files into memory. You can find more details about datasets
# in the `API reference <api_reference.html#sdk-reference>`_
# in the :ref:`API reference<documentation/api_reference:SDK Reference>`

dataset = DatasetSpec(
name="Titanic dataset - Org 1",
Expand Down Expand Up @@ -154,7 +154,6 @@
)
)

# %%
print(f"{len(test_data_sample_keys)} data samples were registered")


Expand Down Expand Up @@ -207,15 +206,13 @@
# %%
# Adding Function
# ===============
# A function specifies the method to train a model on a dataset or the method to aggregate models.
# A :ref:`documentation/concepts:Function` specifies the method to train a model on a dataset or the method to aggregate models.
# Concretely, a function corresponds to an archive (tar or zip file) containing:
#
# - One or more Python scripts that implement the function. Importantly, a train and a
# predict function have to be defined.
# - A Dockerfile on which the user can specify the required dependencies of the Python scripts.
# This dockerfile also specifies the method name to execute (either train or predict here).
# - One or more Python scripts that implement the function. It is required to define ``train`` and ``predict`` functions.
# - A Dockerfile in which the user can specify the required dependencies of the Python scripts.
# This Dockerfile also specifies the method name to execute (either ``train`` or ``predict`` here).

ALGO_KEYS_JSON_FILENAME = "function_random_forest_keys.json"

ALGO_TRAIN_DOCKERFILE_FILES = [
assets_directory / "function_random_forest/titanic_function_rf.py",
Expand Down Expand Up @@ -249,7 +246,7 @@
print(f"Train function key {train_function_key}")

# %%
# The predict function uses the Python file as the function used for training.
# The predict function uses the same Python file as the function used for training.
ALGO_PREDICT_DOCKERFILE_FILES = [
assets_directory / "function_random_forest/titanic_function_rf.py",
assets_directory / "function_random_forest/predict/Dockerfile",
Expand Down Expand Up @@ -282,7 +279,7 @@
print(f"Predict function key {predict_function_key}")

# %%
# The data, the function and the metric are now registered.
# The data, the functions and the metric are now registered.

# %%
# Registering tasks
Expand Down Expand Up @@ -312,7 +309,7 @@
#
# In deployed mode, the registered task is added to a queue and treated asynchronously: this means that the
# code that registers the tasks keeps executing. To wait for a task to be done, create a loop and get the task
# every n seconds until its status is done or failed.
# every ``n`` seconds until its status is done or failed.

model_input = [
InputRef(
Expand Down