chore: update titanic (#278)
* Remove unused dependencies in Dockerfile

Signed-off-by: SdgJlbl <sarah.diot-girard@owkin.com>

* Remove mention of a debug argument that is not present in the code

Signed-off-by: SdgJlbl <sarah.diot-girard@owkin.com>

* Add substra and substratools in requirements

Signed-off-by: SdgJlbl <sarah.diot-girard@owkin.com>

* Upgrade sklearn version pinned in Docker

Signed-off-by: SdgJlbl <sarah.diot-girard@owkin.com>

* Pythonicity improvements

Signed-off-by: SdgJlbl <sarah.diot-girard@owkin.com>

* Fix broken link + typos

Signed-off-by: SdgJlbl <sarah.diot-girard@owkin.com>

---------

Signed-off-by: SdgJlbl <sarah.diot-girard@owkin.com>
SdgJlbl authored Feb 16, 2023
1 parent dacbdc6 commit 3a62295
Showing 7 changed files with 25 additions and 29 deletions.
16 changes: 8 additions & 8 deletions examples/titanic_example/assets/dataset/titanic_opener.py
@@ -8,17 +8,17 @@


class TitanicOpener(tools.Opener):
-
    def get_data(self, folders):
        # find csv files
-        paths = []
-        for folder in folders:
-            paths += [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".csv")]
+        paths = [
+            os.path.join(folder, f)
+            for folder in folders
+            for f in os.listdir(folder)
+            if f.endswith(".csv")
+        ]

# load data
-        data = pd.DataFrame()
-        for path in paths:
-            data = pd.concat([data, pd.read_csv(path)])
+        data = pd.concat([pd.read_csv(path) for path in paths])

return data

@@ -39,4 +39,4 @@ def fake_data(self, n_samples=None):
"Cabin": ["".join(random.sample(string.ascii_letters, 3)) for k in range(N_SAMPLES)],
"Embarked": [random.choice(["C", "S", "Q"]) for k in range(N_SAMPLES)],
}
-        return pd.DataFrame(data)
\ No newline at end of file
+        return pd.DataFrame(data)
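
As a self-contained sketch of the refactored opener logic (function name and folder paths here are hypothetical, not part of the diff):

import os

import pandas as pd

def load_csv_folders(folders):
    # gather every CSV path across all folders in one comprehension
    paths = [
        os.path.join(folder, f)
        for folder in folders
        for f in os.listdir(folder)
        if f.endswith(".csv")
    ]
    # one pd.concat over all frames; concatenating inside a loop copies the
    # accumulated frame on every iteration, which is quadratic in total rows
    return pd.concat([pd.read_csv(path) for path in paths])

# hypothetical usage:
# data = load_csv_folders(["data/org1", "data/org2"])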
2 changes: 1 addition & 1 deletion examples/titanic_example/assets/function_random_forest/predict/Dockerfile
@@ -2,7 +2,7 @@
FROM ghcr.io/substra/substra-tools:0.20.0-nvidiacuda11.8.0-base-ubuntu22.04-python3.9

# install dependencies
-RUN pip3 install pandas numpy 'scikit-learn==0.24.2' pillow scipy keras
+RUN pip3 install pandas numpy 'scikit-learn==1.1.1'

# add your function script to docker image
ADD titanic_function_rf.py .
4 changes: 1 addition & 3 deletions examples/titanic_example/assets/function_random_forest/titanic_function_rf.py
@@ -9,7 +9,6 @@

@tools.register
def train(inputs, outputs, task_properties):
-
X = inputs["datasamples"].drop(columns="Survived")
y = inputs["datasamples"].Survived
X = _normalize_X(X)
@@ -64,8 +63,7 @@ def save_model(model, path):


def save_predictions(y_pred, path):
-    with open(path, "w") as f:
-        y_pred.to_csv(f, index=False)
+    y_pred.to_csv(path, index=False)


def _normalize_X(X):
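
The save_predictions simplification works because DataFrame.to_csv accepts a path directly and manages the file handle itself; a minimal sketch (file name hypothetical):

import pandas as pd

y_pred = pd.DataFrame({"Survived": [0, 1, 1]})
# pandas opens and closes the file when given a path string
y_pred.to_csv("predictions.csv", index=False)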
2 changes: 1 addition & 1 deletion examples/titanic_example/assets/function_random_forest/train/Dockerfile
@@ -2,7 +2,7 @@
FROM ghcr.io/substra/substra-tools:0.20.0-nvidiacuda11.8.0-base-ubuntu22.04-python3.9

# install dependencies
-RUN pip3 install pandas numpy 'scikit-learn==0.24.2' pillow scipy keras
+RUN pip3 install pandas numpy 'scikit-learn==1.1.1'

# add your function script to docker image
ADD titanic_function_rf.py .
1 change: 0 additions & 1 deletion examples/titanic_example/assets/metric/titanic_metrics.py
@@ -5,7 +5,6 @@

@tools.register
def score(inputs, outputs, task_properties):
-
y_true = inputs["datasamples"].Survived.values
y_pred = load_predictions(inputs["predictions"])

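
The body of score is hidden in this view; as a hedged, self-contained sketch, a classification metric over these two arrays could be as simple as scikit-learn's accuracy (the actual metric used is not visible in this diff):

from sklearn.metrics import accuracy_score

# dummy stand-ins for inputs["datasamples"].Survived.values and the loaded predictions
y_true = [1, 0, 1, 1]
y_pred = [1, 0, 0, 1]
print(accuracy_score(y_true, y_pred))  # 0.75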
2 changes: 2 additions & 0 deletions examples/titanic_example/assets/requirements.txt
@@ -1,3 +1,5 @@
matplotlib==3.6.3
scikit-learn==1.1.1
pandas==1.5.3
+substra
+substratools
27 changes: 12 additions & 15 deletions examples/titanic_example/run_titanic.py
@@ -45,12 +45,12 @@

import substra
from substra.sdk.schemas import (
-    FunctionSpec,
-    FunctionInputSpec,
-    FunctionOutputSpec,
    AssetKind,
    DataSampleSpec,
    DatasetSpec,
+    FunctionSpec,
+    FunctionInputSpec,
+    FunctionOutputSpec,
Permissions,
TaskSpec,
ComputeTaskOutputSpec,
@@ -61,7 +61,7 @@
# Instantiating the Substra Client
# ================================
#
-# The client allows us to interact with the Substra platform. Setting the debug argument to ``True`` allows us to work locally by emulating a platform.
+# The client allows us to interact with the Substra platform.
#
# By setting the argument ``backend_type`` to:
#
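
The list of ``backend_type`` values is truncated in this view; for orientation, a hedged sketch of instantiating the client (the values below are assumed from Substra's local and remote modes, not shown in this diff):

import substra

# "subprocess" and "docker" emulate the platform locally;
# "remote" targets a deployed backend (assumed values)
client = substra.Client(backend_type="subprocess")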
@@ -106,7 +106,7 @@
#
# A dataset represents the data in Substra. It is made up of an opener, which is a script used to load the
# data from files into memory. You can find more details about datasets
-# in the `API reference <api_reference.html#sdk-reference>`_
+# in the :ref:`API reference<documentation/api_reference:SDK Reference>`

dataset = DatasetSpec(
name="Titanic dataset - Org 1",
@@ -154,7 +154,6 @@
)
)

-# %%
print(f"{len(test_data_sample_keys)} data samples were registered")


@@ -207,15 +206,13 @@
# %%
# Adding Function
# ===============
-# A function specifies the method to train a model on a dataset or the method to aggregate models.
+# A :ref:`documentation/concepts:Function` specifies the method to train a model on a dataset or the method to aggregate models.
# Concretely, a function corresponds to an archive (tar or zip file) containing:
#
-# - One or more Python scripts that implement the function. Importantly, a train and a
-#   predict function have to be defined.
-# - A Dockerfile on which the user can specify the required dependencies of the Python scripts.
-#   This dockerfile also specifies the method name to execute (either train or predict here).
+# - One or more Python scripts that implement the function. It is required to define ``train`` and ``predict`` functions.
+# - A Dockerfile in which the user can specify the required dependencies of the Python scripts.
+#   This Dockerfile also specifies the method name to execute (either ``train`` or ``predict`` here).

-ALGO_KEYS_JSON_FILENAME = "function_random_forest_keys.json"
-
ALGO_TRAIN_DOCKERFILE_FILES = [
assets_directory / "function_random_forest/titanic_function_rf.py",
@@ -249,7 +246,7 @@
print(f"Train function key {train_function_key}")

# %%
-# The predict function uses the Python file as the function used for training.
+# The predict function uses the same Python file as the function used for training.
ALGO_PREDICT_DOCKERFILE_FILES = [
assets_directory / "function_random_forest/titanic_function_rf.py",
assets_directory / "function_random_forest/predict/Dockerfile",
@@ -282,7 +279,7 @@
print(f"Predict function key {predict_function_key}")

# %%
-# The data, the function and the metric are now registered.
+# The data, the functions and the metric are now registered.

# %%
# Registering tasks
@@ -312,7 +309,7 @@
#
# In deployed mode, the registered task is added to a queue and treated asynchronously: this means that the
# code that registers the tasks keeps executing. To wait for a task to be done, create a loop and get the task
-# every n seconds until its status is done or failed.
+# every ``n`` seconds until its status is done or failed.

model_input = [
InputRef(
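
The waiting pattern described in the comment above could look roughly like this sketch (the status values and the get_task signature are assumptions, not taken from this diff):

import time

def wait_for_task(client, task_key, poll_every=5):
    # re-fetch the task until it reaches a terminal status
    while True:
        task = client.get_task(task_key)
        if task.status in ("STATUS_DONE", "STATUS_FAILED"):  # assumed status names
            return task
        time.sleep(poll_every)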
