Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: update titanic #278

Merged
merged 6 commits into from
Feb 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions examples/titanic_example/assets/dataset/titanic_opener.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@


class TitanicOpener(tools.Opener):

def get_data(self, folders):
# find csv files
paths = []
for folder in folders:
paths += [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".csv")]
paths = [
os.path.join(folder, f)
for folder in folders
for f in os.listdir(folder)
if f.endswith(".csv")
]

# load data
data = pd.DataFrame()
for path in paths:
data = pd.concat([data, pd.read_csv(path)])
data = pd.concat([pd.read_csv(path) for path in paths])

return data

Expand All @@ -39,4 +39,4 @@ def fake_data(self, n_samples=None):
"Cabin": ["".join(random.sample(string.ascii_letters, 3)) for k in range(N_SAMPLES)],
"Embarked": [random.choice(["C", "S", "Q"]) for k in range(N_SAMPLES)],
}
return pd.DataFrame(data)
return pd.DataFrame(data)
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
FROM ghcr.io/substra/substra-tools:0.20.0-nvidiacuda11.8.0-base-ubuntu22.04-python3.9

# install dependencies
RUN pip3 install pandas numpy 'scikit-learn==0.24.2' pillow scipy keras
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Surprising to see Keras here 😅

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same 😅
It's not exactly lightweight either ^^

RUN pip3 install pandas numpy 'scikit-learn==1.1.1'

# add your function script to docker image
ADD titanic_function_rf.py .
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

@tools.register
def train(inputs, outputs, task_properties):

X = inputs["datasamples"].drop(columns="Survived")
y = inputs["datasamples"].Survived
X = _normalize_X(X)
Expand Down Expand Up @@ -64,8 +63,7 @@ def save_model(model, path):


def save_predictions(y_pred, path):
    """Write *y_pred* (a pandas DataFrame/Series) to *path* as CSV, without the index.

    Args:
        y_pred: predictions to serialize; anything exposing ``to_csv``.
        path: destination file path.
    """
    # pandas accepts a path directly — no need to open a file handle ourselves,
    # and writing once avoids the duplicated write left over from the diff render
    y_pred.to_csv(path, index=False)


def _normalize_X(X):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
FROM ghcr.io/substra/substra-tools:0.20.0-nvidiacuda11.8.0-base-ubuntu22.04-python3.9

# install dependencies
RUN pip3 install pandas numpy 'scikit-learn==0.24.2' pillow scipy keras
RUN pip3 install pandas numpy 'scikit-learn==1.1.1'

# add your function script to docker image
ADD titanic_function_rf.py .
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

@tools.register
def score(inputs, outputs, task_properties):

y_true = inputs["datasamples"].Survived.values
y_pred = load_predictions(inputs["predictions"])

Expand Down
2 changes: 2 additions & 0 deletions examples/titanic_example/assets/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
matplotlib==3.6.3
scikit-learn==1.1.1
pandas==1.5.3
substra
substratools
27 changes: 12 additions & 15 deletions examples/titanic_example/run_titanic.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,12 @@

import substra
from substra.sdk.schemas import (
FunctionSpec,
FunctionInputSpec,
FunctionOutputSpec,
AssetKind,
DataSampleSpec,
DatasetSpec,
FunctionSpec,
FunctionInputSpec,
FunctionOutputSpec,
Permissions,
TaskSpec,
ComputeTaskOutputSpec,
Expand All @@ -61,7 +61,7 @@
# Instantiating the Substra Client
# ================================
#
# The client allows us to interact with the Substra platform. Setting the debug argument to ``True`` allows us to work locally by emulating a platform.
# The client allows us to interact with the Substra platform.
#
# By setting the argument ``backend_type`` to:
#
Expand Down Expand Up @@ -106,7 +106,7 @@
#
# A dataset represents the data in Substra. It is made up of an opener, which is a script used to load the
# data from files into memory. You can find more details about datasets
# in the `API reference <api_reference.html#sdk-reference>`_
# in the :ref:`API reference<documentation/api_reference:SDK Reference>`

dataset = DatasetSpec(
name="Titanic dataset - Org 1",
Expand Down Expand Up @@ -154,7 +154,6 @@
)
)

# %%
print(f"{len(test_data_sample_keys)} data samples were registered")


Expand Down Expand Up @@ -207,15 +206,13 @@
# %%
# Adding Function
# ===============
# A function specifies the method to train a model on a dataset or the method to aggregate models.
# A :ref:`documentation/concepts:Function` specifies the method to train a model on a dataset or the method to aggregate models.
# Concretely, a function corresponds to an archive (tar or zip file) containing:
#
# - One or more Python scripts that implement the function. Importantly, a train and a
# predict function have to be defined.
# - A Dockerfile on which the user can specify the required dependencies of the Python scripts.
# This dockerfile also specifies the method name to execute (either train or predict here).
# - One or more Python scripts that implement the function. It is required to define ``train`` and ``predict`` functions.
# - A Dockerfile in which the user can specify the required dependencies of the Python scripts.
# This Dockerfile also specifies the method name to execute (either ``train`` or ``predict`` here).

ALGO_KEYS_JSON_FILENAME = "function_random_forest_keys.json"

ALGO_TRAIN_DOCKERFILE_FILES = [
assets_directory / "function_random_forest/titanic_function_rf.py",
Expand Down Expand Up @@ -249,7 +246,7 @@
print(f"Train function key {train_function_key}")

# %%
# The predict function uses the Python file as the function used for training.
# The predict function uses the same Python file as the function used for training.
ALGO_PREDICT_DOCKERFILE_FILES = [
assets_directory / "function_random_forest/titanic_function_rf.py",
assets_directory / "function_random_forest/predict/Dockerfile",
Expand Down Expand Up @@ -282,7 +279,7 @@
print(f"Predict function key {predict_function_key}")

# %%
# The data, the function and the metric are now registered.
# The data, the functions and the metric are now registered.

# %%
# Registering tasks
Expand Down Expand Up @@ -312,7 +309,7 @@
#
# In deployed mode, the registered task is added to a queue and treated asynchronously: this means that the
# code that registers the tasks keeps executing. To wait for a task to be done, create a loop and get the task
# every n seconds until its status is done or failed.
# every ``n`` seconds until its status is done or failed.

model_input = [
InputRef(
Expand Down