Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/automerge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ name: Python application
on:
push:

branches: [ "main", "218-fix-padding-token-mismatch-and-logging-tokenization-metadata" ]
branches: [ "main", "237-copy-purge-model-storage-functionality-for-main" ]


permissions:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from multiprocessing import Process, Lock
import os
from gc import collect
from shutil import rmtree


# import optuna
Expand Down Expand Up @@ -565,18 +566,31 @@ def run_random_search(self):
print(f"metric_to_rank_by is: '{self.metric_to_rank_by}'")
print(
f"Type of metric_to_rank_by is: {str(type(self.metric_to_rank_by))}")
def has_valid_metric(num):
    """Return True when *num* can be converted to a float, else False.

    Used to tell real metric values apart from unusable cells (for example
    a repeated header string or a missing value) before ranking trials.

    Args:
        num: The candidate metric value; any object accepted by ``float()``
            counts as valid.

    Returns:
        bool: True if ``float(num)`` succeeds, False otherwise.
    """
    try:
        float(num)
    # Only the errors float() raises for unusable values are treated as
    # "invalid metric"; anything else is a genuine bug and must surface.
    except (TypeError, ValueError, OverflowError) as exc:
        print(exc)
        return False
    return True
# ~ pd.to_numeric(x['a'], errors="coerce").astype(float).isna()
# rows_having_a_valid_metric = oracles[self.metric_to_rank_by].apply(lambda x: has_valid_metric(x))
rows_having_a_valid_metric = ~ pd.to_numeric(oracles[self.metric_to_rank_by], errors="coerce").isna()
oracles_having_valid_metrics = oracles[rows_having_a_valid_metric]

if self.direction == "maximize" or self.direction == "max":

best = float(oracles[oracles[self.metric_to_rank_by]
!= self.metric_to_rank_by]
[self.metric_to_rank_by].astype(float).max())
best = float(oracles_having_valid_metrics[self.metric_to_rank_by].astype(float).max())
# best = float(oracles[oracles[self.metric_to_rank_by]
# != self.metric_to_rank_by]
# [self.metric_to_rank_by].astype(float).max())
else:
print(f"metric_to_rank_by is: '{self.metric_to_rank_by}'")
print(
f"Type of metric_to_rank_by is: {str(type(self.metric_to_rank_by))}")
best = float(oracles[oracles[self.metric_to_rank_by]
!= self.metric_to_rank_by]
[self.metric_to_rank_by].astype(float).min())
best = float(oracles_having_valid_metrics[self.metric_to_rank_by].astype(float).min())
# best = float(oracles[oracles[self.metric_to_rank_by]
# != self.metric_to_rank_by]
# [self.metric_to_rank_by].astype(float).min())
print(f"Best result this trial was: {best}")
print(f"Type of best result: {type(best)}")
self.best_model_path =\
Expand All @@ -585,8 +599,63 @@ def run_random_search(self):
print(f"Best model name: {self.best_model_path}")
return best

def get_best_model(self):
def purge_model_storage(self) -> None:
    """Delete every cached model stored for this project.

    Removes the whole ``<project_name>/models`` directory tree, the best
    model included. Recommended when running in a container without a
    mounted volume. It is recommended to store the best model in an
    artifact registry before calling this.

    The call is idempotent: when the cache directory does not exist
    (never created, or already purged) the method returns quietly.
    """
    model_cache_path = f"{self.project_name}/models"
    try:
        rmtree(model_cache_path)
    except FileNotFoundError:
        # Nothing left to purge; a repeated purge must not crash the run.
        pass


def purge_models_except_best_model(self) -> None:
    """Delete every cached model except the best one.

    Recommended when running in a container without a mounted volume and
    building models that take considerable time to reproduce. It is
    recommended to store the best model in an artifact registry, but this
    keeps a redundant local copy in case that upload is unsuccessful.

    Raises:
        ValueError: If ``self.best_model_path`` is not set, i.e. the method
            was called before a best model was determined.
    """
    best_model_path = getattr(self, "best_model_path", None)
    if not best_model_path:
        raise ValueError("The function purge_models_except_best_model was called prematurely: self.best_model_path is not set, maining there is no 'Best model'.")
    model_cache_path = f"{self.project_name}/models"
    # Compare normalized absolute paths so an equivalent spelling of the
    # best model path (relative vs. absolute, "./" segments, doubled
    # slashes) can never cause the best model itself to be deleted.
    best_model_abs_path = os.path.abspath(best_model_path)
    print("Files in model cache:")
    for file_name in os.listdir(model_cache_path):
        model_file_path = f"{model_cache_path}/{file_name}"
        print(f" {model_file_path}")
        if os.path.abspath(model_file_path) == best_model_abs_path:
            print(f"Not removing {model_file_path}")
            continue
        print(f"Removing: {model_file_path}")
        # Models saved in a directory-based format are folders, which
        # os.remove() cannot delete; symlinks are unlinked, not followed.
        if os.path.isdir(model_file_path) and not os.path.islink(model_file_path):
            rmtree(model_file_path)
        else:
            os.remove(model_file_path)


def get_best_model(self, purge_model_storage_files=0) -> tf.keras.Model:
    """Return the best model from this meta-trial.

    Optionally purges the cache of models stored on disk after the best
    model has been loaded into memory.

    Args:
        purge_model_storage_files: One of ``0``, ``1`` or ``"slate"``.
            - ``0``: Do not purge the cached models, just return the best
              model. Suitable with unlimited disk space, outside a
              container, or in one with a suitable mounted volume.
            - ``1``: Purge all models except the best model found. Use
              when models take considerable time to reproduce, or when a
              small performance difference between models rebuilt from
              the same parameters is problematic.
            - ``"slate"``: Remove all cached models, the best included.
              Use when models are quick to reproduce and an accidental
              model loss is not problematic as long as the parameters to
              reproduce it approximately are kept.
            Purging is recommended for ephemeral trials in a container
            without a mounted volume, or under disk space limitations.

    Returns:
        The model loaded from ``self.best_model_path``.

    Raises:
        ValueError: If ``purge_model_storage_files`` is not one of the
            three supported values.
    """
    # Fail fast: reject an unsupported option before paying for the model
    # load rather than after it.
    if purge_model_storage_files not in (0, 1, "slate"):
        raise ValueError("The paramerter purge_model_storage_files in the method get_best_model() has 3 values: 0 (Don't purge),1 (Purge all but the best model), 'slate' (remove all cached models) ")
    # Load first: the "slate" purge below deletes the best model's file.
    best_model = tf.keras.models.load_model(self.best_model_path)
    if purge_model_storage_files == 1:
        self.purge_models_except_best_model()
    elif purge_model_storage_files == "slate":
        self.purge_model_storage()
    return best_model

# ->
Expand Down
3 changes: 2 additions & 1 deletion phishing_email_detection_gpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,7 @@ def from_config(cls, config):




print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
print(f'Cerebros best accuracy achieved is {result}')
Expand All @@ -524,7 +525,7 @@ def from_config(cls, config):

MODEL_FILE_NAME = "cerebros-foundation-model.keras"

best_model_found = cerebros_automl.get_best_model()
best_model_found = cerebros_automl.get_best_model(purge_model_storage_files=1)
best_model_found.save(MODEL_FILE_NAME)
del(best_model_found)
del(cerebros_automl)
Expand Down
15 changes: 13 additions & 2 deletions regression-example-ames-no-preproc-val-set.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@
from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
from ast import literal_eval
from os.path import exists

NUMBER_OF_TRAILS_PER_BATCH = 2
NUMBER_OF_BATCHES_OF_TRIALS = 2

META_TRIAL_NUMBER = 1

###

LABEL_COLUMN = 'price'
Expand All @@ -24,6 +27,7 @@
.replace(':', '_')\
.replace('-', '_')
PROJECT_NAME = f'{TIME}_cerebros_auto_ml_test'
PROJECT_NAME = f"{PROJECT_NAME}-meta-{META_TRIAL_NUMBER}"

def hash_a_row(row):
"""casts a row of a Pandas DataFrame as a String, hashes it, and casts it
Expand Down Expand Up @@ -207,16 +211,23 @@ def hash_based_split(df, # Pandas dataframe
metrics=[tf.keras.metrics.RootMeanSquaredError()],
epochs=epochs,
patience=7,
project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
project_name=PROJECT_NAME,
# use_multiprocessing_for_multiple_neural_networks=False, # pull this param
model_graphs='model_graphs',
batch_size=batch_size,
meta_trial_number=meta_trial_number)
result = cerebros.run_random_search()

print("Best model: (May need to re-initialize weights, and retrain with early stopping callback)")
best_model_found = cerebros.get_best_model()
best_model_found = cerebros.get_best_model(purge_model_storage_files='slate')
print(best_model_found.summary())


# Check that get_best_model(purge_model_storage_files='slate') really removed
# the model cache: the directory must no longer exist at this point.
model_storage_path = f"{PROJECT_NAME}/models"
cache_still_present = exists(model_storage_path)
if cache_still_present:
    raise ValueError(f"Failed test: Parh {model_storage_path} should have beed deleted and was not.")


print("result extracted from cerebros")
print(f"Final result was (val_root_mean_squared_error): {result}")
20 changes: 16 additions & 4 deletions regression-example-ames-no-preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
from ast import literal_eval
from os import listdir
from os.path import exists


NUMBER_OF_TRAILS_PER_BATCH = 2
NUMBER_OF_BATCHES_OF_TRIALS = 2
Expand All @@ -20,15 +23,15 @@

## your data:

META_TRIAL_NUMBER = 1

TIME = pendulum.now().__str__()[:16]\
.replace('T', '_')\
.replace(':', '_')\
.replace('-', '_')
PROJECT_NAME = f'{TIME}_cerebros_auto_ml_test'

PROJECT_NAME = f"{TIME}_cerebros_auto_ml_test"
PROJECT_NAME = f"{PROJECT_NAME}_meta_{META_TRIAL_NUMBER}"

# white = pd.read_csv('wine_data.csv')

raw_data = pd.read_csv('ames.csv')
needed_cols = [
Expand Down Expand Up @@ -110,7 +113,7 @@
metrics=[tf.keras.metrics.RootMeanSquaredError()],
epochs=epochs,
patience=7,
project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
project_name=PROJECT_NAME,
# use_multiprocessing_for_multiple_neural_networks=False, # pull this param
model_graphs='model_graphs',
batch_size=batch_size,
Expand All @@ -121,5 +124,14 @@
best_model_found = cerebros.get_best_model()
print(best_model_found.summary())

# Validate that purge_model_storage is NOT active by default: after a plain
# get_best_model() call the model cache must still exist and be non-empty.
model_storage_path = f"{PROJECT_NAME}/models"
# Raise explicitly rather than `assert`: assertions are stripped under
# `python -O`, which would turn this check into a no-op.
if not exists(model_storage_path):
    raise ValueError(f"Failed test: {model_storage_path} was deleted and should not have been.")
num_items = len(listdir(model_storage_path))
print(f"There are {num_items} items in {model_storage_path}.")
if num_items <= 0:
    raise ValueError(f"Failed test: {model_storage_path} was deleted and should not have been.")

print("result extracted from cerebros")
print(f"Final result was (val_root_mean_squared_error): {result}")