@@ -34,35 +34,26 @@ To track changes and version everything about data Cascade has Datasets - specia
34
34
that encapsulate changes that are done during preprocessing.
35
35
36
36
``` python
37
+ from pprint import pprint
37
38
from cascade import data as cdd
38
-
39
39
from sklearn.datasets import load_digits
40
40
import numpy as np
41
41
42
42
43
- # Load dataset
44
43
X, y = load_digits(return_X_y = True )
45
44
pairs = [(x, y) for (x, y) in zip (X, y)]
46
45
47
- # To track all preparation stages we wrap cdd.Dataset over
48
- # collection of items and targets
46
+ # To track all preparation stages we wrap cdd.Dataset
49
47
ds = cdd.Wrapper(pairs)
50
48
51
- # Let's make a pipeline - shuffle the dataset
49
+ # This creates a pipeline
52
50
ds = cdd.RandomSampler(ds)
53
-
54
- # Splitting the data is also tracked in pipeline's metadata
55
51
train_ds, test_ds = cdd.split(ds)
56
-
57
- # Add small noise to images
58
52
train_ds = cdd.ApplyModifier(
59
53
train_ds,
60
54
lambda pair : pair + np.random.random() * 0.1 - 0.05
61
55
)
62
56
63
- # Let's see the metadata we got
64
- from pprint import pprint
65
-
66
57
pprint(train_ds.get_meta())
67
58
```
68
59
@@ -103,72 +94,36 @@ See all use-cases in [documentation](https://oxid15.github.io/cascade/quickstart
103
94
Not only data and pipelines change over time. Models change more frequently and require a special system to handle experiments and artifacts.
104
95
105
96
``` python
97
+ import random
106
98
from cascade import models as cdm
107
99
from cascade import data as cdd
108
100
109
- from sklearn.datasets import load_breast_cancer
110
- from sklearn.model_selection import train_test_split
111
- from sklearn.metrics import accuracy_score, f1_score
112
-
113
-
114
- X, y = load_breast_cancer(return_X_y = True )
115
- X_train, X_test, y_train, y_test = train_test_split(X, y)
116
-
117
- # Define the simple model that using
118
- # basic methods from cdm.BasicModel
119
- class BaselineModel (cdm .BasicModel ):
120
- def __init__ (self , const = 0 , * args , ** kwargs ) -> None :
121
- self .const = const
122
- super ().__init__ (const = const, * args, ** kwargs)
123
-
124
- def predict (self , x , * args , ** kwargs ):
125
- return [self .const for _ in range (len (x))]
126
-
127
- # Models define the way whey are trained loaded and saved
128
- # we don't use these here, but they exist
129
- def fit (self , * args , ** kwargs ):
130
- pass
131
-
132
- def save (self , path ):
133
- pass
101
+ model = cdm.Model()
102
+ model.metrics.update({
103
+ ' acc' : random.random()
104
+ })
134
105
135
-
136
- model = BaselineModel(1 )
137
-
138
- # Fit and evaluate do not return anything
139
- model.fit(X_train, y_train)
140
- model.evaluate(X_test, y_test, {' acc' : accuracy_score, ' f1' : f1_score})
141
-
142
- # Model repository is the solution for experiment and artifact storage
106
+ # Repo is the collection of model lines
143
107
repo = cdm.ModelRepo(' repos/use_case_repo' )
144
108
145
- # Repo is the collection of model lines
146
109
# Line can be a bunch of experiments on one model type
147
110
line = repo.add_line(' baseline' )
148
-
149
- # We save the model - everything is held automatically
150
111
line.save(model, only_meta = True )
151
-
152
- from pprint import pprint
153
- pprint(model.get_meta())
154
112
```
155
113
156
114
Let's see what is saved as meta data of this experiment.
157
115
158
116
``` json
159
117
[
160
118
{
161
- "name" : " <__main__.BaselineModel object at 0x000001F69F493820> " ,
162
- "created_at" : " 2023-01-02T16:36:59.041979 +00:00" ,
119
+ "name" : " cascade.models.model.Model " ,
120
+ "created_at" : " 2023-05-29T21:06:23.341752 +00:00" ,
163
121
"metrics" : {
164
- "acc" : 0.6293706293706294 ,
165
- "f1" : 0.7725321888412017
166
- },
167
- "params" : {
168
- "const" : 1
122
+ "acc" : 0.6745652975946803
169
123
},
124
+ "params" : {},
170
125
"type" : " model" ,
171
- "saved_at" : " 2023-01-02T16:36:59.103781 +00:00"
126
+ "saved_at" : " 2023-05-29T21:06:25.977728 +00:00"
172
127
}
173
128
]
174
129
```
@@ -189,24 +144,20 @@ from sklearn.datasets import load_digits
189
144
import numpy as np
190
145
191
146
192
- # Load data
193
147
X, y = load_digits(return_X_y = True )
194
148
pairs = [(x, y) for (x, y) in zip (X, y)]
195
149
196
- # Let's define a pipeline
197
150
ds = cdd.Wrapper(pairs)
198
151
ds = cdd.RandomSampler(ds)
199
152
train_ds, test_ds = cdd.split(ds)
200
153
201
- # Validate using this tool
202
154
cme.PredicateValidator(
203
155
train_ds,
204
156
[
205
157
lambda pair : all (pair[0 ] < 20 ),
206
158
lambda pair : pair[1 ] in (i for i in range (10 ))
207
159
]
208
160
)
209
-
210
161
```
211
162
212
163
See all use-cases in [ documentation] ( https://oxid15.github.io/cascade/quickstart.html )
@@ -221,7 +172,6 @@ metrics of all models in repository.
221
172
from cascade import meta as cme
222
173
from cascade import models as cdm
223
174
224
- # Open the existing repo
225
175
repo = cdm.ModelRepo(' repos/use_case_repo' )
226
176
227
177
# This runs web-server that relies on optional dependency
@@ -244,7 +194,6 @@ cme.HistoryViewer(repo).plot()
244
194
245
195
# This runs a server and allows seeing changes in real time (for example while models are trained)
246
196
cme.HistoryViewer(repo).serve()
247
-
248
197
```
249
198
250
199
See all use-cases in [ documentation] ( https://oxid15.github.io/cascade/quickstart.html )
0 commit comments