forked from feast-dev/feast
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Persisting results of historical retrieval (feast-dev#2197)
* persisting results of historical retrieval Signed-off-by: pyalex <moskalenko.alexey@gmail.com> * fix after rebase Signed-off-by: pyalex <moskalenko.alexey@gmail.com>
- Loading branch information
Showing
25 changed files
with
1,262 additions
and
262 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,3 +14,4 @@ | |
|
||
{% page-ref page="point-in-time-joins.md" %} | ||
|
||
{% page-ref page="dataset.md" %} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Dataset | ||
|
||
Feast datasets allow for conveniently saving dataframes that include both features and entities, so that they can subsequently be used for data analysis and model training. | ||
[Data Quality Monitoring](https://docs.google.com/document/d/110F72d4NTv80p35wDSONxhhPBqWRwbZXG4f9mNEMd98) was the primary motivation for creating the dataset concept. | ||
|
||
A dataset's metadata is stored in the Feast registry, and its raw data (features, entities, additional input keys and timestamp) is stored in the [offline store](../architecture-and-components/offline-store.md). | ||
|
||
A dataset can be created from: | ||
1. Results of historical retrieval | ||
2. [planned] Logging request (including input for [on demand transformation](../../reference/alpha-on-demand-feature-view.md)) and response during feature serving | ||
3. [planned] Logging features during writing to online store (from batch source or stream) | ||
|
||
|
||
### Creating Saved Dataset from Historical Retrieval | ||
|
||
To create a saved dataset from historical features for later retrieval or analysis, a user needs to call the `get_historical_features` method first and then pass the returned retrieval job to the `create_saved_dataset` method. | ||
`create_saved_dataset` will trigger the provided retrieval job (by calling `.persist()` on it) to store the data using the specified `storage`. | ||
The storage type must be the same as the globally configured offline store (e.g., it's impossible to persist data to Redshift with a BigQuery source). | ||
`create_saved_dataset` will also create a `SavedDataset` object with all related metadata and write it to the registry. | ||
|
||
```python | ||
# FeatureStore is the entry point; SavedDatasetBigQueryStorage names the BigQuery table to write to.
from feast import FeatureStore | ||
from feast.infra.offline_stores.bigquery_source import SavedDatasetBigQueryStorage | ||
|
||
store = FeatureStore() | ||
|
||
# Step 1: build a retrieval job for the requested feature references.
historical_job = store.get_historical_features( | ||
features=["driver:avg_trip"], | ||
entity_df=..., | ||
) | ||
|
||
# Step 2: hand the retrieval job to create_saved_dataset, which persists its
# results to the given storage and writes the dataset's metadata to the registry.
dataset = store.create_saved_dataset( | ||
from_=historical_job, | ||
name='my_training_dataset', | ||
storage=SavedDatasetBigQueryStorage(table_ref='<gcp-project>.<gcp-dataset>.my_training_dataset'), | ||
tags={'author': 'oleksii'} | ||
) | ||
|
||
# Read the saved data back via to_df().
dataset.to_df() | ||
``` | ||
|
||
A saved dataset can later be retrieved using the `get_saved_dataset` method: | ||
```python | ||
# Look the dataset up by the name it was saved under, then read its data via to_df().
dataset = store.get_saved_dataset('my_training_dataset') | ||
dataset.to_df() | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
// | ||
// Copyright 2021 The Feast Authors | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
// | ||
|
||
|
||
// This schema uses proto3 syntax.
syntax = "proto3"; | ||
|
||
// All messages in this file live in the feast.core proto package.
package feast.core; | ||
// Java code is generated into package feast.proto.core under the outer class SavedDatasetProto.
option java_package = "feast.proto.core"; | ||
option java_outer_classname = "SavedDatasetProto"; | ||
// Import path of the generated Go package.
option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; | ||
|
||
import "google/protobuf/timestamp.proto"; | ||
import "feast/core/FeatureViewProjection.proto"; | ||
import "feast/core/DataSource.proto"; | ||
|
||
// Specification of a saved dataset: its name and project, the feature
// references and join keys it contains, its storage, and user-defined tags.
message SavedDatasetSpec { | ||
// Name of the dataset. Must be unique since it's possible to overwrite dataset by name | ||
string name = 1; | ||
|
||
// Name of Feast project that this Dataset belongs to. | ||
string project = 2; | ||
|
||
// list of feature references with format "<view name>:<feature name>" | ||
repeated string features = 3; | ||
|
||
// entity columns + request columns from all feature views used during retrieval | ||
repeated string join_keys = 4; | ||
|
||
// Whether full feature names are used in stored data | ||
bool full_feature_names = 5; | ||
|
||
// Storage used for this dataset's data; see SavedDatasetStorage for the supported kinds.
SavedDatasetStorage storage = 6; | ||
|
||
// User defined metadata | ||
map<string, string> tags = 7; | ||
} | ||
|
||
// Storage of a saved dataset. The options are members of a oneof, so at most
// one of them is set; each reuses an options message nested in DataSource.
// NOTE(review): field numbers start at 4 and 1-3 are unused in this message;
// confirm whether they are intentionally left free before assigning them.
message SavedDatasetStorage { | ||
oneof kind { | ||
// File-based storage options.
DataSource.FileOptions file_storage = 4; | ||
// BigQuery storage options.
DataSource.BigQueryOptions bigquery_storage = 5; | ||
// Redshift storage options.
DataSource.RedshiftOptions redshift_storage = 6; | ||
} | ||
} | ||
|
||
// Timestamps describing a saved dataset: when it was created and last
// updated, and the min/max event timestamps found in its data.
message SavedDatasetMeta { | ||
// Time when this saved dataset is created | ||
google.protobuf.Timestamp created_timestamp = 1; | ||
|
||
// Time when this saved dataset is last updated | ||
google.protobuf.Timestamp last_updated_timestamp = 2; | ||
|
||
// Min timestamp in the dataset (needed for retrieval) | ||
google.protobuf.Timestamp min_event_timestamp = 3; | ||
|
||
// Max timestamp in the dataset (needed for retrieval) | ||
google.protobuf.Timestamp max_event_timestamp = 4; | ||
} | ||
|
||
// A saved dataset: its specification paired with its metadata.
message SavedDataset { | ||
// Contents and storage of the dataset.
SavedDatasetSpec spec = 1; | ||
// Creation/update times and event-timestamp range of the dataset.
SavedDatasetMeta meta = 2; | ||
} | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.