feat: CLI interface for validation of logged features #2718

Merged (9 commits) on May 20, 2022
CLI test
Signed-off-by: Oleksii Moskalenko <moskalenko.alexey@gmail.com>
pyalex committed May 19, 2022
commit 0e29fa31ac1478735da693d5336dd79253ddd94d
12 changes: 5 additions & 7 deletions sdk/python/feast/cli.py
@@ -759,14 +759,15 @@ def disable_alpha_features(ctx: click.Context):
store.config.write_to_path(Path(repo_path))


@cli.command("validate")
@click.option(
"--feature_service", "-f", help="Specify a feature service name",
"--feature-service", "-f", help="Specify a feature service name",
)
@click.option(
"--reference", "-r", help="Specify a validation reference name",
)
@click.option(
"--store-profile-cache", is_flag=True, help="Store cached profile in registry",
"--no-profile-cache", is_flag=True, help="Do not store cached profile in registry",
)
@click.argument("start_ts")
@click.argument("end_ts")
@@ -777,7 +778,7 @@ def validate(
reference: str,
start_ts: str,
end_ts: str,
store_profile_cache,
no_profile_cache,
):
"""
Perform validation of logged features (produced by a given feature service) against provided reference.
@@ -797,14 +798,11 @@
start=datetime.fromisoformat(start_ts),
end=datetime.fromisoformat(end_ts),
throw_exception=False,
cache_profile=not no_profile_cache,
)

if not result:
print(f"{Style.BRIGHT + Fore.GREEN}Validation successful!{Style.RESET_ALL}")

if store_profile_cache:
store.apply(reference)

return

errors = [e.to_dict() for e in result.report.errors]
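With the renamed options in place, the command can be invoked roughly as follows. This is a hedged sketch, not part of the diff: the `feast` console script, the service and reference names, and the timestamps are placeholder assumptions.

```python
# Sketch of calling the new `feast validate` command; names and timestamps are placeholders.
import subprocess
from datetime import datetime, timedelta

end_ts = datetime.utcnow()
start_ts = end_ts - timedelta(days=7)

subprocess.run(
    [
        "feast", "validate",
        "--feature-service", "test_service",  # renamed from --feature_service in this commit
        "--reference", "test_reference",
        "--no-profile-cache",                 # replaces --store-profile-cache; caching the profile is now the default
        start_ts.isoformat(),                 # START_TS, parsed with datetime.fromisoformat
        end_ts.isoformat(),                   # END_TS
    ],
    check=True,  # raises CalledProcessError on exit code 1 (failed validation)
)
```

Dropping `--no-profile-cache` keeps the default behavior, where the computed profile is written back to the registry after a successful run.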
4 changes: 4 additions & 0 deletions sdk/python/feast/dqm/profilers/profiler.py
@@ -69,6 +69,7 @@ class ValidationError:

missing_count: Optional[int]
missing_percent: Optional[float]
observed_value: Optional[float]

def __init__(
self,
@@ -77,12 +78,14 @@ def __init__(
check_config: Optional[Any] = None,
missing_count: Optional[int] = None,
missing_percent: Optional[float] = None,
observed_value: Optional[float] = None,
):
self.check_name = check_name
self.column_name = column_name
self.check_config = check_config
self.missing_count = missing_count
self.missing_percent = missing_percent
self.observed_value = observed_value

def __repr__(self):
return f"<ValidationError {self.check_name}:{self.column_name}>"
@@ -94,4 +97,5 @@ def to_dict(self):
check_config=self.check_config,
missing_count=self.missing_count,
missing_percent=self.missing_percent,
observed_value=self.observed_value,
)
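For illustration only, a minimal sketch of how the extended `ValidationError` might be constructed and serialized; the concrete values and check name below are made up, not taken from the diff.

```python
# Hypothetical use of the extended ValidationError; field values are illustrative.
from feast.dqm.profilers.profiler import ValidationError

err = ValidationError(
    check_name="expect_column_values_to_be_between",
    column_name="avg_passenger_count",
    missing_count=0,
    missing_percent=0.0,
    observed_value=7.5,  # new field added in this commit
)
print(err.to_dict())  # the serialized dict now carries observed_value as well
```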
12 changes: 11 additions & 1 deletion sdk/python/feast/feature_store.py
@@ -2054,6 +2054,7 @@ def serve_transformations(self, port: int) -> None:
def _teardown_go_server(self):
self._go_server = None

@log_exceptions_and_usage
def write_logged_features(
self, logs: Union[pa.Table, Path], source: Union[FeatureService]
):
@@ -2081,13 +2082,15 @@ def write_logged_features(
registry=self._registry,
)

@log_exceptions_and_usage
def validate_logged_features(
self,
source: Union[FeatureService],
start: datetime,
end: datetime,
reference: ValidationReference,
throw_exception: bool = True,
cache_profile: bool = True,
) -> Optional[ValidationFailed]:
"""
Load logged features from an offline store and validate them against provided validation reference.
@@ -2098,6 +2101,7 @@
end: upper bound for loading logged features
reference: validation reference
throw_exception: throw exception or return it as a result
cache_profile: store cached profile in Feast registry

Returns:
Throw or return (depends on parameter) ValidationFailed exception if validation was not successful
@@ -2131,8 +2135,12 @@

return exc

if cache_profile:
self.apply(reference)

return None

@log_exceptions_and_usage
def get_validation_reference(
self, name: str, allow_cache: bool = False
) -> ValidationReference:
@@ -2142,9 +2150,11 @@
Raises:
ValidationReferenceNotFoundException: The validation reference could not be found.
"""
return self._registry.get_validation_reference(
ref = self._registry.get_validation_reference(
name, project=self.project, allow_cache=allow_cache
)
ref._dataset = self.get_saved_dataset(ref.dataset_name)
return ref


def _validate_entity_values(join_key_values: Dict[str, List[Value]]):
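Putting the `FeatureStore` changes together, here is a rough sketch of the updated Python API. The repo path, object names, and timestamps are assumptions for illustration, and a configured feature repository with logged features is presumed to exist.

```python
# Sketch only: assumes an existing feature repo with logged features.
from datetime import datetime, timedelta
from feast import FeatureStore

store = FeatureStore(repo_path=".")

feature_service = store.get_feature_service("test_service")
# get_validation_reference now also attaches the underlying saved dataset,
# so the reference can recompute its profile if no cached profile exists.
reference = store.get_validation_reference("test_reference")

result = store.validate_logged_features(
    source=feature_service,
    reference=reference,
    start=datetime.utcnow() - timedelta(days=7),
    end=datetime.utcnow(),
    throw_exception=False,  # return ValidationFailed instead of raising it
    cache_profile=True,     # new flag: persist the computed profile back to the registry
)
if result is not None:
    print([e.to_dict() for e in result.report.errors])
```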
4 changes: 2 additions & 2 deletions sdk/python/feast/infra/passthrough_provider.py
@@ -278,6 +278,6 @@ def retrieve_feature_service_logs(
join_key_columns=[],
feature_name_columns=columns,
timestamp_field=ts_column,
start_date=start_date,
end_date=end_date,
start_date=make_tzaware(start_date),
end_date=make_tzaware(end_date),
)
4 changes: 1 addition & 3 deletions sdk/python/feast/registry.py
@@ -924,9 +924,7 @@ def get_validation_reference(
validation_reference.name == name
and validation_reference.project == project
):
return ValidationReference.from_proto(
validation_reference, list(registry_proto.saved_datasets)
)
return ValidationReference.from_proto(validation_reference)
raise ValidationReferenceNotFound(name, project=project)

def delete_validation_reference(self, name: str, project: str, commit: bool = True):
72 changes: 46 additions & 26 deletions sdk/python/feast/saved_dataset.py
@@ -208,60 +208,80 @@ def to_arrow(self) -> pyarrow.Table:
return self._retrieval_job.to_arrow()

def as_reference(self, name: str, profiler: "Profiler") -> "ValidationReference":
return ValidationReference(name=name, profiler=profiler, reference_dataset=self)
return ValidationReference.from_saved_dataset(
name=name, profiler=profiler, dataset=self
)

def get_profile(self, profiler: Profiler) -> Profile:
return profiler.analyze_dataset(self.to_df())


class ValidationReference:
name: str
reference_dataset: SavedDataset
dataset_name: str
profiler: Profiler

_cached_profile: Optional[Profile] = None
_profile: Optional[Profile] = None
_dataset: Optional[SavedDataset] = None

def __init__(self, name: str, reference_dataset: SavedDataset, profiler: Profiler):
def __init__(self, name: str, dataset_name: str, profiler: Profiler):
[Review comment — Member] We should add description and tags here, since this is effectively a new top level object in feast?

self.name = name
self.reference_dataset = reference_dataset
self.dataset_name = dataset_name
self.profiler = profiler

@classmethod
def from_saved_dataset(cls, name: str, dataset: SavedDataset, profiler: Profiler):
ref = ValidationReference(name, dataset.name, profiler)
ref._dataset = dataset
return ref

@property
def profile(self) -> Profile:
if not self._cached_profile:
self._cached_profile = self.profiler.analyze_dataset(
self.reference_dataset.to_df()
)
return self._cached_profile
if not self._profile:
if not self._dataset:
raise RuntimeError(
"In order to calculate a profile validation reference must be instantiated from a saved dataset. "
"Use ValidationReference.from_saved_dataset constructor or FeatureStore.get_validation_reference "
"to get validation reference object."
)

self._profile = self.profiler.analyze_dataset(self._dataset.to_df())
return self._profile

@classmethod
def from_proto(
cls, proto: ValidationReferenceProto, saved_datasets: List[SavedDatasetProto]
) -> "ValidationReference":
profiler = GEProfiler.from_proto(proto.ge_profiler)
saved_dataset_proto = [
sd for sd in saved_datasets if sd.spec.name == proto.reference_dataset_name
][0]
self = ValidationReference(
def from_proto(cls, proto: ValidationReferenceProto) -> "ValidationReference":
profiler_attr = proto.WhichOneof("profiler")
if profiler_attr == "ge_profiler":
profiler = GEProfiler.from_proto(proto.ge_profiler)
else:
raise RuntimeError("Unrecognized profiler")

profile_attr = proto.WhichOneof("cached_profile")
if profile_attr == "ge_profile":
profile = GEProfile.from_proto(proto.ge_profile)
elif not profile_attr:
profile = None
else:
raise RuntimeError("Unrecognized profile")

ref = ValidationReference(
name=proto.name,
reference_dataset=SavedDataset.from_proto(saved_dataset_proto),
dataset_name=proto.reference_dataset_name,
profiler=profiler,
)
ref._profile = profile

if proto.ge_profile:
self._cached_profile = GEProfile.from_proto(proto.ge_profile)

return self
return ref

def to_proto(self) -> ValidationReferenceProto:
proto = ValidationReferenceProto(
name=self.name,
reference_dataset_name=self.reference_dataset.name,
reference_dataset_name=self.dataset_name,
ge_profiler=self.profiler.to_proto()
if isinstance(self.profiler, GEProfiler)
else None,
ge_profile=self._cached_profile.to_proto()
if isinstance(self._cached_profile, GEProfile)
ge_profile=self._profile.to_proto()
if isinstance(self._profile, GEProfile)
else None,
)

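To summarize the refactor: a `ValidationReference` now stores only the dataset name and resolves the underlying `SavedDataset` lazily when a profile has to be computed. A minimal sketch, under the assumption that a `saved_dataset` and a `profiler` object already exist:

```python
# Sketch: `saved_dataset` and `profiler` are assumed to exist already.
from feast.saved_dataset import ValidationReference

# Preferred path: keeps a handle on the dataset, so .profile can be computed on demand.
ref = ValidationReference.from_saved_dataset(
    name="test_reference", dataset=saved_dataset, profiler=profiler
)
_ = ref.profile  # analyzes the saved dataset and caches the resulting profile

# Constructing from just the dataset name keeps the reference lightweight, but accessing
# .profile without an attached dataset or cached profile now raises RuntimeError.
bare_ref = ValidationReference("test_reference", saved_dataset.name, profiler)
```

Retrieving the reference through `FeatureStore.get_validation_reference` avoids the bare case, since it reattaches the saved dataset before returning.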
88 changes: 88 additions & 0 deletions sdk/python/tests/integration/e2e/test_validation.py
@@ -1,4 +1,6 @@
import datetime
import shutil
from pathlib import Path

import pandas as pd
import pyarrow as pa
@@ -24,6 +26,7 @@
driver,
location,
)
from tests.utils.cli_utils import CliRunner
from tests.utils.logged_features import prepare_logs

_features = [
@@ -257,3 +260,88 @@ def validate():

success = wait_retry_backoff(validate, timeout_secs=30)
assert success, "Validation failed (unexpectedly)"


@pytest.mark.integration
def test_e2e_validation_via_cli(environment, universal_data_sources):
runner = CliRunner()
store = environment.feature_store

(_, datasets, data_sources) = universal_data_sources
feature_views = construct_universal_feature_views(data_sources)
feature_service = FeatureService(
name="test_service",
features=[
feature_views.customer[
["current_balance", "avg_passenger_count", "lifetime_trip_count"]
],
],
logging_config=LoggingConfig(
destination=environment.data_source_creator.create_logged_features_destination()
),
)
store.apply([customer(), feature_service, feature_views.customer])

entity_df = datasets.entity_df.drop(
columns=["order_id", "origin_id", "destination_id", "driver_id"]
)
retrieval_job = store.get_historical_features(
entity_df=entity_df, features=feature_service, full_feature_names=True
)
logs_df = prepare_logs(retrieval_job.to_df(), feature_service, store)
saved_dataset = store.create_saved_dataset(
from_=retrieval_job,
name="reference_for_validating_logged_features",
storage=environment.data_source_creator.create_saved_dataset_destination(),
)
reference = saved_dataset.as_reference(
name="test_reference", profiler=configurable_profiler
)

schema = FeatureServiceLoggingSource(
feature_service=feature_service, project=store.project
).get_schema(store._registry)
store.write_logged_features(
pa.Table.from_pandas(logs_df, schema=schema), source=feature_service
)

with runner.local_repo(example_repo_py="", offline_store="file") as local_repo:
local_repo.apply(
[customer(), feature_views.customer, feature_service, reference]
)
local_repo._registry.apply_saved_dataset(saved_dataset, local_repo.project)
validate_args = [
"validate",
"--feature-service",
feature_service.name,
"--reference",
reference.name,
(datetime.datetime.utcnow() - datetime.timedelta(days=7)).isoformat(),
datetime.datetime.utcnow().isoformat(),
]
p = runner.run(validate_args, cwd=local_repo.repo_path)

assert p.returncode == 0, p.stderr.decode()
assert "Validation successful" in p.stdout.decode(), p.stderr.decode()

# make sure second validation will use cached profile
shutil.rmtree(saved_dataset.storage.file_options.uri)

# Add some invalid data that would lead to failed validation
invalid_data = pd.DataFrame(
data={
"customer_id": [0],
"current_balance": [0],
"avg_passenger_count": [0],
"lifetime_trip_count": [0],
"event_timestamp": [datetime.datetime.utcnow()],
}
)
invalid_logs = prepare_logs(invalid_data, feature_service, store)
store.write_logged_features(
pa.Table.from_pandas(invalid_logs, schema=schema), source=feature_service
)

p = runner.run(validate_args, cwd=local_repo.repo_path)
assert p.returncode == 1, p.stdout.decode()
assert "Validation failed" in p.stdout.decode(), p.stderr.decode()
4 changes: 3 additions & 1 deletion sdk/python/tests/utils/cli_utils.py
@@ -33,7 +33,9 @@ class CliRunner:
"""

def run(self, args: List[str], cwd: Path) -> subprocess.CompletedProcess:
return subprocess.run([sys.executable, cli.__file__] + args, cwd=cwd)
return subprocess.run(
[sys.executable, cli.__file__] + args, cwd=cwd, capture_output=True
)

def run_with_output(self, args: List[str], cwd: Path) -> Tuple[int, bytes]:
try:
7 changes: 6 additions & 1 deletion sdk/python/tests/utils/logged_features.py
@@ -29,8 +29,13 @@ def prepare_logs(

for projection in feature_service.feature_view_projections:
for feature in projection.features:
logs_df[f"{projection.name_to_use()}__{feature.name}"] = source_df[
source_field = (
feature.name
if feature.name in source_df.columns
else f"{projection.name_to_use()}__{feature.name}"
)
logs_df[f"{projection.name_to_use()}__{feature.name}"] = source_df[
source_field
]
logs_df[
f"{projection.name_to_use()}__{feature.name}__timestamp"