Skip to content

Commit

Permalink
[per-stream cdk] Support deserialization of legacy and per-stream sta…
Browse files Browse the repository at this point in the history
…te (airbytehq#16205)

* interpret legacy and new per-stream format into AirbyteStateMessages

* add ConnectorStateManager stubs for future work

* remove frozen for the time being until we need to hash descriptors

* add validation that AirbyteStateMessage has at least one of stream, global, or data fields

* pr feedback and clean up of the code

* remove changes to airbyte_protocol and perform validation in read_state()

* fix import formatting
  • Loading branch information
brianjlai authored Sep 7, 2022
1 parent fd66f1f commit 1d9608c
Show file tree
Hide file tree
Showing 5 changed files with 322 additions and 23 deletions.
15 changes: 10 additions & 5 deletions airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#


import copy
import logging
from abc import ABC, abstractmethod
from datetime import datetime
from functools import lru_cache
from typing import Any, Dict, Iterator, List, Mapping, MutableMapping, Optional, Tuple
from typing import Any, Dict, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union

from airbyte_cdk.models import (
AirbyteCatalog,
Expand All @@ -22,6 +20,7 @@
SyncMode,
)
from airbyte_cdk.models import Type as MessageType
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.source import Source
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.http.http import HttpStream
Expand Down Expand Up @@ -91,10 +90,12 @@ def read(
logger: logging.Logger,
config: Mapping[str, Any],
catalog: ConfiguredAirbyteCatalog,
state: MutableMapping[str, Any] = None,
state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
) -> Iterator[AirbyteMessage]:
"""Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.io/architecture/airbyte-protocol."""
connector_state = copy.deepcopy(state or {})
state_manager = ConnectorStateManager(state=state)
connector_state = state_manager.get_legacy_state()

logger.info(f"Starting syncing {self.name}")
config, internal_config = split_config(config)
# TODO assert all streams exist in the connector
Expand Down Expand Up @@ -133,6 +134,10 @@ def read(

logger.info(f"Finished syncing {self.name}")

@property
def per_stream_state_enabled(self) -> bool:
    """
    Feature flag telling whether per-stream state handling is switched on.

    :return: Always False for now
    """
    # Per-stream support in the CDK is still in active development, so the flag stays off.
    return False

def _read_stream(
self,
logger: logging.Logger,
Expand Down
54 changes: 54 additions & 0 deletions airbyte-cdk/python/airbyte_cdk/sources/connector_state_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#

import copy
from typing import Any, List, Mapping, MutableMapping, Union

from airbyte_cdk.models import AirbyteStateBlob, AirbyteStateMessage, AirbyteStateType


class ConnectorStateManager:
    """
    ConnectorStateManager consolidates the various forms of a stream's incoming state message (STREAM / GLOBAL / LEGACY) under a common
    interface. It also provides methods to extract and update state.

    For the time being only the legacy state is persisted, which is what abstract_source.read() consumes. The per-stream
    accessors are stubs that will be implemented in subsequent PRs.
    """

    def __init__(self, state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None):
        """
        Store the incoming state as a legacy state mapping on ``self.legacy``.

        :param state: Either a list of AirbyteStateMessage objects or the original legacy JSON object (a mapping). Any falsy
            value (None, an empty list, an empty mapping) is treated as "no state".
        :raises ValueError: If a non-empty state is neither a single-element LEGACY message list nor a mapping
        """
        if not state:
            self.legacy = {}
        elif self.is_migrated_legacy_state(state):
            # The legacy state format received from the platform is parsed and stored as a single AirbyteStateMessage when reading
            # the file. This is used for input backwards compatibility.
            # NOTE(review): `data` is presumably optional on the message model; fall back to an empty mapping so that
            # get_legacy_state() never hands out None.
            self.legacy = state[0].data or {}
        elif isinstance(state, MutableMapping):
            # In the event that legacy state comes in as its original JSON object format, no changes to the input need to be made
            self.legacy = state
        else:
            raise ValueError("Input state should come in the form of list of Airbyte state messages or a mapping of states")

    def get_stream_state(self, namespace: str, stream_name: str) -> AirbyteStateBlob:
        """
        Placeholder for per-stream state lookup; currently does nothing and returns None.
        """
        # todo implement in upcoming PRs
        pass

    def get_legacy_state(self) -> MutableMapping[str, Any]:
        """
        Returns a deep copy of the current legacy state dictionary made up of the state of all streams for a connector
        :return: A copy of the legacy state
        """
        # deepcopy's second positional argument is its internal memo table, not a fallback value, so it is left out.
        return copy.deepcopy(self.legacy)

    def update_state_for_stream(self, namespace: str, stream_name: str, value: Mapping[str, Any]):
        """
        Placeholder for per-stream state updates; currently does nothing.
        """
        # todo implement in upcoming PRs
        pass

    @staticmethod
    def is_migrated_legacy_state(state: List[AirbyteStateMessage]) -> bool:
        """
        Tell whether the input is legacy state that was wrapped into a single AirbyteStateMessage.

        :param state: The candidate state input
        :return: True only for a list holding exactly one AirbyteStateMessage whose type is LEGACY
        """
        return (
            isinstance(state, list)
            and len(state) == 1
            and isinstance(state[0], AirbyteStateMessage)
            and state[0].type == AirbyteStateType.LEGACY
        )
39 changes: 30 additions & 9 deletions airbyte-cdk/python/airbyte_cdk/sources/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@
import json
import logging
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Any, Dict, Generic, Iterable, Mapping, MutableMapping, TypeVar
from typing import Any, Generic, Iterable, List, Mapping, MutableMapping, TypeVar, Union

from airbyte_cdk.connector import BaseConnector, DefaultConnectorMixin, TConfig
from airbyte_cdk.models import AirbyteCatalog, AirbyteMessage, ConfiguredAirbyteCatalog
from airbyte_cdk.models import AirbyteCatalog, AirbyteMessage, AirbyteStateMessage, AirbyteStateType, ConfiguredAirbyteCatalog

TState = TypeVar("TState")
TCatalog = TypeVar("TCatalog")
Expand Down Expand Up @@ -39,15 +38,37 @@ def discover(self, logger: logging.Logger, config: TConfig) -> AirbyteCatalog:
"""


class Source(DefaultConnectorMixin, BaseSource[Mapping[str, Any], MutableMapping[str, Any], ConfiguredAirbyteCatalog], ABC):
class Source(
DefaultConnectorMixin,
BaseSource[Mapping[str, Any], Union[List[AirbyteStateMessage], MutableMapping[str, Any]], ConfiguredAirbyteCatalog],
ABC,
):
# can be overridden to change an input state
def read_state(self, state_path: str) -> List[AirbyteStateMessage]:
    """
    Retrieves the input state of a sync by reading from the specified JSON file. Incoming state can be deserialized into either
    a JSON object for legacy state input or as a list of AirbyteStateMessages for the per-stream state format. Regardless of the
    incoming input type, it will always be transformed and output as a list of AirbyteStateMessage(s).
    :param state_path: The filepath to where the stream states are located
    :return: The complete stream state based on the connector's previous sync; an empty list when no path is given or the
        file holds an empty JSON value
    :raises ValueError: If a per-stream entry has none of its stream, data, or global_ fields set
    """
    if not state_path:
        return []

    # A context manager guarantees the file handle is closed even if JSON decoding fails.
    with open(state_path, "r") as state_file:
        state_obj = json.load(state_file)
    if not state_obj:
        return []

    if not isinstance(state_obj, list):
        # When the legacy JSON object format is received, always outputting an AirbyteStateMessage simplifies processing downstream
        return [AirbyteStateMessage(type=AirbyteStateType.LEGACY, data=state_obj)]

    parsed_state_messages = []
    for state in state_obj:
        parsed_message = AirbyteStateMessage.parse_obj(state)
        if not parsed_message.stream and not parsed_message.data and not parsed_message.global_:
            raise ValueError("AirbyteStateMessage should contain either a stream, global, or state field")
        parsed_state_messages.append(parsed_message)
    return parsed_state_messages

# can be overridden to change an input catalog
def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#

from contextlib import nullcontext as does_not_raise

import pytest
from airbyte_cdk.models import AirbyteStateMessage, AirbyteStateType
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager


@pytest.mark.parametrize(
    "input_state, expected_legacy_state, expected_error",
    [
        pytest.param(
            [AirbyteStateMessage(type=AirbyteStateType.LEGACY, data={"actresses": {"id": "seehorn_rhea"}})],
            {"actresses": {"id": "seehorn_rhea"}},
            does_not_raise(),
            id="test_legacy_input_state",
        ),
        pytest.param(
            {"actors": {"created_at": "1962-10-22"}, "actresses": {"id": "seehorn_rhea"}},
            {"actors": {"created_at": "1962-10-22"}, "actresses": {"id": "seehorn_rhea"}},
            does_not_raise(),
            id="test_supports_legacy_json_blob",
        ),
        pytest.param({}, {}, does_not_raise(), id="test_initialize_empty_mapping_by_default"),
        pytest.param([], {}, does_not_raise(), id="test_initialize_empty_state"),
        pytest.param(
            "strings_are_not_allowed",
            None,
            pytest.raises(ValueError),
            id="test_value_error_is_raised_on_invalid_state_input",
        ),
    ],
)
def test_get_legacy_state(input_state, expected_legacy_state, expected_error):
    """
    Build a ConnectorStateManager from each supported (and one unsupported) input shape and compare the legacy state it
    reports against the expectation; the invalid input case must raise from the constructor instead.
    """
    with expected_error:
        # Construction happens inside the context so the ValueError case is captured by pytest.raises.
        manager = ConnectorStateManager(input_state)
        assert manager.get_legacy_state() == expected_legacy_state
Loading

0 comments on commit 1d9608c

Please sign in to comment.