diff --git a/airbyte-config-oss/init-oss/src/main/resources/seed/destination_definitions.yaml b/airbyte-config-oss/init-oss/src/main/resources/seed/destination_definitions.yaml index c2f135bd7554e..cb3445d0ef3ff 100644 --- a/airbyte-config-oss/init-oss/src/main/resources/seed/destination_definitions.yaml +++ b/airbyte-config-oss/init-oss/src/main/resources/seed/destination_definitions.yaml @@ -377,7 +377,7 @@ - name: Snowflake destinationDefinitionId: 424892c4-daac-4491-b35d-c6688ba547ba dockerRepository: airbyte/destination-snowflake - dockerImageTag: 1.0.1 + dockerImageTag: 1.0.2 documentationUrl: https://docs.airbyte.com/integrations/destinations/snowflake icon: snowflake.svg normalizationConfig: diff --git a/airbyte-config-oss/init-oss/src/main/resources/seed/destination_specs.yaml b/airbyte-config-oss/init-oss/src/main/resources/seed/destination_specs.yaml index 34d4db768358e..77135465e7701 100644 --- a/airbyte-config-oss/init-oss/src/main/resources/seed/destination_specs.yaml +++ b/airbyte-config-oss/init-oss/src/main/resources/seed/destination_specs.yaml @@ -6604,7 +6604,7 @@ supported_destination_sync_modes: - "overwrite" - "append" -- dockerImage: "airbyte/destination-snowflake:1.0.1" +- dockerImage: "airbyte/destination-snowflake:1.0.2" spec: documentationUrl: "https://docs.airbyte.com/integrations/destinations/snowflake" connectionSpecification: diff --git a/airbyte-config-oss/init-oss/src/main/resources/seed/oss_catalog.json b/airbyte-config-oss/init-oss/src/main/resources/seed/oss_catalog.json index a2439d5b72cae..2d264fce93ac1 100644 --- a/airbyte-config-oss/init-oss/src/main/resources/seed/oss_catalog.json +++ b/airbyte-config-oss/init-oss/src/main/resources/seed/oss_catalog.json @@ -6380,7 +6380,7 @@ "destinationDefinitionId": "424892c4-daac-4491-b35d-c6688ba547ba", "name": "Snowflake", "dockerRepository": "airbyte/destination-snowflake", - "dockerImageTag": "1.0.1", + "dockerImageTag": "1.0.2", "documentationUrl": "https://docs.airbyte.com/integrations/destinations/snowflake", "icon": "snowflake.svg", "spec": { diff --git a/airbyte-integrations/bases/base-java-async/.dockerignore b/airbyte-integrations/bases/base-java-async/.dockerignore new file mode 100644 index 0000000000000..70cd13cb50b78 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/.dockerignore @@ -0,0 +1,5 @@ +* +!Dockerfile +!build +!javabase.sh +!run_with_normalization.sh diff --git a/airbyte-integrations/bases/base-java-async/Dockerfile b/airbyte-integrations/bases/base-java-async/Dockerfile new file mode 100644 index 0000000000000..34e5d0cdbf104 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/Dockerfile @@ -0,0 +1,26 @@ +ARG JDK_VERSION=17.0.4 +FROM amazoncorretto:${JDK_VERSION} +COPY --from=airbyte/integration-base:dev /airbyte /airbyte + +RUN yum install -y tar openssl && yum clean all + +WORKDIR /airbyte + +# Add the Datadog Java APM agent +ADD https://dtdg.co/latest-java-tracer dd-java-agent.jar + +COPY javabase.sh . +COPY run_with_normalization.sh . 
+ +# airbyte base commands +ENV AIRBYTE_SPEC_CMD "/airbyte/javabase.sh --spec" +ENV AIRBYTE_CHECK_CMD "/airbyte/javabase.sh --check" +ENV AIRBYTE_DISCOVER_CMD "/airbyte/javabase.sh --discover" +ENV AIRBYTE_READ_CMD "/airbyte/javabase.sh --read" +ENV AIRBYTE_WRITE_CMD "/airbyte/javabase.sh --write" + +ENV AIRBYTE_ENTRYPOINT "/airbyte/base.sh" +ENTRYPOINT ["/airbyte/base.sh"] + +LABEL io.airbyte.version=0.1.2 +LABEL io.airbyte.name=airbyte/integration-base-java diff --git a/airbyte-integrations/bases/base-java-async/build.gradle b/airbyte-integrations/bases/base-java-async/build.gradle new file mode 100644 index 0000000000000..6bbbf4e847ff5 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/build.gradle @@ -0,0 +1,30 @@ +plugins { + id 'java-library' + id 'airbyte-docker' +} + +dependencies { + implementation libs.airbyte.protocol + implementation project(':airbyte-config-oss:config-models-oss') + implementation project(':airbyte-commons-cli') + implementation project(':airbyte-json-validation') + + implementation 'commons-cli:commons-cli:1.4' + implementation 'net.i2p.crypto:eddsa:0.3.0' + implementation 'org.apache.sshd:sshd-mina:2.8.0' + // bouncycastle is pinned to version-match the transitive dependency from kubernetes client-java + // because a version conflict causes "parameter object not a ECParameterSpec" on ssh tunnel initiation + implementation 'org.bouncycastle:bcprov-jdk15on:1.66' + implementation 'org.bouncycastle:bcpkix-jdk15on:1.66' + implementation 'org.bouncycastle:bctls-jdk15on:1.66' + + implementation libs.jackson.annotations + implementation libs.connectors.testcontainers + implementation libs.connectors.testcontainers.jdbc + implementation libs.bundles.datadog + + implementation files(project(':airbyte-integrations:bases:base').airbyteDocker.outputs) + + testImplementation 'commons-lang:commons-lang:2.6' + implementation group: 'org.apache.logging.log4j', name: 'log4j-layout-template-json', version: '2.17.2' +} diff --git a/airbyte-integrations/bases/base-java-async/javabase.sh b/airbyte-integrations/bases/base-java-async/javabase.sh new file mode 100755 index 0000000000000..34302052d40bc --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/javabase.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +set -e + +# if IS_CAPTURE_HEAP_DUMP_ON_ERROR is set to true, then will capture Heap dump on OutOfMemory error +if [[ $IS_CAPTURE_HEAP_DUMP_ON_ERROR = true ]]; then + + arrayOfSupportedConnectors=("source-postgres" "source-mssql" "source-mysql" ) + + # The heap dump would be captured only in case when java-based connector fails with OutOfMemory error + if [[ " ${arrayOfSupportedConnectors[*]} " =~ " $APPLICATION " ]]; then + JAVA_OPTS=$JAVA_OPTS" -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/data/dump.hprof" + export JAVA_OPTS + echo "Added JAVA_OPTS=$JAVA_OPTS" + echo "APPLICATION=$APPLICATION" + fi +fi + +# Wrap run script in a script so that we can lazy evaluate the value of APPLICATION. APPLICATION is +# set by the dockerfile that inherits base-java, so it cannot be evaluated when base-java is built. +# We also need to make sure that stdin of the script is piped to the stdin of the java application. 
+if [[ $A = --write ]]; then + cat <&0 | /airbyte/bin/"$APPLICATION" "$@" +else + /airbyte/bin/"$APPLICATION" "$@" +fi diff --git a/airbyte-integrations/bases/base-java-async/run_with_normalization.sh b/airbyte-integrations/bases/base-java-async/run_with_normalization.sh new file mode 100755 index 0000000000000..261284ef5a982 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/run_with_normalization.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Intentionally no set -e, because we want to run normalization even if the destination fails +set -o pipefail + +/airbyte/base.sh $@ +destination_exit_code=$? + +if test "$1" != 'write' +then + normalization_exit_code=0 +elif test "$NORMALIZATION_TECHNIQUE" = 'LEGACY' +then + echo '{"type": "LOG","log":{"level":"INFO","message":"Starting in-connector normalization"}}' + # the args in a write command are `write --catalog foo.json --config bar.json` + # so if we remove the `write`, we can just pass the rest directly into normalization + /airbyte/entrypoint.sh run ${@:2} --integration-type $AIRBYTE_NORMALIZATION_INTEGRATION | java -cp "/airbyte/lib/*" io.airbyte.integrations.destination.normalization.NormalizationLogParser + normalization_exit_code=$? + echo '{"type": "LOG","log":{"level":"INFO","message":"Completed in-connector normalization"}}' +else + echo '{"type": "LOG","log":{"level":"INFO","message":"Skipping in-connector normalization"}}' + normalization_exit_code=0 +fi + +if test $destination_exit_code -ne 0 +then + exit $destination_exit_code +elif test $normalization_exit_code -ne 0 +then + exit $normalization_exit_code +else + exit 0 +fi diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/BaseConnector.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/BaseConnector.java new file mode 100644 index 0000000000000..0d7233c1aaaa6 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/BaseConnector.java @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations; + +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.resources.MoreResources; +import io.airbyte.integrations.base.Integration; +import io.airbyte.protocol.models.v0.ConnectorSpecification; + +public abstract class BaseConnector implements Integration { + + /** + * By convention the spec is stored as a resource for java connectors. That resource is called + * spec.json. + * + * @return specification. + * @throws Exception - any exception. + */ + @Override + public ConnectorSpecification spec() throws Exception { + // return a JsonSchema representation of the spec for the integration. + final String resourceString = MoreResources.readResource("spec.json"); + return Jsons.deserialize(resourceString, ConnectorSpecification.class); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/AirbyteExceptionHandler.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/AirbyteExceptionHandler.java new file mode 100644 index 0000000000000..be4e1a4d914c4 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/AirbyteExceptionHandler.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class AirbyteExceptionHandler implements Thread.UncaughtExceptionHandler { + + private static final Logger LOGGER = LoggerFactory.getLogger(AirbyteExceptionHandler.class); + public static final String logMessage = "Something went wrong in the connector. See the logs for more details."; + + @Override + public void uncaughtException(Thread t, Throwable e) { + // This is a naive AirbyteTraceMessage emission in order to emit one when any error occurs in a + // connector. + // If a connector implements AirbyteTraceMessage emission itself, this code will result in an + // additional one being emitted. + // this is fine tho because: + // "The earliest AirbyteTraceMessage where type=error will be used to populate the FailureReason for + // the sync." + // from the spec: + // https://docs.google.com/document/d/1ctrj3Yh_GjtQ93aND-WH3ocqGxsmxyC3jfiarrF6NY0/edit# + LOGGER.error(logMessage, e); + AirbyteTraceMessageUtility.emitSystemErrorTrace(e, logMessage); + terminate(); + } + + // by doing this in a separate method we can mock it to avoid closing the jvm and therefore test + // properly + protected void terminate() { + System.exit(1); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/AirbyteMessageConsumer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/AirbyteMessageConsumer.java new file mode 100644 index 0000000000000..8322ff9ed0f22 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/AirbyteMessageConsumer.java @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import io.airbyte.commons.concurrency.VoidCallable; +import io.airbyte.commons.functional.CheckedConsumer; +import io.airbyte.protocol.models.v0.AirbyteMessage; + +/** + * Interface for the destination's consumption of incoming records wrapped in an + * {@link io.airbyte.protocol.models.v0.AirbyteMessage}. + * + * This is via the accept method, which commonly handles parsing, validation, batching and writing + * of the transformed data to the final destination i.e. the technical system data is being written + * to. + * + * Lifecycle: + * + * We encourage implementing this interface using the {@link FailureTrackingAirbyteMessageConsumer} + * class. + */ +public interface AirbyteMessageConsumer extends CheckedConsumer, AutoCloseable { + + void start() throws Exception; + + /** + * Consumes all {@link AirbyteMessage}s + * + * @param message {@link AirbyteMessage} to be processed + * @throws Exception + */ + @Override + void accept(AirbyteMessage message) throws Exception; + + /** + * Executes at the end of consumption of all incoming streamed data regardless of success or failure + * + * @throws Exception + */ + @Override + void close() throws Exception; + + /** + * Append a function to be called on {@link AirbyteMessageConsumer#close}. 
+ */ + static AirbyteMessageConsumer appendOnClose(final AirbyteMessageConsumer consumer, final VoidCallable voidCallable) { + return new AirbyteMessageConsumer() { + + @Override + public void start() throws Exception { + consumer.start(); + } + + @Override + public void accept(final AirbyteMessage message) throws Exception { + consumer.accept(message); + } + + @Override + public void close() throws Exception { + consumer.close(); + voidCallable.call(); + } + + }; + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/AirbyteTraceMessageUtility.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/AirbyteTraceMessageUtility.java new file mode 100644 index 0000000000000..d7e1e524bd134 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/AirbyteTraceMessageUtility.java @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import io.airbyte.commons.stream.AirbyteStreamStatusHolder; +import io.airbyte.protocol.models.v0.AirbyteErrorTraceMessage; +import io.airbyte.protocol.models.v0.AirbyteErrorTraceMessage.FailureType; +import io.airbyte.protocol.models.v0.AirbyteEstimateTraceMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import io.airbyte.protocol.models.v0.AirbyteTraceMessage; +import java.util.function.Consumer; +import org.apache.commons.lang3.exception.ExceptionUtils; + +public final class AirbyteTraceMessageUtility { + + private AirbyteTraceMessageUtility() {} + + public static void emitSystemErrorTrace(final Throwable e, final String displayMessage) { + emitErrorTrace(e, displayMessage, FailureType.SYSTEM_ERROR); + } + + public static void emitConfigErrorTrace(final Throwable e, final String displayMessage) { + emitErrorTrace(e, displayMessage, FailureType.CONFIG_ERROR); + } + + public static void emitEstimateTrace(final long byteEstimate, + final AirbyteEstimateTraceMessage.Type type, + final long rowEstimate, + final String streamName, + final String streamNamespace) { + emitMessage(makeAirbyteMessageFromTraceMessage( + makeAirbyteTraceMessage(AirbyteTraceMessage.Type.ESTIMATE) + .withEstimate(new AirbyteEstimateTraceMessage() + .withByteEstimate(byteEstimate) + .withType(type) + .withRowEstimate(rowEstimate) + .withName(streamName) + .withNamespace(streamNamespace)))); + } + + public static void emitErrorTrace(final Throwable e, final String displayMessage, final FailureType failureType) { + emitMessage(makeErrorTraceAirbyteMessage(e, displayMessage, failureType)); + } + + public static void emitStreamStatusTrace(final AirbyteStreamStatusHolder airbyteStreamStatusHolder) { + emitMessage(makeStreamStatusTraceAirbyteMessage(airbyteStreamStatusHolder)); + } + + // todo: handle the other types of trace message we'll expect in the future, see + // io.airbyte.protocol.models.v0.AirbyteTraceMessage + // & the tech spec: + // https://docs.google.com/document/d/1ctrj3Yh_GjtQ93aND-WH3ocqGxsmxyC3jfiarrF6NY0/edit# + // public void emitNotificationTrace() {} + // public void emitMetricTrace() {} + + private static void emitMessage(final AirbyteMessage message) { + // Not sure why defaultOutputRecordCollector is under Destination specifically, + // but this matches usage elsewhere in base-java + final Consumer outputRecordCollector = Destination::defaultOutputRecordCollector; + 
outputRecordCollector.accept(message); + } + + private static AirbyteMessage makeErrorTraceAirbyteMessage( + final Throwable e, + final String displayMessage, + final FailureType failureType) { + + return makeAirbyteMessageFromTraceMessage( + makeAirbyteTraceMessage(AirbyteTraceMessage.Type.ERROR) + .withError(new AirbyteErrorTraceMessage() + .withFailureType(failureType) + .withMessage(displayMessage) + .withInternalMessage(e.toString()) + .withStackTrace(ExceptionUtils.getStackTrace(e)))); + } + + private static AirbyteMessage makeStreamStatusTraceAirbyteMessage(final AirbyteStreamStatusHolder airbyteStreamStatusHolder) { + return makeAirbyteMessageFromTraceMessage(airbyteStreamStatusHolder.toTraceMessage()); + } + + private static AirbyteMessage makeAirbyteMessageFromTraceMessage(final AirbyteTraceMessage airbyteTraceMessage) { + return new AirbyteMessage().withType(Type.TRACE).withTrace(airbyteTraceMessage); + } + + private static AirbyteTraceMessage makeAirbyteTraceMessage(final AirbyteTraceMessage.Type traceMessageType) { + return new AirbyteTraceMessage().withType(traceMessageType).withEmittedAt((double) System.currentTimeMillis()); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Command.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Command.java new file mode 100644 index 0000000000000..e37502894bb89 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Command.java @@ -0,0 +1,13 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +public enum Command { + SPEC, + CHECK, + DISCOVER, + READ, + WRITE +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/CommitOnStateAirbyteMessageConsumer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/CommitOnStateAirbyteMessageConsumer.java new file mode 100644 index 0000000000000..b7fded66b551c --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/CommitOnStateAirbyteMessageConsumer.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import java.util.function.Consumer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Minimal abstract class intended to handle the case where the destination can commit records every + * time a state message appears. This class does that commit and then immediately emits the state + * message. This should only be used in cases when the commit is relatively cheap. immediately. 
+ */ +public abstract class CommitOnStateAirbyteMessageConsumer extends FailureTrackingAirbyteMessageConsumer implements AirbyteMessageConsumer { + + private static final Logger LOGGER = LoggerFactory.getLogger(CommitOnStateAirbyteMessageConsumer.class); + + private final Consumer outputRecordCollector; + + public CommitOnStateAirbyteMessageConsumer(final Consumer outputRecordCollector) { + this.outputRecordCollector = outputRecordCollector; + } + + @Override + public void accept(final AirbyteMessage message) throws Exception { + if (message.getType() == Type.STATE) { + commit(); + outputRecordCollector.accept(message); + } + super.accept(message); + } + + public abstract void commit() throws Exception; + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Destination.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Destination.java new file mode 100644 index 0000000000000..60e979f01c465 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Destination.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.json.Jsons; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import java.util.function.Consumer; + +public interface Destination extends Integration { + + /** + * Return a consumer that writes messages to the destination. + * + * @param config - integration-specific configuration object as json. e.g. { "username": "airbyte", + * "password": "super secure" } + * @param catalog - schema of the incoming messages. + * @return Consumer that accepts message. The {@link AirbyteMessageConsumer#accept(AirbyteMessage)} + * will be called n times where n is the number of messages. + * {@link AirbyteMessageConsumer#close()} will always be called once regardless of success + * or failure. + * @throws Exception - any exception. + */ + AirbyteMessageConsumer getConsumer(JsonNode config, + ConfiguredAirbyteCatalog catalog, + Consumer outputRecordCollector) + throws Exception; + + static void defaultOutputRecordCollector(final AirbyteMessage message) { + System.out.println(Jsons.serialize(message)); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/FailureTrackingAirbyteMessageConsumer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/FailureTrackingAirbyteMessageConsumer.java new file mode 100644 index 0000000000000..cc31dc4505fc0 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/FailureTrackingAirbyteMessageConsumer.java @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import io.airbyte.protocol.models.v0.AirbyteMessage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Minimal abstract class intended to provide a consistent structure to classes seeking to implement + * the {@link AirbyteMessageConsumer} interface. The original interface methods are wrapped in + * generic exception handlers - any exception is caught and logged. + * + * Two methods are intended for extension: + *
+ * <ul>
+ * <li>startTracked: Wraps set up of necessary infrastructure/configuration before message
+ * consumption.</li>
+ * <li>acceptTracked: Wraps actual processing of each
+ * {@link io.airbyte.protocol.models.v0.AirbyteMessage}.</li>
+ * </ul>
+ * + * Though not necessary, we highly encourage using this class when implementing destinations. See + * child classes for examples. + */ +public abstract class FailureTrackingAirbyteMessageConsumer implements AirbyteMessageConsumer { + + private static final Logger LOGGER = LoggerFactory.getLogger(FailureTrackingAirbyteMessageConsumer.class); + + private boolean hasFailed = false; + + /** + * Wraps setup of necessary infrastructure/configuration before message consumption + * + * @throws Exception + */ + protected abstract void startTracked() throws Exception; + + @Override + public void start() throws Exception { + try { + startTracked(); + } catch (final Exception e) { + LOGGER.error("Exception while starting consumer", e); + hasFailed = true; + throw e; + } + } + + /** + * Processing of AirbyteMessages with general functionality of storing STATE messages, serializing + * RECORD messages and storage within a buffer + * + * NOTE: Not all the functionality mentioned above is always true but generally applies + * + * @param msg {@link AirbyteMessage} to be processed + * @throws Exception + */ + protected abstract void acceptTracked(AirbyteMessage msg) throws Exception; + + @Override + public void accept(final AirbyteMessage msg) throws Exception { + try { + acceptTracked(msg); + } catch (final Exception e) { + LOGGER.error("Exception while accepting message", e); + hasFailed = true; + throw e; + } + } + + protected abstract void close(boolean hasFailed) throws Exception; + + @Override + public void close() throws Exception { + if (hasFailed) { + LOGGER.warn("Airbyte message consumer: failed."); + } else { + LOGGER.info("Airbyte message consumer: succeeded."); + } + close(hasFailed); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Integration.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Integration.java new file mode 100644 index 0000000000000..049cec7ccfaa3 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Integration.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus; +import io.airbyte.protocol.models.v0.ConnectorSpecification; + +public interface Integration { + + /** + * Fetch the specification for the integration. + * + * @return specification. + * @throws Exception - any exception. + */ + ConnectorSpecification spec() throws Exception; + + /** + * Check whether, given the current configuration, the integration can connect to the integration. + * + * @param config - integration-specific configuration object as json. e.g. { "username": "airbyte", + * "password": "super secure" } + * @return Whether or not the connection was successful. Optional message if it was not. + * @throws Exception - any exception. 
+ */ + AirbyteConnectionStatus check(JsonNode config) throws Exception; + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/IntegrationCliParser.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/IntegrationCliParser.java new file mode 100644 index 0000000000000..87e36d75db9d4 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/IntegrationCliParser.java @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import com.google.common.base.Preconditions; +import io.airbyte.commons.cli.Clis; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Map; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionGroup; +import org.apache.commons.cli.Options; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +// todo (cgardens) - use argparse4j.github.io instead of org.apache.commons.cli to leverage better +// sub-parser support. + +/** + * Parses command line args to a type safe config object for each command type. + */ +public class IntegrationCliParser { + + private static final Logger LOGGER = LoggerFactory.getLogger(IntegrationCliParser.class); + + private static final OptionGroup COMMAND_GROUP; + + static { + final var optionGroup = new OptionGroup(); + optionGroup.setRequired(true); + + optionGroup.addOption(Option.builder() + .longOpt(Command.SPEC.toString().toLowerCase()) + .desc("outputs the json configuration specification") + .build()); + optionGroup.addOption(Option.builder() + .longOpt(Command.CHECK.toString().toLowerCase()) + .desc("checks the config can be used to connect") + .build()); + optionGroup.addOption(Option.builder() + .longOpt(Command.DISCOVER.toString().toLowerCase()) + .desc("outputs a catalog describing the source's catalog") + .build()); + optionGroup.addOption(Option.builder() + .longOpt(Command.READ.toString().toLowerCase()) + .desc("reads the source and outputs messages to STDOUT") + .build()); + optionGroup.addOption(Option.builder() + .longOpt(Command.WRITE.toString().toLowerCase()) + .desc("writes messages from STDIN to the integration") + .build()); + + COMMAND_GROUP = optionGroup; + } + + public IntegrationConfig parse(final String[] args) { + final Command command = parseCommand(args); + return parseOptions(args, command); + } + + private static Command parseCommand(final String[] args) { + final Options options = new Options(); + options.addOptionGroup(COMMAND_GROUP); + + final CommandLine parsed = Clis.parse(args, options, Clis.getRelaxedParser()); + return Command.valueOf(parsed.getOptions()[0].getLongOpt().toUpperCase()); + } + + private static IntegrationConfig parseOptions(final String[] args, final Command command) { + + final Options options = new Options(); + options.addOptionGroup(COMMAND_GROUP); // so that the parser does not throw an exception when encounter command args. + + switch (command) { + case SPEC -> { + // no args. 
+ } + case CHECK, DISCOVER -> options.addOption(Option.builder() + .longOpt(JavaBaseConstants.ARGS_CONFIG_KEY) + .desc(JavaBaseConstants.ARGS_CONFIG_DESC) + .hasArg(true) + .required(true) + .build()); + case READ -> { + options.addOption(Option.builder() + .longOpt(JavaBaseConstants.ARGS_CONFIG_KEY) + .desc(JavaBaseConstants.ARGS_CONFIG_DESC) + .hasArg(true) + .required(true) + .build()); + options.addOption(Option.builder() + .longOpt(JavaBaseConstants.ARGS_CATALOG_KEY) + .desc(JavaBaseConstants.ARGS_CATALOG_DESC) + .hasArg(true) + .build()); + options.addOption(Option.builder() + .longOpt(JavaBaseConstants.ARGS_STATE_KEY) + .desc(JavaBaseConstants.ARGS_PATH_DESC) + .hasArg(true) + .build()); + } + case WRITE -> { + options.addOption(Option.builder() + .longOpt(JavaBaseConstants.ARGS_CONFIG_KEY) + .desc(JavaBaseConstants.ARGS_CONFIG_DESC) + .hasArg(true) + .required(true).build()); + options.addOption(Option.builder() + .longOpt(JavaBaseConstants.ARGS_CATALOG_KEY) + .desc(JavaBaseConstants.ARGS_CATALOG_DESC) + .hasArg(true) + .build()); + } + default -> throw new IllegalStateException("Unexpected value: " + command); + } + + final CommandLine parsed = Clis.parse(args, options, command.toString().toLowerCase()); + Preconditions.checkNotNull(parsed); + final Map argsMap = new HashMap<>(); + for (final Option option : parsed.getOptions()) { + argsMap.put(option.getLongOpt(), option.getValue()); + } + LOGGER.info("integration args: {}", argsMap); + + switch (command) { + case SPEC -> { + return IntegrationConfig.spec(); + } + case CHECK -> { + return IntegrationConfig.check(Path.of(argsMap.get(JavaBaseConstants.ARGS_CONFIG_KEY))); + } + case DISCOVER -> { + return IntegrationConfig.discover(Path.of(argsMap.get(JavaBaseConstants.ARGS_CONFIG_KEY))); + } + case READ -> { + return IntegrationConfig.read( + Path.of(argsMap.get(JavaBaseConstants.ARGS_CONFIG_KEY)), + Path.of(argsMap.get(JavaBaseConstants.ARGS_CATALOG_KEY)), + argsMap.containsKey(JavaBaseConstants.ARGS_STATE_KEY) ? Path.of(argsMap.get(JavaBaseConstants.ARGS_STATE_KEY)) : null); + } + case WRITE -> { + return IntegrationConfig.write( + Path.of(argsMap.get(JavaBaseConstants.ARGS_CONFIG_KEY)), + Path.of(argsMap.get(JavaBaseConstants.ARGS_CATALOG_KEY))); + } + default -> throw new IllegalStateException("Unexpected value: " + command); + } + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/IntegrationConfig.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/IntegrationConfig.java new file mode 100644 index 0000000000000..438ecceb9f420 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/IntegrationConfig.java @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base; + +import com.google.common.base.Preconditions; +import java.nio.file.Path; +import java.util.Objects; +import java.util.Optional; + +public class IntegrationConfig { + + private final Command command; + private final Path configPath; + private final Path catalogPath; + private final Path statePath; + + private IntegrationConfig(final Command command, final Path configPath, final Path catalogPath, final Path statePath) { + this.command = command; + this.configPath = configPath; + this.catalogPath = catalogPath; + this.statePath = statePath; + } + + public static IntegrationConfig spec() { + return new IntegrationConfig(Command.SPEC, null, null, null); + } + + public static IntegrationConfig check(final Path config) { + Preconditions.checkNotNull(config); + return new IntegrationConfig(Command.CHECK, config, null, null); + } + + public static IntegrationConfig discover(final Path config) { + Preconditions.checkNotNull(config); + return new IntegrationConfig(Command.DISCOVER, config, null, null); + } + + public static IntegrationConfig read(final Path configPath, final Path catalogPath, final Path statePath) { + Preconditions.checkNotNull(configPath); + Preconditions.checkNotNull(catalogPath); + return new IntegrationConfig(Command.READ, configPath, catalogPath, statePath); + } + + public static IntegrationConfig write(final Path configPath, final Path catalogPath) { + Preconditions.checkNotNull(configPath); + Preconditions.checkNotNull(catalogPath); + return new IntegrationConfig(Command.WRITE, configPath, catalogPath, null); + } + + public Command getCommand() { + return command; + } + + public Path getConfigPath() { + Preconditions.checkState(command != Command.SPEC); + return configPath; + } + + public Path getCatalogPath() { + Preconditions.checkState(command == Command.READ || command == Command.WRITE); + return catalogPath; + } + + public Optional getStatePath() { + Preconditions.checkState(command == Command.READ); + return Optional.ofNullable(statePath); + } + + @Override + public String toString() { + return "IntegrationConfig{" + + "command=" + command + + ", configPath='" + configPath + '\'' + + ", catalogPath='" + catalogPath + '\'' + + ", statePath='" + statePath + '\'' + + '}'; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final IntegrationConfig that = (IntegrationConfig) o; + return command == that.command && + Objects.equals(configPath, that.configPath) && + Objects.equals(catalogPath, that.catalogPath) && + Objects.equals(statePath, that.statePath); + } + + @Override + public int hashCode() { + return Objects.hash(command, configPath, catalogPath, statePath); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/IntegrationRunner.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/IntegrationRunner.java new file mode 100644 index 0000000000000..3f58d694a0fcb --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/IntegrationRunner.java @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonPropertyDescription; +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import datadog.trace.api.Trace; +import io.airbyte.commons.io.IOs; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.lang.Exceptions.Procedure; +import io.airbyte.commons.string.Strings; +import io.airbyte.commons.util.AutoCloseableIterator; +import io.airbyte.integrations.util.ApmTraceUtils; +import io.airbyte.integrations.util.ConnectorExceptionUtil; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import io.airbyte.validation.json.JsonSchemaValidator; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.List; +import java.util.Optional; +import java.util.Scanner; +import java.util.Set; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import org.apache.commons.lang3.ThreadUtils; +import org.apache.commons.lang3.concurrent.BasicThreadFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Accepts EITHER a destination or a source. Routes commands from the commandline to the appropriate + * methods on the integration. Keeps itself DRY for methods that are common between source and + * destination. + */ +public class IntegrationRunner { + + private static final Logger LOGGER = LoggerFactory.getLogger(IntegrationRunner.class); + + public static final int INTERRUPT_THREAD_DELAY_MINUTES = 60; + public static final int EXIT_THREAD_DELAY_MINUTES = 70; + + public static final int FORCED_EXIT_CODE = 2; + + private final IntegrationCliParser cliParser; + private final Consumer outputRecordCollector; + private final Integration integration; + private final Destination destination; + private final Source source; + private static JsonSchemaValidator validator; + + public IntegrationRunner(final Destination destination) { + this(new IntegrationCliParser(), Destination::defaultOutputRecordCollector, destination, null); + } + + public IntegrationRunner(final Source source) { + this(new IntegrationCliParser(), Destination::defaultOutputRecordCollector, null, source); + } + + @VisibleForTesting + IntegrationRunner(final IntegrationCliParser cliParser, + final Consumer outputRecordCollector, + final Destination destination, + final Source source) { + Preconditions.checkState(destination != null ^ source != null, "can only pass in a destination or a source"); + this.cliParser = cliParser; + this.outputRecordCollector = outputRecordCollector; + // integration iface covers the commands that are the same for both source and destination. + this.integration = source != null ? 
source : destination; + this.source = source; + this.destination = destination; + validator = new JsonSchemaValidator(); + + Thread.setDefaultUncaughtExceptionHandler(new AirbyteExceptionHandler()); + } + + @VisibleForTesting + IntegrationRunner(final IntegrationCliParser cliParser, + final Consumer outputRecordCollector, + final Destination destination, + final Source source, + final JsonSchemaValidator jsonSchemaValidator) { + this(cliParser, outputRecordCollector, destination, source); + validator = jsonSchemaValidator; + } + + @Trace(operationName = "RUN_OPERATION") + public void run(final String[] args) throws Exception { + final IntegrationConfig parsed = cliParser.parse(args); + try { + runInternal(parsed); + } catch (final Exception e) { + throw e; + } + } + + private void runInternal(final IntegrationConfig parsed) throws Exception { + LOGGER.info("Running integration: {}", integration.getClass().getName()); + LOGGER.info("Command: {}", parsed.getCommand()); + LOGGER.info("Integration config: {}", parsed); + + try { + switch (parsed.getCommand()) { + // common + case SPEC -> outputRecordCollector.accept(new AirbyteMessage().withType(Type.SPEC).withSpec(integration.spec())); + case CHECK -> { + final JsonNode config = parseConfig(parsed.getConfigPath()); + try { + validateConfig(integration.spec().getConnectionSpecification(), config, "CHECK"); + } catch (final Exception e) { + // if validation fails don't throw an exception, return a failed connection check message + outputRecordCollector.accept(new AirbyteMessage().withType(Type.CONNECTION_STATUS).withConnectionStatus( + new AirbyteConnectionStatus().withStatus(AirbyteConnectionStatus.Status.FAILED).withMessage(e.getMessage()))); + } + + outputRecordCollector.accept(new AirbyteMessage().withType(Type.CONNECTION_STATUS).withConnectionStatus(integration.check(config))); + } + // source only + case DISCOVER -> { + final JsonNode config = parseConfig(parsed.getConfigPath()); + validateConfig(integration.spec().getConnectionSpecification(), config, "DISCOVER"); + outputRecordCollector.accept(new AirbyteMessage().withType(Type.CATALOG).withCatalog(source.discover(config))); + } + // todo (cgardens) - it is incongruous that that read and write return airbyte message (the + // envelope) while the other commands return what goes inside it. + case READ -> { + final JsonNode config = parseConfig(parsed.getConfigPath()); + validateConfig(integration.spec().getConnectionSpecification(), config, "READ"); + final ConfiguredAirbyteCatalog catalog = parseConfig(parsed.getCatalogPath(), ConfiguredAirbyteCatalog.class); + final Optional stateOptional = parsed.getStatePath().map(IntegrationRunner::parseConfig); + try (final AutoCloseableIterator messageIterator = source.read(config, catalog, stateOptional.orElse(null))) { + produceMessages(messageIterator); + } + } + // destination only + case WRITE -> { + final JsonNode config = parseConfig(parsed.getConfigPath()); + validateConfig(integration.spec().getConnectionSpecification(), config, "WRITE"); + final ConfiguredAirbyteCatalog catalog = parseConfig(parsed.getCatalogPath(), ConfiguredAirbyteCatalog.class); + try (final AirbyteMessageConsumer consumer = destination.getConsumer(config, catalog, outputRecordCollector)) { + runConsumer(consumer); + } + } + default -> throw new IllegalStateException("Unexpected value: " + parsed.getCommand()); + } + } catch (final Exception e) { + // Many of the exceptions thrown are nested inside layers of RuntimeExceptions. 
An attempt is made + // to + // find the root exception that corresponds to a configuration error. If that does not exist, we + // just return the original exception. + ApmTraceUtils.addExceptionToTrace(e); + final Throwable rootThrowable = ConnectorExceptionUtil.getRootConfigError(e); + final String displayMessage = ConnectorExceptionUtil.getDisplayMessage(rootThrowable); + // If the source connector throws a config error, a trace message with the relevant message should + // be surfaced. + if (ConnectorExceptionUtil.isConfigError(rootThrowable)) { + AirbyteTraceMessageUtility.emitConfigErrorTrace(e, displayMessage); + } + if (parsed.getCommand().equals(Command.CHECK)) { + // Currently, special handling is required for the CHECK case since the user display information in + // the trace message is + // not properly surfaced to the FE. In the future, we can remove this and just throw an exception. + outputRecordCollector + .accept( + new AirbyteMessage() + .withType(Type.CONNECTION_STATUS) + .withConnectionStatus( + new AirbyteConnectionStatus() + .withStatus(AirbyteConnectionStatus.Status.FAILED) + .withMessage(displayMessage))); + return; + } + throw e; + } + + LOGGER.info("Completed integration: {}", integration.getClass().getName()); + } + + private void produceMessages(final AutoCloseableIterator messageIterator) throws Exception { + watchForOrphanThreads( + () -> messageIterator.forEachRemaining(outputRecordCollector), + () -> System.exit(FORCED_EXIT_CODE), + INTERRUPT_THREAD_DELAY_MINUTES, + TimeUnit.MINUTES, + EXIT_THREAD_DELAY_MINUTES, + TimeUnit.MINUTES); + } + + @VisibleForTesting + static void consumeWriteStream(final AirbyteMessageConsumer consumer) throws Exception { + // use a Scanner that only processes new line characters to strictly abide with the + // https://jsonlines.org/ standard + final Scanner input = new Scanner(System.in, StandardCharsets.UTF_8).useDelimiter("[\r\n]+"); + consumer.start(); + while (input.hasNext()) { + consumeMessage(consumer, input.next()); + } + } + + private static void runConsumer(final AirbyteMessageConsumer consumer) throws Exception { + watchForOrphanThreads( + () -> consumeWriteStream(consumer), + () -> System.exit(FORCED_EXIT_CODE), + INTERRUPT_THREAD_DELAY_MINUTES, + TimeUnit.MINUTES, + EXIT_THREAD_DELAY_MINUTES, + TimeUnit.MINUTES); + } + + /** + * This method calls a runMethod and make sure that it won't produce orphan non-daemon active + * threads once it is done. Active non-daemon threads blocks JVM from exiting when the main thread + * is done, whereas daemon ones don't. + * + * If any active non-daemon threads would be left as orphans, this method will schedule some + * interrupt/exit hooks after giving it some time delay to close up properly. It is generally + * preferred to have a proper closing sequence from children threads instead of interrupting or + * force exiting the process, so this mechanism serve as a fallback while surfacing warnings in logs + * for maintainers to fix the code behavior instead. 
+ */ + @VisibleForTesting + static void watchForOrphanThreads(final Procedure runMethod, + final Runnable exitHook, + final int interruptTimeDelay, + final TimeUnit interruptTimeUnit, + final int exitTimeDelay, + final TimeUnit exitTimeUnit) + throws Exception { + final Thread currentThread = Thread.currentThread(); + try { + runMethod.call(); + } finally { + final List runningThreads = ThreadUtils.getAllThreads() + .stream() + // daemon threads don't block the JVM if the main `currentThread` exits, so they are not problematic + .filter(runningThread -> !runningThread.getName().equals(currentThread.getName()) && !runningThread.isDaemon()) + .collect(Collectors.toList()); + if (!runningThreads.isEmpty()) { + LOGGER.warn(""" + The main thread is exiting while children non-daemon threads from a connector are still active. + Ideally, this situation should not happen... + Please check with maintainers if the connector or library code should safely clean up its threads before quitting instead. + The main thread is: {}""", dumpThread(currentThread)); + final ScheduledExecutorService scheduledExecutorService = Executors + .newSingleThreadScheduledExecutor(new BasicThreadFactory.Builder() + // this thread executor will create daemon threads, so it does not block exiting if all other active + // threads are already stopped. + .daemon(true).build()); + for (final Thread runningThread : runningThreads) { + final String str = "Active non-daemon thread: " + dumpThread(runningThread); + LOGGER.warn(str); + // even though the main thread is already shutting down, we still leave some chances to the children + // threads to close properly on their own. + // So, we schedule an interrupt hook after a fixed time delay instead... + scheduledExecutorService.schedule(runningThread::interrupt, interruptTimeDelay, interruptTimeUnit); + } + scheduledExecutorService.schedule(() -> { + if (ThreadUtils.getAllThreads().stream() + .anyMatch(runningThread -> !runningThread.isDaemon() && !runningThread.getName().equals(currentThread.getName()))) { + LOGGER.error("Failed to interrupt children non-daemon threads, forcefully exiting NOW...\n"); + exitHook.run(); + } + }, exitTimeDelay, exitTimeUnit); + } + } + } + + /** + * Consumes an {@link AirbyteMessage} for processing. + * + * If the provided JSON string is invalid AND represents a {@link AirbyteMessage.Type#STATE} + * message, processing is halted. Otherwise, the invalid message is logged and execution continues. + * + * @param consumer An {@link AirbyteMessageConsumer} that can handle the provided message. + * @param inputString JSON representation of an {@link AirbyteMessage}. + * @throws Exception if an invalid state message is provided or the consumer is unable to accept the + * provided message. 
+ */ + @VisibleForTesting + static void consumeMessage(final AirbyteMessageConsumer consumer, final String inputString) throws Exception { + + final Optional messageOptional = Jsons.tryDeserialize(inputString, AirbyteMessage.class); + if (messageOptional.isPresent()) { + consumer.accept(messageOptional.get()); + } else { + if (isStateMessage(inputString)) { + throw new IllegalStateException("Invalid state message: " + inputString); + } else { + LOGGER.error("Received invalid message: " + inputString); + } + } + } + + private static String dumpThread(final Thread thread) { + return String.format("%s (%s)\n Thread stacktrace: %s", thread.getName(), thread.getState(), + Strings.join(List.of(thread.getStackTrace()), "\n at ")); + } + + private static void validateConfig(final JsonNode schemaJson, final JsonNode objectJson, final String operationType) throws Exception { + final Set validationResult = validator.validate(schemaJson, objectJson); + if (!validationResult.isEmpty()) { + throw new Exception(String.format("Verification error(s) occurred for %s. Errors: %s ", + operationType, validationResult)); + } + } + + private static JsonNode parseConfig(final Path path) { + return Jsons.deserialize(IOs.readFile(path)); + } + + private static T parseConfig(final Path path, final Class klass) { + final JsonNode jsonNode = parseConfig(path); + return Jsons.object(jsonNode, klass); + } + + /** + * @param connectorImage Expected format: [organization/]image[:version] + */ + @VisibleForTesting + static String parseConnectorVersion(final String connectorImage) { + if (connectorImage == null || connectorImage.equals("")) { + return "unknown"; + } + + final String[] tokens = connectorImage.split(":"); + return tokens[tokens.length - 1]; + } + + /** + * Tests whether the provided JSON string represents a state message. + * + * @param input a JSON string that represents an {@link AirbyteMessage}. + * @return {@code true} if the message is a state message, {@code false} otherwise. + */ + private static boolean isStateMessage(final String input) { + final Optional deserialized = Jsons.tryDeserialize(input, AirbyteTypeMessage.class); + if (deserialized.isPresent()) { + return deserialized.get().getType() == Type.STATE; + } else { + return false; + } + } + + /** + * Custom class that can be used to parse a JSON message to determine the type of the represented + * {@link AirbyteMessage}. + */ + private static class AirbyteTypeMessage { + + @JsonProperty("type") + @JsonPropertyDescription("Message type") + private AirbyteMessage.Type type; + + @JsonProperty("type") + public AirbyteMessage.Type getType() { + return type; + } + + @JsonProperty("type") + public void setType(final AirbyteMessage.Type type) { + this.type = type; + } + + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/JavaBaseConstants.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/JavaBaseConstants.java new file mode 100644 index 0000000000000..4b3a4896dc4a2 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/JavaBaseConstants.java @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base; + +public final class JavaBaseConstants { + + private JavaBaseConstants() {} + + public static final String ARGS_CONFIG_KEY = "config"; + public static final String ARGS_CATALOG_KEY = "catalog"; + public static final String ARGS_STATE_KEY = "state"; + + public static final String ARGS_CONFIG_DESC = "path to the json configuration file"; + public static final String ARGS_CATALOG_DESC = "input path for the catalog"; + public static final String ARGS_PATH_DESC = "path to the json-encoded state file"; + + public static final String COLUMN_NAME_AB_ID = "_airbyte_ab_id"; + public static final String COLUMN_NAME_EMITTED_AT = "_airbyte_emitted_at"; + public static final String COLUMN_NAME_DATA = "_airbyte_data"; + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Source.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Source.java new file mode 100644 index 0000000000000..f391ed2d23471 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/Source.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.util.AutoCloseableIterator; +import io.airbyte.protocol.models.v0.AirbyteCatalog; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; + +public interface Source extends Integration { + + /** + * Discover the current schema in the source. + * + * @param config - integration-specific configuration object as json. e.g. { "username": "airbyte", + * "password": "super secure" } + * @return Description of the schema. + * @throws Exception - any exception. + */ + AirbyteCatalog discover(JsonNode config) throws Exception; + + /** + * Return a iterator of messages pulled from the source. + * + * @param config - integration-specific configuration object as json. e.g. { "username": "airbyte", + * "password": "super secure" } + * @param catalog - schema of the incoming messages. + * @param state - state of the incoming messages. + * @return {@link AutoCloseableIterator} that produces message. The iterator will be consumed until + * no records remain or until an exception is thrown. {@link AutoCloseableIterator#close()} + * will always be called once regardless of success or failure. + * @throws Exception - any exception. + */ + AutoCloseableIterator read(JsonNode config, ConfiguredAirbyteCatalog catalog, JsonNode state) throws Exception; + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/adaptive/AdaptiveDestinationRunner.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/adaptive/AdaptiveDestinationRunner.java new file mode 100644 index 0000000000000..a6b512537e281 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/adaptive/AdaptiveDestinationRunner.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base.adaptive; + +import io.airbyte.integrations.base.Destination; +import io.airbyte.integrations.base.IntegrationRunner; +import java.util.function.Supplier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class launches different variants of a destination connector based on where Airbyte is + * deployed. + */ +public class AdaptiveDestinationRunner { + + private static final Logger LOGGER = LoggerFactory.getLogger(AdaptiveDestinationRunner.class); + + private static final String DEPLOYMENT_MODE_KEY = "DEPLOYMENT_MODE"; + private static final String CLOUD_MODE = "CLOUD"; + + public static OssDestinationBuilder baseOnEnv() { + final String mode = System.getenv(DEPLOYMENT_MODE_KEY); + return new OssDestinationBuilder(mode); + } + + public static final class OssDestinationBuilder { + + private final String deploymentMode; + + private OssDestinationBuilder(final String deploymentMode) { + this.deploymentMode = deploymentMode; + } + + public CloudDestinationBuilder withOssDestination(final Supplier ossDestinationSupplier) { + return new CloudDestinationBuilder<>(deploymentMode, ossDestinationSupplier); + } + + } + + public static final class CloudDestinationBuilder { + + private final String deploymentMode; + private final Supplier ossDestinationSupplier; + + public CloudDestinationBuilder(final String deploymentMode, final Supplier ossDestinationSupplier) { + this.deploymentMode = deploymentMode; + this.ossDestinationSupplier = ossDestinationSupplier; + } + + public Runner withCloudDestination(final Supplier cloudDestinationSupplier) { + return new Runner<>(deploymentMode, ossDestinationSupplier, cloudDestinationSupplier); + } + + } + + public static final class Runner { + + private final String deploymentMode; + private final Supplier ossDestinationSupplier; + private final Supplier cloudDestinationSupplier; + + public Runner(final String deploymentMode, + final Supplier ossDestinationSupplier, + final Supplier cloudDestinationSupplier) { + this.deploymentMode = deploymentMode; + this.ossDestinationSupplier = ossDestinationSupplier; + this.cloudDestinationSupplier = cloudDestinationSupplier; + } + + private Destination getDestination() { + LOGGER.info("Running destination under deployment mode: {}", deploymentMode); + if (deploymentMode != null && deploymentMode.equals(CLOUD_MODE)) { + return cloudDestinationSupplier.get(); + } + if (deploymentMode == null) { + LOGGER.warn("Deployment mode is null, default to OSS mode"); + } + return ossDestinationSupplier.get(); + } + + public void run(final String[] args) throws Exception { + final Destination destination = getDestination(); + LOGGER.info("Starting destination: {}", destination.getClass().getName()); + new IntegrationRunner(destination).run(args); + LOGGER.info("Completed destination: {}", destination.getClass().getName()); + } + + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/adaptive/AdaptiveSourceRunner.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/adaptive/AdaptiveSourceRunner.java new file mode 100644 index 0000000000000..82f8026791baf --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/adaptive/AdaptiveSourceRunner.java @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base.adaptive; + +import io.airbyte.integrations.base.IntegrationRunner; +import io.airbyte.integrations.base.Source; +import java.util.function.Supplier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class launches different variants of a source connector based on where Airbyte is deployed. + */ +public class AdaptiveSourceRunner { + + private static final Logger LOGGER = LoggerFactory.getLogger(AdaptiveSourceRunner.class); + + private static final String DEPLOYMENT_MODE_KEY = "DEPLOYMENT_MODE"; + private static final String COULD_MODE = "CLOUD"; + + public static OssSourceBuilder baseOnEnv() { + final String mode = System.getenv(DEPLOYMENT_MODE_KEY); + return new OssSourceBuilder(mode); + } + + public static final class OssSourceBuilder { + + private final String deploymentMode; + + private OssSourceBuilder(final String deploymentMode) { + this.deploymentMode = deploymentMode; + } + + public CloudSourceBuilder withOssSource(final Supplier ossSourceSupplier) { + return new CloudSourceBuilder<>(deploymentMode, ossSourceSupplier); + } + + } + + public static final class CloudSourceBuilder { + + private final String deploymentMode; + private final Supplier ossSourceSupplier; + + public CloudSourceBuilder(final String deploymentMode, final Supplier ossSourceSupplier) { + this.deploymentMode = deploymentMode; + this.ossSourceSupplier = ossSourceSupplier; + } + + public Runner withCloudSource(final Supplier cloudSourceSupplier) { + return new Runner<>(deploymentMode, ossSourceSupplier, cloudSourceSupplier); + } + + } + + public static final class Runner { + + private final String deploymentMode; + private final Supplier ossSourceSupplier; + private final Supplier cloudSourceSupplier; + + public Runner(final String deploymentMode, + final Supplier ossSourceSupplier, + final Supplier cloudSourceSupplier) { + this.deploymentMode = deploymentMode; + this.ossSourceSupplier = ossSourceSupplier; + this.cloudSourceSupplier = cloudSourceSupplier; + } + + private Source getSource() { + LOGGER.info("Running source under deployment mode: {}", deploymentMode); + if (deploymentMode != null && deploymentMode.equals(COULD_MODE)) { + return cloudSourceSupplier.get(); + } + if (deploymentMode == null) { + LOGGER.warn("Deployment mode is null, default to OSS mode"); + } + return ossSourceSupplier.get(); + } + + public void run(final String[] args) throws Exception { + final Source source = getSource(); + LOGGER.info("Starting source: {}", source.getClass().getName()); + new IntegrationRunner(source).run(args); + LOGGER.info("Completed source: {}", source.getClass().getName()); + } + + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/errors/messages/ErrorMessage.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/errors/messages/ErrorMessage.java new file mode 100644 index 0000000000000..82c643035d252 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/errors/messages/ErrorMessage.java @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base.errors.messages; + +import java.util.Objects; + +public class ErrorMessage { + + // TODO: this could be built using a Builder design pattern instead of passing in 0 to indicate no + // errorCode exists + public static String getErrorMessage(final String stateCode, final int errorCode, final String message, final Exception exception) { + if (Objects.isNull(message)) { + return configMessage(stateCode, 0, exception.getMessage()); + } else { + return configMessage(stateCode, errorCode, message); + } + } + + private static String configMessage(final String stateCode, final int errorCode, final String message) { + final String stateCodePart = Objects.isNull(stateCode) ? "" : String.format("State code: %s; ", stateCode); + final String errorCodePart = errorCode == 0 ? "" : String.format("Error code: %s; ", errorCode); + return String.format("%s%sMessage: %s", stateCodePart, errorCodePart, message); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/spec_modification/SpecModifyingDestination.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/spec_modification/SpecModifyingDestination.java new file mode 100644 index 0000000000000..b06f5dab188d6 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/spec_modification/SpecModifyingDestination.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base.spec_modification; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.integrations.base.AirbyteMessageConsumer; +import io.airbyte.integrations.base.Destination; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.v0.ConnectorSpecification; +import java.util.function.Consumer; + +public abstract class SpecModifyingDestination implements Destination { + + private final Destination destination; + + public SpecModifyingDestination(final Destination destination) { + this.destination = destination; + } + + public abstract ConnectorSpecification modifySpec(ConnectorSpecification originalSpec) throws Exception; + + @Override + public ConnectorSpecification spec() throws Exception { + return modifySpec(destination.spec()); + } + + @Override + public AirbyteConnectionStatus check(final JsonNode config) throws Exception { + return destination.check(config); + } + + @Override + public AirbyteMessageConsumer getConsumer(final JsonNode config, + final ConfiguredAirbyteCatalog catalog, + final Consumer outputRecordCollector) + throws Exception { + return destination.getConsumer(config, catalog, outputRecordCollector); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/spec_modification/SpecModifyingSource.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/spec_modification/SpecModifyingSource.java new file mode 100644 index 0000000000000..f7cfef4df5af9 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/spec_modification/SpecModifyingSource.java @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base.spec_modification; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.util.AutoCloseableIterator; +import io.airbyte.integrations.base.Source; +import io.airbyte.protocol.models.v0.AirbyteCatalog; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.v0.ConnectorSpecification; + +/** + * In some cases we want to prune or mutate the spec for an existing source. The common case is that + * we want to remove features that are not appropriate for some reason. e.g. In cloud, we do not + * want to allow users to send data unencrypted. + */ +public abstract class SpecModifyingSource implements Source { + + private final Source source; + + public SpecModifyingSource(final Source source) { + this.source = source; + } + + public abstract ConnectorSpecification modifySpec(ConnectorSpecification originalSpec) throws Exception; + + @Override + public ConnectorSpecification spec() throws Exception { + return modifySpec(source.spec()); + } + + @Override + public AirbyteConnectionStatus check(final JsonNode config) throws Exception { + return source.check(config); + } + + @Override + public AirbyteCatalog discover(final JsonNode config) throws Exception { + return source.discover(config); + } + + @Override + public AutoCloseableIterator read(final JsonNode config, final ConfiguredAirbyteCatalog catalog, final JsonNode state) + throws Exception { + return source.read(config, catalog, state); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshBastionContainer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshBastionContainer.java new file mode 100644 index 0000000000000..4a32e6e43eff7 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshBastionContainer.java @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base.ssh; + +import static io.airbyte.integrations.base.ssh.SshHelpers.getInnerContainerAddress; +import static io.airbyte.integrations.base.ssh.SshHelpers.getOuterContainerAddress; +import static io.airbyte.integrations.base.ssh.SshTunnel.TunnelMethod.SSH_KEY_AUTH; +import static io.airbyte.integrations.base.ssh.SshTunnel.TunnelMethod.SSH_PASSWORD_AUTH; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.ImmutableMap; +import io.airbyte.commons.json.Jsons; +import java.io.IOException; +import java.util.List; +import java.util.Objects; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.JdbcDatabaseContainer; +import org.testcontainers.containers.Network; +import org.testcontainers.images.builder.ImageFromDockerfile; + +public class SshBastionContainer { + + private static final String SSH_USER = "sshuser"; + private static final String SSH_PASSWORD = "secret"; + private GenericContainer bastion; + + public void initAndStartBastion(final Network network) { + bastion = new GenericContainer( + new ImageFromDockerfile("bastion-test") + .withFileFromClasspath("Dockerfile", "bastion/Dockerfile")) + .withNetwork(network) + .withExposedPorts(22); + bastion.start(); + } + + public JsonNode getTunnelConfig(final SshTunnel.TunnelMethod tunnelMethod, + final ImmutableMap.Builder builderWithSchema, + final boolean innerAddress) + throws IOException, InterruptedException { + final var containerAddress = innerAddress ? getInnerContainerAddress(bastion) : getOuterContainerAddress(bastion); + return Jsons.jsonNode(builderWithSchema + .put("tunnel_method", Jsons.jsonNode(ImmutableMap.builder() + .put("tunnel_host", + Objects.requireNonNull(containerAddress.left)) + .put("tunnel_method", tunnelMethod) + .put("tunnel_port", containerAddress.right) + .put("tunnel_user", SSH_USER) + .put("tunnel_user_password", tunnelMethod.equals(SSH_PASSWORD_AUTH) ? SSH_PASSWORD : "") + .put("ssh_key", tunnelMethod.equals(SSH_KEY_AUTH) ? 
bastion.execInContainer("cat", "var/bastion/id_rsa").getStdout() : "") + .build())) + .build()); + } + + public ImmutableMap.Builder getBasicDbConfigBuider(final JdbcDatabaseContainer db) { + return getBasicDbConfigBuider(db, db.getDatabaseName()); + } + + public ImmutableMap.Builder getBasicDbConfigBuider(final JdbcDatabaseContainer db, final List schemas) { + return getBasicDbConfigBuider(db, db.getDatabaseName()).put("schemas", schemas); + } + + public ImmutableMap.Builder getBasicDbConfigBuider(final JdbcDatabaseContainer db, final String schemaName) { + return ImmutableMap.builder() + .put("host", Objects.requireNonNull(db.getContainerInfo().getNetworkSettings() + .getNetworks() + .entrySet().stream().findFirst().get().getValue().getIpAddress())) + .put("username", db.getUsername()) + .put("password", db.getPassword()) + .put("port", db.getExposedPorts().get(0)) + .put("database", schemaName) + .put("ssl", false); + } + + public void stopAndCloseContainers(final JdbcDatabaseContainer db) { + bastion.stop(); + bastion.close(); + db.stop(); + db.close(); + } + + public void stopAndClose() { + bastion.close(); + } + + public GenericContainer getContainer() { + return bastion; + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshHelpers.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshHelpers.java new file mode 100644 index 0000000000000..e1446c34c0a11 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshHelpers.java @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base.ssh; + +import com.fasterxml.jackson.databind.node.ObjectNode; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.resources.MoreResources; +import io.airbyte.protocol.models.v0.ConnectorSpecification; +import java.io.IOException; +import java.util.Optional; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.testcontainers.containers.Container; + +public class SshHelpers { + + public static ConnectorSpecification getSpecAndInjectSsh() throws IOException { + return getSpecAndInjectSsh(Optional.empty()); + } + + public static ConnectorSpecification getSpecAndInjectSsh(final Optional group) throws IOException { + final ConnectorSpecification originalSpec = Jsons.deserialize(MoreResources.readResource("spec.json"), ConnectorSpecification.class); + return injectSshIntoSpec(originalSpec, group); + } + + public static ConnectorSpecification injectSshIntoSpec(final ConnectorSpecification connectorSpecification) throws IOException { + return injectSshIntoSpec(connectorSpecification, Optional.empty()); + } + + public static ConnectorSpecification injectSshIntoSpec(final ConnectorSpecification connectorSpecification, final Optional group) + throws IOException { + final ConnectorSpecification originalSpec = Jsons.clone(connectorSpecification); + final ObjectNode propNode = (ObjectNode) originalSpec.getConnectionSpecification().get("properties"); + final ObjectNode tunnelMethod = (ObjectNode) Jsons.deserialize(MoreResources.readResource("ssh-tunnel-spec.json")); + if (group.isPresent()) { + tunnelMethod.put("group", group.get()); + } + propNode.set("tunnel_method", tunnelMethod); + return originalSpec; + } + + /** + * Returns the inner docker network ip address and port of a container. 
This can be used to reach a + * container from another container running on the same network + * + * @param container container + * @return a pair of host and port + */ + public static ImmutablePair getInnerContainerAddress(final Container container) { + return ImmutablePair.of( + container.getContainerInfo().getNetworkSettings().getNetworks().entrySet().stream().findFirst().get().getValue().getIpAddress(), + (Integer) container.getExposedPorts().stream().findFirst().get()); + } + + /** + * Returns the outer docker network ip address and port of a container. This can be used to reach a + * container from the host machine + * + * @param container container + * @return a pair of host and port + */ + public static ImmutablePair getOuterContainerAddress(final Container container) { + return ImmutablePair.of(container.getHost(), + container.getFirstMappedPort()); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshTunnel.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshTunnel.java new file mode 100644 index 0000000000000..77e4937df8c57 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshTunnel.java @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base.ssh; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.base.Preconditions; +import io.airbyte.commons.exceptions.ConfigErrorException; +import io.airbyte.commons.functional.CheckedConsumer; +import io.airbyte.commons.functional.CheckedFunction; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.string.Strings; +import io.airbyte.integrations.base.AirbyteTraceMessageUtility; +import java.io.IOException; +import java.io.StringReader; +import java.net.InetSocketAddress; +import java.net.MalformedURLException; +import java.net.URL; +import java.security.GeneralSecurityException; +import java.security.KeyPair; +import java.time.Duration; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import org.apache.sshd.client.SshClient; +import org.apache.sshd.client.keyverifier.AcceptAllServerKeyVerifier; +import org.apache.sshd.client.session.ClientSession; +import org.apache.sshd.common.SshException; +import org.apache.sshd.common.util.net.SshdSocketAddress; +import org.apache.sshd.common.util.security.SecurityUtils; +import org.apache.sshd.core.CoreModuleProperties; +import org.apache.sshd.server.forward.AcceptAllForwardingFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +// todo (cgardens) - this needs unit tests. it is currently tested transitively via source postgres +// integration tests. +/** + * Encapsulates the connection configuration for an ssh tunnel port forward through a proxy/bastion + * host plus the remote host and remote port to forward to a specified local port. + */ +public class SshTunnel implements AutoCloseable { + + private static final Logger LOGGER = LoggerFactory.getLogger(SshTunnel.class); + public static final String SSH_TIMEOUT_DISPLAY_MESSAGE = + "Timed out while opening a SSH Tunnel. 
Please double check the given SSH configurations and try again."; + + public enum TunnelMethod { + NO_TUNNEL, + SSH_PASSWORD_AUTH, + SSH_KEY_AUTH + } + + public static final int TIMEOUT_MILLIS = 15000; // 15 seconds + + private final JsonNode config; + private final List hostKey; + private final List portKey; + + private final TunnelMethod tunnelMethod; + private final String tunnelHost; + private final int tunnelPort; + private final String tunnelUser; + private final String sshKey; + private final String endPointKey; + private final String remoteServiceProtocol; + private final String remoteServicePath; + private final String tunnelUserPassword; + private final String remoteServiceHost; + private final int remoteServicePort; + protected int tunnelLocalPort; + + private SshClient sshclient; + private ClientSession tunnelSession; + + /** + * + * @param config - the full config that was passed to the source. + * @param hostKey - a list of keys that point to the database host name. should be pointing to where + * in the config remoteDatabaseHost is found. + * @param portKey - a list of keys that point to the database port. should be pointing to where in + * the config remoteDatabasePort is found. + * @param endPointKey - key that points to the endpoint URL (this is commonly used for REST-based + * services such as Elastic and MongoDB) + * @param remoteServiceUrl - URL of the remote endpoint (this is commonly used for REST-based * + * services such as Elastic and MongoDB) + * @param tunnelMethod - the type of ssh method that should be used (includes not using SSH at all). + * @param tunnelHost - host name of the machine to which we will establish an ssh connection (e.g. + * hostname of the bastion). + * @param tunnelPort - port of the machine to which we will establish an ssh connection. (e.g. port + * of the bastion). + * @param tunnelUser - user that is allowed to access the tunnelHost. + * @param sshKey - the ssh key that will be used to make the ssh connection. can be null if we are + * using tunnelUserPassword instead. + * @param tunnelUserPassword - the password for the tunnelUser. can be null if we are using sshKey + * instead. + * @param remoteServiceHost - the actual host name of the remote service (as it is known to the + * tunnel host). + * @param remoteServicePort - the actual port of the remote service (as it is known to the tunnel + * host). 
+ */ + public SshTunnel(final JsonNode config, + final List hostKey, + final List portKey, + final String endPointKey, + final String remoteServiceUrl, + final TunnelMethod tunnelMethod, + final String tunnelHost, + final int tunnelPort, + final String tunnelUser, + final String sshKey, + final String tunnelUserPassword, + final String remoteServiceHost, + final int remoteServicePort) { + this.config = config; + this.hostKey = hostKey; + this.portKey = portKey; + this.endPointKey = endPointKey; + Preconditions.checkNotNull(tunnelMethod); + this.tunnelMethod = tunnelMethod; + + if (tunnelMethod.equals(TunnelMethod.NO_TUNNEL)) { + this.tunnelHost = null; + this.tunnelPort = 0; + this.tunnelUser = null; + this.sshKey = null; + this.tunnelUserPassword = null; + this.remoteServiceHost = null; + this.remoteServicePort = 0; + this.remoteServiceProtocol = null; + this.remoteServicePath = null; + } else { + Preconditions.checkNotNull(tunnelHost); + Preconditions.checkArgument(tunnelPort > 0); + Preconditions.checkNotNull(tunnelUser); + if (tunnelMethod.equals(TunnelMethod.SSH_KEY_AUTH)) { + Preconditions.checkNotNull(sshKey); + } + if (tunnelMethod.equals(TunnelMethod.SSH_PASSWORD_AUTH)) { + Preconditions.checkNotNull(tunnelUserPassword); + } + // must provide either host/port or endpoint + Preconditions.checkArgument((hostKey != null && portKey != null) || endPointKey != null); + Preconditions.checkArgument((remoteServiceHost != null && remoteServicePort > 0) || remoteServiceUrl != null); + if (remoteServiceUrl != null) { + URL urlObject = null; + try { + urlObject = new URL(remoteServiceUrl); + } catch (final MalformedURLException e) { + AirbyteTraceMessageUtility.emitConfigErrorTrace(e, + String.format("Provided value for remote service URL is not valid: %s", remoteServiceUrl)); + } + Preconditions.checkNotNull(urlObject, "Failed to parse URL of remote service"); + this.remoteServiceHost = urlObject.getHost(); + this.remoteServicePort = urlObject.getPort(); + this.remoteServiceProtocol = urlObject.getProtocol(); + this.remoteServicePath = urlObject.getPath(); + } else { + this.remoteServiceProtocol = null; + this.remoteServicePath = null; + this.remoteServiceHost = remoteServiceHost; + this.remoteServicePort = remoteServicePort; + } + + this.tunnelHost = tunnelHost; + this.tunnelPort = tunnelPort; + this.tunnelUser = tunnelUser; + this.sshKey = sshKey; + this.tunnelUserPassword = tunnelUserPassword; + this.sshclient = createClient(); + this.tunnelSession = openTunnel(sshclient); + } + } + + public JsonNode getOriginalConfig() { + return config; + } + + public JsonNode getConfigInTunnel() throws Exception { + if (tunnelMethod.equals(TunnelMethod.NO_TUNNEL)) { + return getOriginalConfig(); + } else { + final JsonNode clone = Jsons.clone(config); + if (hostKey != null) { + Jsons.replaceNestedString(clone, hostKey, SshdSocketAddress.LOCALHOST_ADDRESS.getHostName()); + } + if (portKey != null) { + Jsons.replaceNestedInt(clone, portKey, tunnelLocalPort); + } + if (endPointKey != null) { + final URL tunnelEndPointURL = + new URL(remoteServiceProtocol, SshdSocketAddress.LOCALHOST_ADDRESS.getHostName(), tunnelLocalPort, remoteServicePath); + Jsons.replaceNestedString(clone, Arrays.asList(endPointKey), tunnelEndPointURL.toString()); + } + return clone; + } + } + + public static SshTunnel getInstance(final JsonNode config, final List hostKey, final List portKey) { + final TunnelMethod tunnelMethod = Jsons.getOptional(config, "tunnel_method", "tunnel_method") + .map(method -> 
TunnelMethod.valueOf(method.asText().trim())) + .orElse(TunnelMethod.NO_TUNNEL); + LOGGER.info("Starting connection with method: {}", tunnelMethod); + + return new SshTunnel( + config, + hostKey, + portKey, + null, + null, + tunnelMethod, + Strings.safeTrim(Jsons.getStringOrNull(config, "tunnel_method", "tunnel_host")), + Jsons.getIntOrZero(config, "tunnel_method", "tunnel_port"), + Strings.safeTrim(Jsons.getStringOrNull(config, "tunnel_method", "tunnel_user")), + Strings.safeTrim(Jsons.getStringOrNull(config, "tunnel_method", "ssh_key")), + Strings.safeTrim(Jsons.getStringOrNull(config, "tunnel_method", "tunnel_user_password")), + Strings.safeTrim(Jsons.getStringOrNull(config, hostKey)), + Jsons.getIntOrZero(config, portKey)); + } + + public static SshTunnel getInstance(final JsonNode config, final String endPointKey) throws Exception { + final TunnelMethod tunnelMethod = Jsons.getOptional(config, "tunnel_method", "tunnel_method") + .map(method -> TunnelMethod.valueOf(method.asText().trim())) + .orElse(TunnelMethod.NO_TUNNEL); + LOGGER.info("Starting connection with method: {}", tunnelMethod); + + return new SshTunnel( + config, + null, + null, + endPointKey, + Jsons.getStringOrNull(config, endPointKey), + tunnelMethod, + Strings.safeTrim(Jsons.getStringOrNull(config, "tunnel_method", "tunnel_host")), + Jsons.getIntOrZero(config, "tunnel_method", "tunnel_port"), + Strings.safeTrim(Jsons.getStringOrNull(config, "tunnel_method", "tunnel_user")), + Strings.safeTrim(Jsons.getStringOrNull(config, "tunnel_method", "ssh_key")), + Strings.safeTrim(Jsons.getStringOrNull(config, "tunnel_method", "tunnel_user_password")), + null, 0); + } + + public static void sshWrap(final JsonNode config, + final List hostKey, + final List portKey, + final CheckedConsumer wrapped) + throws Exception { + sshWrap(config, hostKey, portKey, (configInTunnel) -> { + wrapped.accept(configInTunnel); + return null; + }); + } + + public static void sshWrap(final JsonNode config, + final String endPointKey, + final CheckedConsumer wrapped) + throws Exception { + sshWrap(config, endPointKey, (configInTunnel) -> { + wrapped.accept(configInTunnel); + return null; + }); + } + + public static T sshWrap(final JsonNode config, + final List hostKey, + final List portKey, + final CheckedFunction wrapped) + throws Exception { + try (final SshTunnel sshTunnel = SshTunnel.getInstance(config, hostKey, portKey)) { + return wrapped.apply(sshTunnel.getConfigInTunnel()); + } + } + + public static T sshWrap(final JsonNode config, + final String endPointKey, + final CheckedFunction wrapped) + throws Exception { + try (final SshTunnel sshTunnel = SshTunnel.getInstance(config, endPointKey)) { + return wrapped.apply(sshTunnel.getConfigInTunnel()); + } + } + + /** + * Closes a tunnel if one was open, and otherwise doesn't do anything (safe to run). + */ + @Override + public void close() { + try { + if (tunnelSession != null) { + tunnelSession.close(); + tunnelSession = null; + } + if (sshclient != null) { + sshclient.stop(); + sshclient = null; + } + } catch (final Throwable t) { + throw new RuntimeException(t); + } + } + + /** + * From the OPENSSH private key string, use mina-sshd to deserialize the key pair, reconstruct the + * keys from the key info, and return the key pair for use in authentication. 
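+ * Before parsing, the configured key is passed through {@code validateKey()}, which replaces
+ * literal {@code \n} sequences with real newlines.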
+ * + * @return The {@link KeyPair} to add - may not be {@code null} + * @see loadKeyPairs() + */ + KeyPair getPrivateKeyPair() throws IOException, GeneralSecurityException { + final String validatedKey = validateKey(); + final var keyPairs = SecurityUtils + .getKeyPairResourceParser() + .loadKeyPairs(null, null, null, new StringReader(validatedKey)); + + if (keyPairs != null && keyPairs.iterator().hasNext()) { + return keyPairs.iterator().next(); + } + throw new ConfigErrorException("Unable to load private key pairs, verify key pairs are properly inputted"); + } + + private String validateKey() { + return sshKey.replace("\\n", "\n"); + } + + /** + * Generates a new ssh client and returns it, with forwarding set to accept all types; use this + * before opening a tunnel. + */ + private SshClient createClient() { + java.security.Security.addProvider(new org.bouncycastle.jce.provider.BouncyCastleProvider()); + final SshClient client = SshClient.setUpDefaultClient(); + client.setForwardingFilter(AcceptAllForwardingFilter.INSTANCE); + client.setServerKeyVerifier(AcceptAllServerKeyVerifier.INSTANCE); + CoreModuleProperties.IDLE_TIMEOUT.set(client, Duration.ZERO); + return client; + } + + /** + * Starts an ssh session; wrap this in a try-finally and use closeTunnel() to close it. + */ + ClientSession openTunnel(final SshClient client) { + try { + client.start(); + final ClientSession session = client.connect( + tunnelUser.trim(), + tunnelHost.trim(), + tunnelPort) + .verify(TIMEOUT_MILLIS) + .getSession(); + if (tunnelMethod.equals(TunnelMethod.SSH_KEY_AUTH)) { + session.addPublicKeyIdentity(getPrivateKeyPair()); + } + if (tunnelMethod.equals(TunnelMethod.SSH_PASSWORD_AUTH)) { + session.addPasswordIdentity(tunnelUserPassword); + } + + session.auth().verify(TIMEOUT_MILLIS); + final SshdSocketAddress address = session.startLocalPortForwarding( + // entering 0 lets the OS pick a free port for us. + new SshdSocketAddress(InetSocketAddress.createUnresolved(SshdSocketAddress.LOCALHOST_ADDRESS.getHostName(), 0)), + new SshdSocketAddress(remoteServiceHost, remoteServicePort)); + + // discover the port that the OS picked and remember it so that we can use it when we try to connect + tunnelLocalPort = address.getPort(); + + LOGGER.info(String.format("Established tunneling session to %s:%d. 
Port forwarding started on %s ", + remoteServiceHost, remoteServicePort, address.toInetSocketAddress())); + return session; + } catch (final IOException | GeneralSecurityException e) { + if (e instanceof SshException && e.getMessage() + .toLowerCase(Locale.ROOT) + .contains("failed to get operation result within specified timeout")) { + throw new ConfigErrorException(SSH_TIMEOUT_DISPLAY_MESSAGE, e); + } else { + throw new RuntimeException(e); + } + } + } + + @Override + public String toString() { + return "SshTunnel{" + + "hostKey=" + hostKey + + ", portKey=" + portKey + + ", tunnelMethod=" + tunnelMethod + + ", tunnelHost='" + tunnelHost + '\'' + + ", tunnelPort=" + tunnelPort + + ", tunnelUser='" + tunnelUser + '\'' + + ", remoteServiceHost='" + remoteServiceHost + '\'' + + ", remoteServicePort=" + remoteServicePort + + ", tunnelLocalPort=" + tunnelLocalPort + + '}'; + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshWrappedDestination.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshWrappedDestination.java new file mode 100644 index 0000000000000..954bd58d4c8f3 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshWrappedDestination.java @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base.ssh; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.resources.MoreResources; +import io.airbyte.integrations.base.AirbyteMessageConsumer; +import io.airbyte.integrations.base.AirbyteTraceMessageUtility; +import io.airbyte.integrations.base.Destination; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus.Status; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.v0.ConnectorSpecification; +import java.util.List; +import java.util.function.Consumer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Decorates a Destination with an SSH Tunnel using the standard configuration that Airbyte uses for + * configuring SSH. + */ +public class SshWrappedDestination implements Destination { + + private static final Logger LOGGER = LoggerFactory.getLogger(SshWrappedDestination.class); + + private final Destination delegate; + private final List hostKey; + private final List portKey; + private final String endPointKey; + + public SshWrappedDestination(final Destination delegate, + final List hostKey, + final List portKey) { + this.delegate = delegate; + this.hostKey = hostKey; + this.portKey = portKey; + this.endPointKey = null; + } + + public SshWrappedDestination(final Destination delegate, + final String endPointKey) { + this.delegate = delegate; + this.endPointKey = endPointKey; + this.portKey = null; + this.hostKey = null; + } + + @Override + public ConnectorSpecification spec() throws Exception { + // inject the standard ssh configuration into the spec. 
+ final ConnectorSpecification originalSpec = delegate.spec(); + final ObjectNode propNode = (ObjectNode) originalSpec.getConnectionSpecification().get("properties"); + propNode.set("tunnel_method", Jsons.deserialize(MoreResources.readResource("ssh-tunnel-spec.json"))); + return originalSpec; + } + + @Override + public AirbyteConnectionStatus check(final JsonNode config) throws Exception { + try { + return (endPointKey != null) ? SshTunnel.sshWrap(config, endPointKey, delegate::check) + : SshTunnel.sshWrap(config, hostKey, portKey, delegate::check); + } catch (final RuntimeException e) { + final String sshErrorMessage = "Could not connect with provided SSH configuration. Error: " + e.getMessage(); + AirbyteTraceMessageUtility.emitConfigErrorTrace(e, sshErrorMessage); + return new AirbyteConnectionStatus() + .withStatus(Status.FAILED) + .withMessage(sshErrorMessage); + } + } + + @Override + public AirbyteMessageConsumer getConsumer(final JsonNode config, + final ConfiguredAirbyteCatalog catalog, + final Consumer outputRecordCollector) + throws Exception { + final SshTunnel tunnel = (endPointKey != null) ? SshTunnel.getInstance(config, endPointKey) : SshTunnel.getInstance(config, hostKey, portKey); + + final AirbyteMessageConsumer delegateConsumer; + try { + delegateConsumer = delegate.getConsumer(tunnel.getConfigInTunnel(), catalog, outputRecordCollector); + } catch (final Exception e) { + LOGGER.error("Exception occurred while getting the delegate consumer, closing SSH tunnel", e); + tunnel.close(); + throw e; + } + return AirbyteMessageConsumer.appendOnClose(delegateConsumer, tunnel::close); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshWrappedSource.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshWrappedSource.java new file mode 100644 index 0000000000000..bb3b7de21fe2e --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/SshWrappedSource.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base.ssh; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.util.AutoCloseableIterator; +import io.airbyte.commons.util.AutoCloseableIterators; +import io.airbyte.integrations.base.AirbyteTraceMessageUtility; +import io.airbyte.integrations.base.Source; +import io.airbyte.protocol.models.v0.AirbyteCatalog; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus.Status; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.v0.ConnectorSpecification; +import java.util.List; +import java.util.Optional; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class SshWrappedSource implements Source { + + private static final Logger LOGGER = LoggerFactory.getLogger(SshWrappedSource.class); + private final Source delegate; + private final List hostKey; + private final List portKey; + private final Optional sshGroup; + + public SshWrappedSource(final Source delegate, final List hostKey, final List portKey) { + this.delegate = delegate; + this.hostKey = hostKey; + this.portKey = portKey; + this.sshGroup = Optional.empty(); + } + + public SshWrappedSource(final Source delegate, final List hostKey, final List portKey, final String sshGroup) { + this.delegate = delegate; + this.hostKey = hostKey; + this.portKey = portKey; + this.sshGroup = Optional.of(sshGroup); + } + + @Override + public ConnectorSpecification spec() throws Exception { + return SshHelpers.injectSshIntoSpec(delegate.spec(), sshGroup); + } + + @Override + public AirbyteConnectionStatus check(final JsonNode config) throws Exception { + try { + return SshTunnel.sshWrap(config, hostKey, portKey, delegate::check); + } catch (final RuntimeException e) { + final String sshErrorMessage = "Could not connect with provided SSH configuration. Error: " + e.getMessage(); + AirbyteTraceMessageUtility.emitConfigErrorTrace(e, sshErrorMessage); + return new AirbyteConnectionStatus() + .withStatus(Status.FAILED) + .withMessage(sshErrorMessage); + } + } + + @Override + public AirbyteCatalog discover(final JsonNode config) throws Exception { + return SshTunnel.sshWrap(config, hostKey, portKey, delegate::discover); + } + + @Override + public AutoCloseableIterator read(final JsonNode config, final ConfiguredAirbyteCatalog catalog, final JsonNode state) + throws Exception { + final SshTunnel tunnel = SshTunnel.getInstance(config, hostKey, portKey); + final AutoCloseableIterator delegateRead; + try { + delegateRead = delegate.read(tunnel.getConfigInTunnel(), catalog, state); + } catch (final Exception e) { + LOGGER.error("Exception occurred while getting the delegate read iterator, closing SSH tunnel", e); + tunnel.close(); + throw e; + } + return AutoCloseableIterators.appendOnClose(delegateRead, tunnel::close); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/readme.md b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/readme.md new file mode 100644 index 0000000000000..ea627045bd3d5 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/base/ssh/readme.md @@ -0,0 +1,63 @@ +# Developing an SSH Connector + +## Goal +Easy development of any connector that needs the ability to connect to a resource via SSH Tunnel. 
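+
+For orientation, a wrapped source typically ends up looking like the minimal sketch below. This is not taken from a real connector: `MyJdbcSource` is a hypothetical stand-in for whatever existing `Source` you are wrapping, and it assumes the host and port live in top-level `host`/`port` fields of its spec. The pieces it uses are explained in the sections that follow.
+
+```java
+import io.airbyte.integrations.base.IntegrationRunner;
+import io.airbyte.integrations.base.Source;
+import io.airbyte.integrations.base.ssh.SshWrappedSource;
+import java.util.List;
+
+public class MyJdbcSourceRunner {
+
+  public static void main(final String[] args) throws Exception {
+    // MyJdbcSource is a hypothetical stand-in for the connector's existing, SSH-unaware source.
+    // hostKey/portKey point at the spec fields that hold the database host and port, so the
+    // tunnel knows which values to swap out for localhost and the forwarded local port.
+    final Source source = new SshWrappedSource(new MyJdbcSource(), List.of("host"), List.of("port"));
+    new IntegrationRunner(source).run(args);
+  }
+
+}
+```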
+ +## Overview +Our SSH connector support is designed to be easy to plug into any existing connector. There are a few major pieces to consider: +1. Add SSH Configuration to the Spec - for SSH, we need to take in additional configuration, so we need to inject extra fields into the connector configuration. +2. Add SSH Logic to the Connector - before the connector code begins to execute we need to start an SSH tunnel. This library provides logic to create that tunnel (and clean it up). +3. Acceptance Testing - it is a good practice to include acceptance testing for the SSH version of a connector for at least one of the SSH types (password or ssh key). While unit testing for the SSH functionality exists in this package (coming soon), high-level acceptance testing to make sure this feature works with the individual connector belongs in the connector. +4. Normalization Support for Destinations - if the connector is a destination and supports normalization, there's a small change required in the normalization code to update the config so that dbt uses the right credentials for the SSH tunnel. + +## How To + +### Add SSH Configuration to the Spec +1. The `SshHelpers` class provides 2 helper functions that injects the SSH configuration objects into a spec JsonSchema for an existing connector. Usually the `spec()` method for a connector looks like `Jsons.deserialize(MoreResources.readResource("spec.json"), ConnectorSpecification.class);`. These helpers are just injecting the ssh spec (`ssh-tunnel-spec.json`) into that spec. +2. You may need to update tests to reflect that new fields have been added to the spec. Usually updating the tests just requires using these helpers in the tests. + +### Add SSH Logic to the Connector +1. This package provides a Source decorated class to make it easy to add SSH logic to an existing source. Simply pass the source you want to wrap into the constructor of the `SshWrappedSource`. That class also requires two other fields: `hostKey` and `portKey`. Both of these fields are pointers to fields in the connector specification. The `hostKey` is a pointer to the field that hold the host of the resource you want to connect and `portKey` is the port. In a simple case, where the host name for a connector is just defined in the top-level `host` field, then `hostKey` would simply be: `["host"]`. If that field is nested, however, then it might be: `["database", "configuration", "host"]`. + +### Acceptance Testing +1. The only difference between existing acceptance testing and acceptance testing with SSH is that the configuration that is used for testing needs to contain additional fields. You can see the `Postgres Source ssh key creds` in lastpass to see an example of what that might look like. Those credentials leverage an existing bastion host in our test infrastructure. (As future work, we want to get rid of the need to use a static bastion server and instead do it in docker so we can run it all locally.) + +### Normalization Support for Destinations +1. The core functionality for ssh tunnelling with normalization is already in place but you'll need to add a small tweak to `transform_config/transform.py` in the normalization module. Find the function `transform_{connector}()` and add at the start: + ``` + if TransformConfig.is_ssh_tunnelling(config): + config = TransformConfig.get_ssh_altered_config(config, port_key="port", host_key="host") + ``` + Replace port_key and host_key as necessary. Look at `transform_postgres()` to see an example. +2. 
To make sure your changes are present in Normalization when running tests on the connector locally, you'll need to change [this version tag](https://github.com/airbytehq/airbyte/blob/6d9ba022646441c7f298ca4dcaa3df59b9a19fbb/airbyte-workers/src/main/java/io/airbyte/workers/normalization/DefaultNormalizationRunner.java#L50) to `dev` so that the new locally built docker image for Normalization is used. Don't push this change with the PR though. +3. If your `host_key="host"` and `port_key="port"` then this step is not necessary. However if the key names differ for your connector, you will also need to add some logic into `sshtunneling.sh` (within airbyte-workers) to handle this, as currently it assumes that the keys are exactly `host` and `port`. +4. When making your PR, make sure that you've version bumped Normalization (in `airbyte-workers/src/main/java/io/airbyte/workers/normalization/DefaultNormalizationRunner.java` and `airbyte-integrations/bases/base-normalization/Dockerfile`). You'll need to /test & /publish Normalization _first_ so that when you /test the connector, it can use the new version. + +## Misc + +### How to wrap the protocol in an SSH Tunnel +For `spec()`, `check()`, and `discover()` wrapping the connector in an SSH tunnel is easier to think about because when they return all work is done and the tunnel can be closed. Thus, each of these methods can simply be wrapped in a try-with-resource of the SSH Tunnel. + +For `read()` and `write()` they return an iterator and consumer respectively that perform work that must happen within the SSH Tunnel after the method has returned. Therefore, the `close` function on the iterator and consumer have to handle closing the SSH tunnel; the methods themselves cannot just be wrapped in a try-with-resource. This is handled for you by the `SshWrappedSource`, but if you need to implement any of this manually you must take it into account. + +### Name Mangling +One of the least intuitive pieces of the SSH setup to follow is the replacement of host names and ports. The reason `SshWrappedSource` needs to know how to get the hostname and port of the database you are trying to connect to is that when it builds the SSH tunnel that forwards to the database, it needs to know the hostname and port so that the tunnel forwards requests to the right place. After the SSH tunnel is established and forwarding to the database, the connector code itself runs. + +There's a trick here though! The connector should NOT try to connect to the hostname and port of the database. Instead, it should be trying to connect to `localhost` and whatever port we are forwarding to the database. The `SshTunnel#sshWrap` removes the original host and port from the configuration for the connector and replaces it with `localhost` and the correct port. So from the connector code's point of view it is just operating on localhost. + +There is a tradeoff here. +* (Good) The way we have structured this allows users to configure a connector in the UI in a way that it is intuitive to user. They put in the host and port they think about referring to the database as (they don't need to worry about any of the localhost version). +* (Good) The connector code does not need to know anything about SSH, it can just operate on the host and port it gets (and we let SSH Tunnel handle swapping the names for us) which makes writing a connector easier. 
+* (Bad) The downside is that the `SshTunnel` logic is more complicated because it is absorbing all of this name swapping so that neither user nor connector developer need to worry about it. In our estimation, the good outweighs the extra complexity incurred here. + + +### Acceptance Testing via ssh tunnel using SshBastion and JdbcDatabaseContainer in Docker +1. The `SshBastion` class provides 3 helper functions: + `initAndStartBastion()`to initialize and start SSH Bastion server in Docker test container and creates new `Network` for bastion and tested jdbc container + `getTunnelConfig()`which return JsoneNode with all necessary configuration to establish ssh tunnel. Connection configuration for integration tests is now taken directly from container settings and does not require a real database connection + `stopAndCloseContainers` to stop and close SshBastion and JdbcDatabaseContainer at the end of the test + +## Future Work +* Add unit / integration testing for `ssh` package. +* Restructure spec so that instead of having `SSH Key Authentication` or `Password Authentication` options for `tunnel_method`, just have an `SSH` option and then within that `SSH` option have a `oneOf` for password or key. This is blocked because we cannot use `oneOf`s nested in `oneOf`s. +* Improve the process of acceptance testing by allowing doing acceptance testing using a bastion running in a docker container instead of having to use dedicated infrastructure and a static database. diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/NamingConventionTransformer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/NamingConventionTransformer.java new file mode 100644 index 0000000000000..89c5d7f64dfb5 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/NamingConventionTransformer.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination; + +/** + * Destination have their own Naming conventions (which characters are valid or rejected in + * identifiers names) This class transform a random string used to a valid identifier names for each + * specific destination. + */ +public interface NamingConventionTransformer { + + /** + * Handle Naming Conversions of an input name to output a valid identifier name for the desired + * destination. + * + * @param name of the identifier to check proper naming conventions + * @return modified name with invalid characters replaced by '_' and adapted for the chosen + * destination. + */ + String getIdentifier(String name); + + /** + * Handle naming conversions of an input name to output a valid namespace for the desired + * destination. + */ + String getNamespace(String namespace); + + /** + * Same as getIdentifier but returns also the name of the table for storing raw data + * + * @param name of the identifier to check proper naming conventions + * @return modified name with invalid characters replaced by '_' and adapted for the chosen + * destination. 
+ * + * @deprecated as this is very SQL specific, prefer using getIdentifier instead + */ + @Deprecated + String getRawTableName(String name); + + /** + * Same as getIdentifier but returns also the name of the table for storing tmp data + * + * @param name of the identifier to check proper naming conventions + * @return modified name with invalid characters replaced by '_' and adapted for the chosen + * destination. + * + * @deprecated as this is very SQL specific, prefer using getIdentifier instead + */ + @Deprecated + String getTmpTableName(String name); + + String convertStreamName(final String input); + + String applyDefaultCase(final String input); + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/StandardNameTransformer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/StandardNameTransformer.java new file mode 100644 index 0000000000000..a2f0b2d0bab64 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/StandardNameTransformer.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.string.Strings; +import io.airbyte.commons.text.Names; +import io.airbyte.commons.util.MoreIterators; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; + +public class StandardNameTransformer implements NamingConventionTransformer { + + private static final String NON_JSON_PATH_CHARACTERS_PATTERN = "['\"`]"; + + @Override + public String getIdentifier(final String name) { + return convertStreamName(name); + } + + /** + * Most destinations have the same naming requirement for namespace and stream names. + */ + @Override + public String getNamespace(final String namespace) { + return convertStreamName(namespace); + } + + @Override + public String getRawTableName(final String streamName) { + return convertStreamName("_airbyte_raw_" + streamName); + } + + @Override + public String getTmpTableName(final String streamName) { + return convertStreamName(Strings.addRandomSuffix("_airbyte_tmp", "_", 3) + "_" + streamName); + } + + @Override + public String convertStreamName(final String input) { + return Names.toAlphanumericAndUnderscore(input); + } + + @Override + public String applyDefaultCase(final String input) { + return input; + } + + /** + * Rebuild a JsonNode adding sanitized property names (a subset of special characters replaced by + * underscores) while keeping original property names too. This is needed by some destinations as + * their json extract functions have limitations on how such special characters are parsed. These + * naming rules may be different to schema/table/column naming conventions. 
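+ *
+ * For example, a property named {@code user's_name} is kept as-is and also duplicated under the
+ * sanitized key {@code user_s_name}, with both keys holding the same (recursively formatted) value.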
+ */ + public static JsonNode formatJsonPath(final JsonNode root) { + if (root.isObject()) { + final Map properties = new HashMap<>(); + final var keys = Jsons.keys(root); + for (final var key : keys) { + final JsonNode property = root.get(key); + // keep original key + properties.put(key, formatJsonPath(property)); + } + for (final var key : keys) { + final JsonNode property = root.get(key); + final String formattedKey = key.replaceAll(NON_JSON_PATH_CHARACTERS_PATTERN, "_"); + if (!properties.containsKey(formattedKey)) { + // duplicate property in a formatted key to be extracted in normalization + properties.put(formattedKey, formatJsonPath(property)); + } + } + return Jsons.jsonNode(properties); + } else if (root.isArray()) { + return Jsons.jsonNode(MoreIterators.toList(root.elements()).stream() + .map(StandardNameTransformer::formatJsonPath) + .collect(Collectors.toList())); + } else { + return root; + } + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/BufferedStreamConsumer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/BufferedStreamConsumer.java new file mode 100644 index 0000000000000..efdbd2a019cdf --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/BufferedStreamConsumer.java @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.buffered_stream_consumer; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import io.airbyte.commons.functional.CheckedFunction; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.base.AirbyteMessageConsumer; +import io.airbyte.integrations.base.FailureTrackingAirbyteMessageConsumer; +import io.airbyte.integrations.destination.dest_state_lifecycle_manager.DefaultDestStateLifecycleManager; +import io.airbyte.integrations.destination.dest_state_lifecycle_manager.DestStateLifecycleManager; +import io.airbyte.integrations.destination.record_buffer.BufferFlushType; +import io.airbyte.integrations.destination.record_buffer.BufferingStrategy; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import java.time.Duration; +import java.time.Instant; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Consumer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class consumes AirbyteMessages from the worker. + * + *
+ * Record Messages: It adds record messages to a buffer. Under 2 conditions, it will flush the
+ * records in the buffer to a temporary table in the destination. Condition 1: The buffer fills up
+ * (the buffer is designed to be small enough as not to exceed the memory of the container).
+ * Condition 2: On close.
+ *
+ * State Messages: This consumer tracks the last state message it has accepted. It also tracks the
+ * last state message that was committed to the temporary table. For now, we only emit a message if
+ * everything is successful. Once checkpointing is turned on, we will emit the state message as long
+ * as the onClose successfully commits any messages to the raw table.
+ *
+ * All other message types are ignored.
+ *
+ * Throughout the lifecycle of the consumer, messages get promoted from buffered to flushed to
+ * committed. A record message when it is received is immediately buffered. When the buffer fills
+ * up, all buffered records are flushed out of memory using the user-provided recordBuffer. When
+ * this flush happens, a state message is moved from pending to flushed. On close, if the
+ * user-provided onClose function is successful, then the flushed state record is considered
+ * committed and is then emitted. We expect this class to only ever emit either 1 state message (in
+ * the case of a full or partial success) or 0 state messages (in the case where the onClose step
+ * was never reached or did not complete without exception).
+ *
+ * When a record is "flushed" it is moved from the docker container to the destination. By
+ * convention, it is usually placed in some sort of temporary storage on the destination (e.g. a
+ * temporary database or file store). The logic in close handles committing the temporary
+ * representation data to the final store (e.g. final table). In the case of staging destinations
+ * they often have additional temporary stores. The common pattern for staging destination is that
+ * flush pushes the data into a staging area in cloud storage and then close copies from staging to
+ * a temporary table AND then copies from the temporary table into the final table. This abstraction
+ * is blind to the detail of how staging destinations implement their close.
+ */ +public class BufferedStreamConsumer extends FailureTrackingAirbyteMessageConsumer implements AirbyteMessageConsumer { + + private static final Logger LOGGER = LoggerFactory.getLogger(BufferedStreamConsumer.class); + + private final OnStartFunction onStart; + private final OnCloseFunction onClose; + private final Set streamNames; + private final ConfiguredAirbyteCatalog catalog; + private final CheckedFunction isValidRecord; + private final Map streamToIgnoredRecordCount; + private final Consumer outputRecordCollector; + private final BufferingStrategy bufferingStrategy; + private final DestStateLifecycleManager stateManager; + + private boolean hasStarted; + private boolean hasClosed; + + private Instant nextFlushDeadline; + private final Duration bufferFlushFrequency; + + public BufferedStreamConsumer(final Consumer outputRecordCollector, + final OnStartFunction onStart, + final BufferingStrategy bufferingStrategy, + final OnCloseFunction onClose, + final ConfiguredAirbyteCatalog catalog, + final CheckedFunction isValidRecord) { + this(outputRecordCollector, + onStart, + bufferingStrategy, + onClose, + catalog, + isValidRecord, + Duration.ofMinutes(15)); + } + + /* + * NOTE: this is only used for testing purposes, future work would be re-visit if #acceptTracked + * should take in an Instant parameter which would require refactoring all MessageConsumers + */ + @VisibleForTesting + BufferedStreamConsumer(final Consumer outputRecordCollector, + final OnStartFunction onStart, + final BufferingStrategy bufferingStrategy, + final OnCloseFunction onClose, + final ConfiguredAirbyteCatalog catalog, + final CheckedFunction isValidRecord, + final Duration flushFrequency) { + this.outputRecordCollector = outputRecordCollector; + this.hasStarted = false; + this.hasClosed = false; + this.onStart = onStart; + this.onClose = onClose; + this.catalog = catalog; + this.streamNames = AirbyteStreamNameNamespacePair.fromConfiguredCatalog(catalog); + this.isValidRecord = isValidRecord; + this.streamToIgnoredRecordCount = new HashMap<>(); + this.bufferingStrategy = bufferingStrategy; + this.stateManager = new DefaultDestStateLifecycleManager(); + this.bufferFlushFrequency = flushFrequency; + } + + @Override + protected void startTracked() throws Exception { + // todo (cgardens) - if we reuse this pattern, consider moving it into FailureTrackingConsumer. 
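+    // One-time initialization: guard against double starts, reset the per-stream ignored-record
+    // counts, set the first periodic-flush deadline, and run the caller-provided onStart hook.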
+ Preconditions.checkState(!hasStarted, "Consumer has already been started."); + hasStarted = true; + nextFlushDeadline = Instant.now().plus(bufferFlushFrequency); + streamToIgnoredRecordCount.clear(); + LOGGER.info("{} started.", BufferedStreamConsumer.class); + onStart.call(); + } + + /** + * AcceptTracked will still process AirbyteMessages as usual with the addition of periodically + * flushing buffer and writing data to destination storage + * + * @param message {@link AirbyteMessage} to be processed + * @throws Exception + */ + @Override + protected void acceptTracked(final AirbyteMessage message) throws Exception { + Preconditions.checkState(hasStarted, "Cannot accept records until consumer has started"); + if (message.getType() == Type.RECORD) { + final AirbyteRecordMessage record = message.getRecord(); + final AirbyteStreamNameNamespacePair stream = AirbyteStreamNameNamespacePair.fromRecordMessage(record); + + // if stream is not part of list of streams to sync to then throw invalid stream exception + if (!streamNames.contains(stream)) { + throwUnrecognizedStream(catalog, message); + } + + if (!isValidRecord.apply(record.getData())) { + streamToIgnoredRecordCount.put(stream, streamToIgnoredRecordCount.getOrDefault(stream, 0L) + 1L); + return; + } + + final Optional flushType = bufferingStrategy.addRecord(stream, message); + // if present means that a flush occurred + if (flushType.isPresent()) { + if (BufferFlushType.FLUSH_ALL.equals(flushType.get())) { + markStatesAsFlushedToDestination(); + } else if (BufferFlushType.FLUSH_SINGLE_STREAM.equals(flushType.get())) { + if (stateManager.supportsPerStreamFlush()) { + // per-stream instance can handle flush of just a single stream + markStatesAsFlushedToDestination(); + } + /* + * We don't mark {@link AirbyteStateMessage} as committed in the case with GLOBAL/LEGACY because + * within a single stream being flushed it is not deterministic that all the AirbyteRecordMessages + * have been committed + */ + } + } + } else if (message.getType() == Type.STATE) { + stateManager.addState(message); + } else { + LOGGER.warn("Unexpected message: " + message.getType()); + } + periodicBufferFlush(); + } + + /** + * After marking states as committed, return the state message to platform then clear state messages + * to avoid resending the same state message to the platform. Also updates the next time a buffer + * flush should occur since it is deterministic that when this method is called all data has been + * successfully committed to destination + */ + private void markStatesAsFlushedToDestination() { + stateManager.markPendingAsCommitted(); + stateManager.listCommitted().forEach(outputRecordCollector); + stateManager.clearCommitted(); + nextFlushDeadline = Instant.now().plus(bufferFlushFrequency); + } + + /** + * Periodically flushes buffered data to destination storage when exceeding flush deadline. 
Also + * resets the last time a flush occurred + */ + private void periodicBufferFlush() throws Exception { + // When the last time the buffered has been flushed exceed the frequency, flush the current + // buffer before receiving incoming AirbyteMessage + if (Instant.now().isAfter(nextFlushDeadline)) { + LOGGER.info("Periodic buffer flush started"); + try { + bufferingStrategy.flushAllBuffers(); + markStatesAsFlushedToDestination(); + } catch (final Exception e) { + LOGGER.error("Periodic buffer flush failed", e); + throw e; + } + } + } + + private static void throwUnrecognizedStream(final ConfiguredAirbyteCatalog catalog, final AirbyteMessage message) { + throw new IllegalArgumentException( + String.format("Message contained record from a stream that was not in the catalog. \ncatalog: %s , \nmessage: %s", + Jsons.serialize(catalog), Jsons.serialize(message))); + } + + /** + * Cleans up buffer based on whether the sync was successful or some exception occurred. In the case + * where a failure occurred we do a simple clean up any lingering data. Otherwise, flush any + * remaining data that has been stored. This is fine even if the state has not been received since + * this Airbyte promises at least once delivery + * + * @param hasFailed true if the stream replication failed partway through, false otherwise + * @throws Exception + */ + @Override + protected void close(final boolean hasFailed) throws Exception { + Preconditions.checkState(hasStarted, "Cannot close; has not started."); + Preconditions.checkState(!hasClosed, "Has already closed."); + hasClosed = true; + + streamToIgnoredRecordCount + .forEach((pair, count) -> LOGGER.warn("A total of {} record(s) of data from stream {} were invalid and were ignored.", count, pair)); + if (hasFailed) { + LOGGER.error("executing on failed close procedure."); + } else { + LOGGER.info("executing on success close procedure."); + // When flushing the buffer, this will call the respective #flushBufferFunction which bundles + // the flush and commit operation, so if successful then mark state as committed + bufferingStrategy.flushAllBuffers(); + markStatesAsFlushedToDestination(); + } + bufferingStrategy.close(); + + try { + /* + * TODO: (ryankfu) Remove usage of hasFailed with onClose after all destination connectors have been + * updated to support checkpointing + * + * flushed is empty in 2 cases: 1. either it is full refresh (no state is emitted necessarily) 2. it + * is stream but no states were flushed in both of these cases, if there was a failure, we should + * not bother committing. otherwise attempt to commit + */ + if (stateManager.listFlushed().isEmpty()) { + onClose.accept(hasFailed); + } else { + /* + * if any state message was flushed that means we should try to commit what we have. if + * hasFailed=false, then it could be full success. if hasFailed=true, then going for partial + * success. 
+ */ + onClose.accept(false); + } + + stateManager.listCommitted().forEach(outputRecordCollector); + } catch (final Exception e) { + LOGGER.error("Close failed.", e); + throw e; + } + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/CheckAndRemoveRecordWriter.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/CheckAndRemoveRecordWriter.java new file mode 100644 index 0000000000000..55ed3c1a9ca3d --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/CheckAndRemoveRecordWriter.java @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.buffered_stream_consumer; + +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; + +@FunctionalInterface +public interface CheckAndRemoveRecordWriter { + + /** + * Compares the name of the current staging file with the method argument. If the names are + * different, then the staging writer corresponding to `stagingFileName` is closed and the name of + * the new file where the record will be sent will be returned. + */ + String apply(AirbyteStreamNameNamespacePair stream, String stagingFileName) throws Exception; + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/OnCloseFunction.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/OnCloseFunction.java new file mode 100644 index 0000000000000..ca77fd9b12ba9 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/OnCloseFunction.java @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.buffered_stream_consumer; + +import io.airbyte.commons.functional.CheckedConsumer; + +public interface OnCloseFunction extends CheckedConsumer { + + @Override + void accept(Boolean hasFailed) throws Exception; + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/OnStartFunction.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/OnStartFunction.java new file mode 100644 index 0000000000000..ebef22ef9280a --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/OnStartFunction.java @@ -0,0 +1,11 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */
+
+package io.airbyte.integrations.destination.buffered_stream_consumer;
+
+import io.airbyte.commons.concurrency.VoidCallable;
+
+public interface OnStartFunction extends VoidCallable {
+
+}
diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/RecordSizeEstimator.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/RecordSizeEstimator.java
new file mode 100644
index 0000000000000..9c5949a645f24
--- /dev/null
+++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/RecordSizeEstimator.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.integrations.destination.buffered_stream_consumer;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.google.common.annotations.VisibleForTesting;
+import io.airbyte.commons.json.Jsons;
+import io.airbyte.protocol.models.v0.AirbyteRecordMessage;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * This class estimates the byte size of the record message. To reduce memory footprint, 1) it
+ * assumes that a character is always four bytes, and 2) it only performs a sampling every N
+ * records. The sizes of the samples are averaged together to protect the estimation against
+ * outliers.
+ */
+public class RecordSizeEstimator {
+
+  // by default, perform one estimation for every 20 records
+  private static final int DEFAULT_SAMPLE_BATCH_SIZE = 20;
+
+  // latest estimated record message size for each stream
+  private final Map<String, Long> streamRecordSizeEstimation;
+  // number of record messages until next real sampling for each stream
+  private final Map<String, Integer> streamSampleCountdown;
+  // number of record messages between two real samplings
+  private final int sampleBatchSize;
+
+  /**
+   * The estimator will perform a real calculation once per sample batch. The size of the batch is
+   * determined by {@code sampleBatchSize}.
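+   *
+   * For illustration only: with the default batch size of 20, the estimator serializes one record
+   * to measure its real byte size, reuses that estimate for the next 19 records, then re-samples
+   * and averages the new measurement with the previous estimate. A minimal usage sketch, where
+   * recordMessage is a hypothetical AirbyteRecordMessage:
+   *
+   *   final RecordSizeEstimator estimator = new RecordSizeEstimator(20);
+   *   final long approxBytes = estimator.getEstimatedByteSize(recordMessage);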
+ */ + public RecordSizeEstimator(final int sampleBatchSize) { + this.streamRecordSizeEstimation = new HashMap<>(); + this.streamSampleCountdown = new HashMap<>(); + this.sampleBatchSize = sampleBatchSize; + } + + public RecordSizeEstimator() { + this(DEFAULT_SAMPLE_BATCH_SIZE); + } + + public long getEstimatedByteSize(final AirbyteRecordMessage record) { + final String stream = record.getStream(); + final Integer countdown = streamSampleCountdown.get(stream); + + // this is a new stream; initialize its estimation + if (countdown == null) { + final long byteSize = getStringByteSize(record.getData()); + streamRecordSizeEstimation.put(stream, byteSize); + streamSampleCountdown.put(stream, sampleBatchSize - 1); + return byteSize; + } + + // this stream needs update; compute a new estimation + if (countdown <= 0) { + final long prevMeanByteSize = streamRecordSizeEstimation.get(stream); + final long currentByteSize = getStringByteSize(record.getData()); + final long newMeanByteSize = prevMeanByteSize / 2 + currentByteSize / 2; + streamRecordSizeEstimation.put(stream, newMeanByteSize); + streamSampleCountdown.put(stream, sampleBatchSize - 1); + return newMeanByteSize; + } + + // this stream does not need update; return current estimation + streamSampleCountdown.put(stream, countdown - 1); + return streamRecordSizeEstimation.get(stream); + } + + @VisibleForTesting + static long getStringByteSize(final JsonNode data) { + // assume UTF-8 encoding, and each char is 4 bytes long + return Jsons.serialize(data).length() * 4L; + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/RecordWriter.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/RecordWriter.java new file mode 100644 index 0000000000000..fb5641d6af6bc --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/RecordWriter.java @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.buffered_stream_consumer; + +import io.airbyte.commons.functional.CheckedBiConsumer; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import java.util.List; + +public interface RecordWriter extends CheckedBiConsumer, Exception> { + + @Override + void accept(AirbyteStreamNameNamespacePair stream, List records) throws Exception; + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/StreamDateFormatter.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/StreamDateFormatter.java new file mode 100644 index 0000000000000..c582bd05a9ea6 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/buffered_stream_consumer/StreamDateFormatter.java @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.buffered_stream_consumer; + +import io.airbyte.protocol.models.v0.AirbyteMessage; + +/** + * Allows specifying transformation logic from Airbyte Json to String. 
+ */ +public interface StreamDateFormatter { + + String getFormattedDate(AirbyteMessage message); + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DefaultDestStateLifecycleManager.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DefaultDestStateLifecycleManager.java new file mode 100644 index 0000000000000..f5a85f3227818 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DefaultDestStateLifecycleManager.java @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.dest_state_lifecycle_manager; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import io.airbyte.protocol.models.v0.AirbyteStateMessage.AirbyteStateType; +import java.util.Queue; +import java.util.function.Supplier; + +/** + * Detects the type of the state being received by anchoring on the first state type it sees. Fail + * if receives states of multiple types--each instance of this class can only support state messages + * of one type. The protocol specifies that a source should emit state messages of a single type + * during a sync, so a single instance of this manager is sufficient for a destination to track + * state during a sync. + * + *

+ * Strategy: Delegates state messages of each type to a StateManager that is appropriate to that + * state type. + *

+ * + *

+ * Per the protocol, if the state type is not set, it is assumed to be the LEGACY state type.
+ *
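+ * A sketch of the resulting behavior, for illustration only (the state message variables are
+ * hypothetical):
+ *
+ *   final DestStateLifecycleManager manager = new DefaultDestStateLifecycleManager();
+ *   manager.addState(streamStateMessage1);  // first STATE message anchors the manager to STREAM
+ *   manager.addState(streamStateMessage2);  // further STREAM states go to the stream-scoped manager
+ *   manager.addState(legacyStateMessage);   // throws, since mixing state types is not supported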

+ */ +public class DefaultDestStateLifecycleManager implements DestStateLifecycleManager { + + private AirbyteStateType stateType; + private final Supplier internalStateManagerSupplier; + + public DefaultDestStateLifecycleManager() { + this(new DestSingleStateLifecycleManager(), new DestStreamStateLifecycleManager()); + } + + @VisibleForTesting + DefaultDestStateLifecycleManager(final DestStateLifecycleManager singleStateManager, final DestStateLifecycleManager streamStateManager) { + stateType = null; + // allows us to delegate calls to the appropriate underlying state manager. + internalStateManagerSupplier = () -> { + if (stateType == AirbyteStateType.GLOBAL || stateType == AirbyteStateType.LEGACY || stateType == null) { + return singleStateManager; + } else if (stateType == AirbyteStateType.STREAM) { + return streamStateManager; + } else { + throw new IllegalArgumentException("unrecognized state type"); + } + }; + } + + @Override + public void addState(final AirbyteMessage message) { + Preconditions.checkArgument(message.getType() == Type.STATE, "Messages passed to State Manager must be of type STATE."); + Preconditions.checkArgument(isStateTypeCompatible(stateType, message.getState().getType())); + + setManagerStateTypeIfNotSet(message); + + internalStateManagerSupplier.get().addState(message); + } + + /** + * Given the type of previously recorded state by the state manager, determines if a newly added + * state message's type is compatible. Based on the previously set state type, determines if a new + * one is compatible. If the previous state is null, any new state is compatible. If new state type + * is null, it should be treated as LEGACY. Thus, previousStateType == LEGACY and newStateType == + * null IS compatible. All other state types are compatible based on equality. + * + * @param previousStateType - state type previously recorded by the state manager + * @param newStateType - state message of a newly added message + * @return true if compatible, otherwise false + */ + private static boolean isStateTypeCompatible(final AirbyteStateType previousStateType, final AirbyteStateType newStateType) { + return previousStateType == null || previousStateType == AirbyteStateType.LEGACY && newStateType == null || previousStateType == newStateType; + } + + /** + * If the state type for the manager is not set, sets it using the state type from the message. If + * the type on the message is null, we assume it is LEGACY. After the first, state message is added + * to the manager, the state type is set and is immutable. + * + * @param message - state message whose state will be used if internal state type is not set + */ + private void setManagerStateTypeIfNotSet(final AirbyteMessage message) { + // detect and set state type. 
+ if (stateType == null) { + if (message.getState().getType() == null) { + stateType = AirbyteStateType.LEGACY; + } else { + stateType = message.getState().getType(); + } + } + } + + @Override + public void markPendingAsFlushed() { + internalStateManagerSupplier.get().markPendingAsFlushed(); + } + + @Override + public Queue listFlushed() { + return internalStateManagerSupplier.get().listFlushed(); + } + + @Override + public void markFlushedAsCommitted() { + internalStateManagerSupplier.get().markFlushedAsCommitted(); + } + + @Override + public void markPendingAsCommitted() { + internalStateManagerSupplier.get().markPendingAsCommitted(); + } + + @Override + public void clearCommitted() { + internalStateManagerSupplier.get().clearCommitted(); + } + + @Override + public Queue listCommitted() { + return internalStateManagerSupplier.get().listCommitted(); + } + + @Override + public boolean supportsPerStreamFlush() { + return internalStateManagerSupplier.get().supportsPerStreamFlush(); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestSingleStateLifecycleManager.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestSingleStateLifecycleManager.java new file mode 100644 index 0000000000000..b5fbadef03cc4 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestSingleStateLifecycleManager.java @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.dest_state_lifecycle_manager; + +import com.google.common.annotations.VisibleForTesting; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; + +/** + * This {@link DestStateLifecycleManager} handles any state where there is a guarantee that any + * single state message represents the state for the ENTIRE connection. At the time of writing, + * GLOBAL and LEGACY state types are the state type that match this pattern. + * + *

+ * Does NOT store duplicates. Because each state message represents the entire state for the + * connection, it only stores (and emits) the LAST state it received at each phase. + *
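+ * For illustration only (stateA and stateB are hypothetical GLOBAL or LEGACY state messages):
+ *
+ *   manager.addState(stateA);
+ *   manager.addState(stateB);           // stateA is dropped; only the last pending state is kept
+ *   manager.markPendingAsCommitted();
+ *   manager.listCommitted();            // contains only stateB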

+ */ +public class DestSingleStateLifecycleManager implements DestStateLifecycleManager { + + private AirbyteMessage lastPendingState; + private AirbyteMessage lastFlushedState; + private AirbyteMessage lastCommittedState; + + @Override + public void addState(final AirbyteMessage message) { + lastPendingState = message; + } + + @VisibleForTesting + Queue listPending() { + return stateMessageToQueue(lastPendingState); + } + + @Override + public void markPendingAsFlushed() { + if (lastPendingState != null) { + lastFlushedState = lastPendingState; + lastPendingState = null; + } + } + + @Override + public Queue listFlushed() { + return stateMessageToQueue(lastFlushedState); + } + + @Override + public void markFlushedAsCommitted() { + if (lastFlushedState != null) { + lastCommittedState = lastFlushedState; + lastFlushedState = null; + } + } + + @Override + public void clearCommitted() { + lastCommittedState = null; + } + + @Override + public void markPendingAsCommitted() { + if (lastPendingState != null) { + lastCommittedState = lastPendingState; + lastPendingState = null; + } + } + + @Override + public Queue listCommitted() { + return stateMessageToQueue(lastCommittedState); + } + + private static Queue stateMessageToQueue(final AirbyteMessage stateMessage) { + return new LinkedList<>(stateMessage == null ? Collections.emptyList() : List.of(stateMessage)); + } + + @Override + public boolean supportsPerStreamFlush() { + return false; + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestStateLifecycleManager.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestStateLifecycleManager.java new file mode 100644 index 0000000000000..ebe9b4516408f --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestStateLifecycleManager.java @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.dest_state_lifecycle_manager; + +import io.airbyte.protocol.models.v0.AirbyteMessage; +import java.util.Queue; + +/** + * This class manages the lifecycle of state message. It tracks state messages that are in 3 states: + *
    + *
+ * 1. pending - associated records have been accepted by the connector but have NOT been pushed to
+ * the destination
+ * 2. flushed - associated records have been flushed to tmp storage in the destination but have NOT
+ * been committed
+ * 3. committed - associated records have been committed
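+ *
+ * The phases above map onto the interface methods roughly as follows (an illustrative sketch,
+ * where manager is any implementation of this interface):
+ *
+ *   manager.addState(stateMessage);     // state starts out pending
+ *   manager.markPendingAsFlushed();     // pending -> flushed (deprecated two-phase path)
+ *   manager.markFlushedAsCommitted();   // flushed -> committed
+ *   // or, with checkpointing, pending -> committed in a single step:
+ *   manager.markPendingAsCommitted();
+ *   manager.listCommitted();            // committed states, ready to be emitted to the platform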
+ * + */ +public interface DestStateLifecycleManager { + + /** + * Accepts a state into the manager. The state starts in a pending state. + * + * @param message - airbyte message of type state + */ + void addState(AirbyteMessage message); + + /** + * Moves any tracked state messages that are currently pending to flushed. + * + * @Deprecated since destination checkpointing will be bundling flush & commit into the same + * operation + */ + void markPendingAsFlushed(); + + /** + * List all tracked state messages that are flushed. + * + * @return list of state messages + */ + Queue listFlushed(); + + /** + * Moves any tracked state messages that are currently flushed to committed. + * + * @Deprecated since destination checkpointing will be bundling flush and commit into the same + * operation + */ + void markFlushedAsCommitted(); + + /** + * Clears any committed state messages, this is called after returning the state message to the + * platform. The rationale behind this logic is to avoid returning duplicated state messages that + * would otherwise be held in the `committed` state + */ + void clearCommitted(); + + /** + * Moves any tracked state messages that are currently pending to committed. + * + * Note: that this is skipping "flushed" state since flushed meant that this was using a staging + * area to hold onto files, for the changes with checkpointing this step is skipped. It follows + * under the guiding principle that destination needs to commit + * {@link io.airbyte.protocol.models.AirbyteRecordMessage} more frequently to checkpoint. The new + * transaction logic will be: + * + * Buffer -(flush)-> Staging (Blob Storage) -(commit to airbyte_raw)-> Destination table + */ + void markPendingAsCommitted(); + + /** + * List all tracked state messages that are committed. + * + * @return list of state messages + */ + Queue listCommitted(); + + boolean supportsPerStreamFlush(); + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestStreamStateLifecycleManager.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestStreamStateLifecycleManager.java new file mode 100644 index 0000000000000..3d69907af14e4 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestStreamStateLifecycleManager.java @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.dest_state_lifecycle_manager; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteStateMessage.AirbyteStateType; +import io.airbyte.protocol.models.v0.StreamDescriptor; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Queue; +import java.util.stream.Collectors; + +/** + * This {@link DestStateLifecycleManager} handles any state where the state messages are scoped by + * stream. In these cases, at each state of the process, it tracks the LAST state message for EACH + * stream (no duplicates!). + * + *

+ * Guaranteed to output state messages in order relative to other messages of the SAME stream. Does
+ * NOT guarantee that state messages of different streams will be output in the order in which they
+ * were received. State messages across streams will be emitted in alphabetical order (primary sort
+ * on namespace, secondary sort on name).
+ *
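+ * For illustration only (stateFor is a hypothetical helper that builds a STREAM state message for
+ * the given namespace and name):
+ *
+ *   manager.addState(stateFor("public", "users"));
+ *   manager.addState(stateFor("analytics", "events"));
+ *   manager.markPendingAsCommitted();
+ *   manager.listCommitted();  // analytics.events is listed before public.users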

+ */ +public class DestStreamStateLifecycleManager implements DestStateLifecycleManager { + + private final Map streamToLastPendingState; + private final Map streamToLastFlushedState; + private final Map streamToLastCommittedState; + + public DestStreamStateLifecycleManager() { + streamToLastPendingState = new HashMap<>(); + streamToLastFlushedState = new HashMap<>(); + streamToLastCommittedState = new HashMap<>(); + } + + @Override + public void addState(final AirbyteMessage message) { + Preconditions.checkArgument(message.getState().getType() == AirbyteStateType.STREAM); + streamToLastPendingState.put(message.getState().getStream().getStreamDescriptor(), message); + } + + @VisibleForTesting + Queue listPending() { + return listStatesInOrder(streamToLastPendingState); + } + + /* + * Similar to #markFlushedAsCommmitted, this method should no longer be used to align with the + * changes to destination checkpointing where flush/commit operations will be bundled + */ + @Deprecated + @Override + public void markPendingAsFlushed() { + moveToNextPhase(streamToLastPendingState, streamToLastFlushedState); + } + + @Override + public Queue listFlushed() { + return listStatesInOrder(streamToLastFlushedState); + } + + /* + * During the process of migration to destination checkpointing, this method should no longer be in + * use in favor of #markPendingAsCommitted where states will be flushed/committed as a singular + * transaction + */ + @Deprecated + @Override + public void markFlushedAsCommitted() { + moveToNextPhase(streamToLastFlushedState, streamToLastCommittedState); + } + + @Override + public void clearCommitted() { + streamToLastCommittedState.clear(); + } + + @Override + public void markPendingAsCommitted() { + moveToNextPhase(streamToLastPendingState, streamToLastCommittedState); + } + + @Override + public Queue listCommitted() { + return listStatesInOrder(streamToLastCommittedState); + } + + @Override + public boolean supportsPerStreamFlush() { + return true; + } + + /** + * Lists out the states in the stream to state maps. Guarantees a deterministic sort order, which is + * handy because we are going from a map (unsorted) to a queue. The sort order primary sort on + * namespace (with null at the top) followed by secondary sort on name. This maps onto the pretty + * common order that we list streams elsewhere. + * + * @param streamToState - map of stream descriptor to its last state + * @return queue with the states ordered per the sort mentioned above + */ + private static Queue listStatesInOrder(final Map streamToState) { + return streamToState + .entrySet() + .stream() + // typically, we support by namespace and then stream name, so we retain that pattern here. + .sorted(Comparator + ., String>comparing( + entry -> entry.getKey().getNamespace(), + Comparator.nullsFirst(Comparator.naturalOrder())) // namespace is allowed to be null + .thenComparing(entry -> entry.getKey().getName())) + .map(Entry::getValue) + .collect(Collectors.toCollection(LinkedList::new)); + } + + /** + * Moves all state messages from previous phase into next phase. + * + * @param prevPhase - map of stream to state messages for previous phase that will be moved to next + * phase. when this method returns this map will be empty. + * @param nextPhase - map into which state messages from prevPhase will be added. 
+ */ + private static void moveToNextPhase(final Map prevPhase, final Map nextPhase) { + if (!prevPhase.isEmpty()) { + nextPhase.putAll(prevPhase); + prevPhase.clear(); + } + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/normalization/NormalizationLogParser.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/normalization/NormalizationLogParser.java new file mode 100644 index 0000000000000..73a1411b72ec9 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/normalization/NormalizationLogParser.java @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.normalization; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.annotations.VisibleForTesting; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.destination.normalization.SentryExceptionHelper.ErrorMapKeys; +import io.airbyte.protocol.models.AirbyteErrorTraceMessage; +import io.airbyte.protocol.models.AirbyteErrorTraceMessage.FailureType; +import io.airbyte.protocol.models.AirbyteLogMessage; +import io.airbyte.protocol.models.AirbyteLogMessage.Level; +import io.airbyte.protocol.models.AirbyteMessage; +import io.airbyte.protocol.models.AirbyteMessage.Type; +import io.airbyte.protocol.models.AirbyteTraceMessage; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Stream; +import org.apache.logging.log4j.util.Strings; + +/** + * A simple wrapper for base-normalization logs. Reads messages off of stdin and sticks them into + * appropriate AirbyteMessages (log or trace), then dumps those messages to stdout + *

+ * Does mostly the same thing as
+ * {@link io.airbyte.workers.normalization.NormalizationAirbyteStreamFactory}. That class is not
+ * actively developed and will be deleted once all destinations run normalization in-connector.
+ *

+ * Aggregates all error logs and emits them as a single trace message at the end. If the underlying + * process emits any trace messages, they are passed through immediately. + */ +public class NormalizationLogParser { + + private final List dbtErrors = new ArrayList<>(); + + public Stream create(final BufferedReader bufferedReader) { + return bufferedReader.lines().flatMap(this::toMessages); + } + + public List getDbtErrors() { + return dbtErrors; + } + + @VisibleForTesting + Stream toMessages(final String line) { + if (Strings.isEmpty(line)) { + return Stream.of(logMessage(Level.INFO, "")); + } + final Optional json = Jsons.tryDeserialize(line); + if (json.isPresent()) { + return jsonToMessage(json.get()); + } else { + return nonJsonLineToMessage(line); + } + } + + /** + * Wrap the line in an AirbyteLogMessage, and do very naive dbt error log detection. + *

+ * This is needed for dbt < 1.0.0, which don't support json-format logs. + */ + private Stream nonJsonLineToMessage(final String line) { + // Super hacky thing to try and detect error lines + if (line.contains("[error]")) { + dbtErrors.add(line); + } + return Stream.of(logMessage(Level.INFO, line)); + } + + /** + * There are two cases here: Either the json is already an AirbyteMessage (and we should just emit + * it without change), or it's dbt json log, and we need to do some extra work to convert it to a + * log message + aggregate error logs. + */ + private Stream jsonToMessage(final JsonNode jsonLine) { + final Optional message = Jsons.tryObject(jsonLine, AirbyteMessage.class); + if (message.isPresent()) { + // This line is already an AirbyteMessage; we can just return it directly + // (these messages come from the transform_config / transform_catalog scripts) + return message.stream(); + } else { + /* + * This line is a JSON-format dbt log. We need to extract the message and wrap it in a logmessage + * And if it's an error, we also need to collect it into dbtErrors. Example log message, formatted + * for readability: { "code": "A001", "data": { "v": "=1.0.9" }, "invocation_id": + * "3f9a0b9f-9623-4c25-8708-1f6ae851e738", "level": "info", "log_version": 1, "msg": + * "Running with dbt=1.0.9", "node_info": {}, "pid": 65, "thread_name": "MainThread", "ts": + * "2023-04-12T21:03:23.079315Z", "type": "log_line" } + */ + final String logLevel = (jsonLine.hasNonNull("level")) ? jsonLine.get("level").asText() : ""; + String logMsg = jsonLine.hasNonNull("msg") ? jsonLine.get("msg").asText() : ""; + Level level; + switch (logLevel) { + case "debug" -> level = Level.DEBUG; + case "info" -> level = Level.INFO; + case "warn" -> level = Level.WARN; + case "error" -> { + // This is also not _amazing_, but we make the assumption that all error logs should be emitted in + // the trace message + // In practice, this seems to be a valid assumption. + level = Level.ERROR; + dbtErrors.add(logMsg); + } + default -> { + level = Level.INFO; + logMsg = jsonLine.toPrettyString(); + } + } + return Stream.of(logMessage(level, logMsg)); + } + } + + private static AirbyteMessage logMessage(Level level, String message) { + return new AirbyteMessage() + .withType(Type.LOG) + .withLog(new AirbyteLogMessage() + .withLevel(level) + .withMessage(message)); + } + + public static void main(String[] args) { + final NormalizationLogParser normalizationLogParser = new NormalizationLogParser(); + final Stream airbyteMessageStream = + normalizationLogParser.create(new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8))); + airbyteMessageStream.forEachOrdered(message -> System.out.println(Jsons.serialize(message))); + + final List errors = normalizationLogParser.getDbtErrors(); + final String dbtErrorStack = String.join("\n", errors); + if (!"".equals(dbtErrorStack)) { + final Map errorMap = SentryExceptionHelper.getUsefulErrorMessageAndTypeFromDbtError(dbtErrorStack); + String internalMessage = errorMap.get(ErrorMapKeys.ERROR_MAP_MESSAGE_KEY); + AirbyteMessage traceMessage = new AirbyteMessage() + .withType(Type.TRACE) + .withTrace(new AirbyteTraceMessage() + .withType(AirbyteTraceMessage.Type.ERROR) + .withEmittedAt((double) System.currentTimeMillis()) + .withError(new AirbyteErrorTraceMessage() + .withFailureType(FailureType.SYSTEM_ERROR) + .withMessage("Normalization failed during the dbt run. 
This may indicate a problem with the data itself.") + .withStackTrace("AirbyteDbtError: \n" + dbtErrorStack) + .withInternalMessage(internalMessage))); + System.out.println(Jsons.serialize(traceMessage)); + } + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/normalization/SentryExceptionHelper.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/normalization/SentryExceptionHelper.java new file mode 100644 index 0000000000000..3f604e568e1cd --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/normalization/SentryExceptionHelper.java @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.normalization; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This is copied out of platform + * (https://github.com/airbytehq/airbyte-platform/blob/main/airbyte-persistence/job-persistence/src/main/java/io/airbyte/persistence/job/errorreporter/SentryExceptionHelper.java#L257) + */ +public class SentryExceptionHelper { + + private static final Logger LOGGER = LoggerFactory.getLogger(SentryExceptionHelper.class); + + /** + * Keys to known error types. + */ + public enum ErrorMapKeys { + ERROR_MAP_MESSAGE_KEY, + ERROR_MAP_TYPE_KEY + } + + public static Map getUsefulErrorMessageAndTypeFromDbtError(final String stacktrace) { + // the dbt 'stacktrace' is really just all the log messages at 'error' level, stuck together. + // therefore there is not a totally consistent structure to these, + // see the docs: https://docs.getdbt.com/guides/legacy/debugging-errors + // the logic below is built based on the ~450 unique dbt errors we encountered before this PR + // and is a best effort to isolate the useful part of the error logs for debugging and grouping + // and bring some semblance of exception 'types' to differentiate between errors. + final Map errorMessageAndType = new HashMap<>(); + final String[] stacktraceLines = stacktrace.split("\n"); + + boolean defaultNextLine = false; + // TODO: this whole code block is quite ugh, commented to try and make each part clear but could be + // much more readable. + mainLoop: for (int i = 0; i < stacktraceLines.length; i++) { + // This order is important due to how these errors can co-occur. + // This order attempts to keep error definitions consistent based on our observations of possible + // dbt error structures. 
+ try { + // Database Errors + if (stacktraceLines[i].contains("Database Error in model")) { + // Database Error : SQL compilation error + if (stacktraceLines[i + 1].contains("SQL compilation error")) { + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_MESSAGE_KEY, + String.format("%s %s", stacktraceLines[i + 1].trim(), stacktraceLines[i + 2].trim())); + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "DbtDatabaseSQLCompilationError"); + break; + // Database Error: Invalid input + } else if (stacktraceLines[i + 1].contains("Invalid input")) { + for (final String followingLine : Arrays.copyOfRange(stacktraceLines, i + 1, stacktraceLines.length)) { + if (followingLine.trim().startsWith("context:")) { + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_MESSAGE_KEY, + String.format("%s\n%s", stacktraceLines[i + 1].trim(), followingLine.trim())); + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "DbtDatabaseInvalidInputError"); + break mainLoop; + } + } + // Database Error: Syntax error + } else if (stacktraceLines[i + 1].contains("syntax error at or near \"")) { + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_MESSAGE_KEY, + String.format("%s\n%s", stacktraceLines[i + 1].trim(), stacktraceLines[i + 2].trim())); + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "DbtDatabaseSyntaxError"); + break; + // Database Error: default + } else { + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "DbtDatabaseError"); + defaultNextLine = true; + } + // Unhandled Error + } else if (stacktraceLines[i].contains("Unhandled error while executing model")) { + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "DbtUnhandledError"); + defaultNextLine = true; + // Compilation Errors + } else if (stacktraceLines[i].contains("Compilation Error")) { + // Compilation Error: Ambiguous Relation + if (stacktraceLines[i + 1].contains("When searching for a relation, dbt found an approximate match.")) { + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_MESSAGE_KEY, + String.format("%s %s", stacktraceLines[i + 1].trim(), stacktraceLines[i + 2].trim())); + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "DbtCompilationAmbiguousRelationError"); + break; + // Compilation Error: default + } else { + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "DbtCompilationError"); + defaultNextLine = true; + } + // Runtime Errors + } else if (stacktraceLines[i].contains("Runtime Error")) { + // Runtime Error: Database error + for (final String followingLine : Arrays.copyOfRange(stacktraceLines, i + 1, stacktraceLines.length)) { + if ("Database Error".equals(followingLine.trim())) { + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_MESSAGE_KEY, + String.format("%s", stacktraceLines[Arrays.stream(stacktraceLines).toList().indexOf(followingLine) + 1].trim())); + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "DbtRuntimeDatabaseError"); + break mainLoop; + } + } + // Runtime Error: default + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "DbtRuntimeError"); + defaultNextLine = true; + // Database Error: formatted differently, catch last to avoid counting other types of errors as + // Database Error + } else if ("Database Error".equals(stacktraceLines[i].trim())) { + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "DbtDatabaseError"); + defaultNextLine = true; + } + // handle the default case without repeating code + if (defaultNextLine) { + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_MESSAGE_KEY, stacktraceLines[i + 1].trim()); + break; + } + } catch (final 
ArrayIndexOutOfBoundsException e) { + // this means our logic is slightly off, our assumption of where error lines are is incorrect + LOGGER.warn("Failed trying to parse useful error message out of dbt error, defaulting to full stacktrace"); + } + } + if (errorMessageAndType.isEmpty()) { + // For anything we haven't caught, just return full stacktrace + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_MESSAGE_KEY, stacktrace); + errorMessageAndType.put(ErrorMapKeys.ERROR_MAP_TYPE_KEY, "AirbyteDbtError"); + } + return errorMessageAndType; + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BaseSerializedBuffer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BaseSerializedBuffer.java new file mode 100644 index 0000000000000..9d6ce6acc976e --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BaseSerializedBuffer.java @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.record_buffer; + +import com.google.common.io.CountingOutputStream; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; +import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Base implementation of a {@link SerializableBuffer}. It is composed of a {@link BufferStorage} + * where the actual data is being stored in a serialized format. + * + * Such data format is defined by concrete implementation inheriting from this base abstract class. + * To do so, necessary methods on handling "writer" methods should be defined. This writer would + * take care of converting {@link AirbyteRecordMessage} into the serialized form of the data such as + * it can be stored in the outputStream of the {@link BufferStorage}. + */ +public abstract class BaseSerializedBuffer implements SerializableBuffer { + + private static final Logger LOGGER = LoggerFactory.getLogger(BaseSerializedBuffer.class); + private static final String GZ_SUFFIX = ".gz"; + + private final BufferStorage bufferStorage; + private final CountingOutputStream byteCounter; + + private boolean useCompression; + private GzipCompressorOutputStream compressedBuffer; + private InputStream inputStream; + private boolean isStarted; + private boolean isClosed; + + protected BaseSerializedBuffer(final BufferStorage bufferStorage) throws Exception { + this.bufferStorage = bufferStorage; + byteCounter = new CountingOutputStream(bufferStorage.getOutputStream()); + useCompression = true; + compressedBuffer = null; + inputStream = null; + isStarted = false; + isClosed = false; + } + + /** + * Initializes the writer objects such that it can now write to the downstream @param outputStream + */ + protected abstract void initWriter(OutputStream outputStream) throws Exception; + + /** + * Transform the @param record into a serialized form of the data and writes it to the registered + * OutputStream provided when {@link BaseSerializedBuffer#initWriter} was called. 
+ */ + protected abstract void writeRecord(AirbyteRecordMessage record) throws IOException; + + /** + * Stops the writer from receiving new data and prepares it for being finalized and converted into + * an InputStream to read from instead. This is used when flushing the buffer into some other + * destination. + */ + protected abstract void flushWriter() throws IOException; + + protected abstract void closeWriter() throws IOException; + + public SerializableBuffer withCompression(final boolean useCompression) { + if (!isStarted) { + this.useCompression = useCompression; + return this; + } + throw new RuntimeException("Options should be configured before starting to write"); + } + + @Override + public long accept(final AirbyteRecordMessage record) throws Exception { + if (!isStarted) { + if (useCompression) { + compressedBuffer = new GzipCompressorOutputStream(byteCounter); + initWriter(compressedBuffer); + } else { + initWriter(byteCounter); + } + isStarted = true; + } + if (inputStream == null && !isClosed) { + final long startCount = byteCounter.getCount(); + writeRecord(record); + return byteCounter.getCount() - startCount; + } else { + throw new IllegalCallerException("Buffer is already closed, it cannot accept more messages"); + } + } + + @Override + public String getFilename() throws IOException { + if (useCompression && !bufferStorage.getFilename().endsWith(GZ_SUFFIX)) { + return bufferStorage.getFilename() + GZ_SUFFIX; + } + return bufferStorage.getFilename(); + } + + @Override + public File getFile() throws IOException { + if (useCompression && !bufferStorage.getFilename().endsWith(GZ_SUFFIX)) { + if (bufferStorage.getFile().renameTo(new File(bufferStorage.getFilename() + GZ_SUFFIX))) { + LOGGER.info("Renaming compressed file to include .gz file extension"); + } + } + return bufferStorage.getFile(); + } + + protected InputStream convertToInputStream() throws IOException { + return bufferStorage.convertToInputStream(); + } + + @Override + public InputStream getInputStream() { + return inputStream; + } + + @Override + public void flush() throws IOException { + if (inputStream == null && !isClosed) { + flushWriter(); + if (compressedBuffer != null) { + LOGGER.info("Wrapping up compression and write GZIP trailer data."); + compressedBuffer.flush(); + compressedBuffer.close(); + } + closeWriter(); + bufferStorage.close(); + inputStream = convertToInputStream(); + LOGGER.info("Finished writing data to {} ({})", getFilename(), FileUtils.byteCountToDisplaySize(byteCounter.getCount())); + } + } + + @Override + public long getByteCount() { + return byteCounter.getCount(); + } + + @Override + public void close() throws Exception { + if (!isClosed) { + // inputStream can be null if the accept method encounters + // an error before inputStream is initialized + if (inputStream != null) { + inputStream.close(); + } + bufferStorage.deleteFile(); + isClosed = true; + } + } + + @Override + public long getMaxTotalBufferSizeInBytes() { + return bufferStorage.getMaxTotalBufferSizeInBytes(); + } + + @Override + public long getMaxPerStreamBufferSizeInBytes() { + return bufferStorage.getMaxPerStreamBufferSizeInBytes(); + } + + @Override + public int getMaxConcurrentStreamsInBuffer() { + return bufferStorage.getMaxConcurrentStreamsInBuffer(); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferCreateFunction.java 
b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferCreateFunction.java new file mode 100644 index 0000000000000..bda03460ff0b2 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferCreateFunction.java @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.record_buffer; + +import io.airbyte.commons.functional.CheckedBiFunction; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; + +public interface BufferCreateFunction extends + CheckedBiFunction { + + @Override + SerializableBuffer apply(AirbyteStreamNameNamespacePair stream, ConfiguredAirbyteCatalog configuredCatalog) + throws Exception; + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferFlushType.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferFlushType.java new file mode 100644 index 0000000000000..3d2a85b77f968 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferFlushType.java @@ -0,0 +1,10 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.record_buffer; + +public enum BufferFlushType { + FLUSH_ALL, + FLUSH_SINGLE_STREAM +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferStorage.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferStorage.java new file mode 100644 index 0000000000000..c77329cf41f43 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferStorage.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.record_buffer; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +/** + * This interface abstract the actual object that is used to store incoming data being buffered. It + * could be a file, in-memory or some other objects. + * + * However, in order to be used as part of the {@link SerializableBuffer}, this + * {@link BufferStorage} should implement some methods used to determine how to write into and read + * from the storage once we are done buffering + * + * Some easy methods for manipulating the storage viewed as a file or InputStream are therefore + * required. + * + * Depending on the implementation of the storage medium, it would also determine what storage + * limits are possible. + */ +public interface BufferStorage { + + /** + * Builds a new outputStream on which to write the data for storage. + */ + OutputStream getOutputStream() throws IOException; + + String getFilename() throws IOException; + + File getFile() throws IOException; + + /** + * Once buffering has reached some limits, the storage stream should be turned into an InputStream. + * This method should assume we are not going to write to buffer anymore, and it is safe to convert + * to some other format to be read from now. 
+ */ + InputStream convertToInputStream() throws IOException; + + void close() throws IOException; + + /** + * Cleans-up any file that was produced in the process of buffering (if any were produced) + */ + void deleteFile() throws IOException; + + /* + * Depending on the implementation of the storage, methods below defined reasonable thresholds + * associated with using this kind of buffer storage. + * + * These could also be dynamically configured/tuned at runtime if needed (from user input for + * example?) + */ + + /** + * @return How much storage should be used overall by all buffers + */ + long getMaxTotalBufferSizeInBytes(); + + /** + * @return How much storage should be used for a particular stream at a time before flushing it + */ + long getMaxPerStreamBufferSizeInBytes(); + + /** + * @return How many concurrent buffers can be handled at once in parallel + */ + int getMaxConcurrentStreamsInBuffer(); + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferingStrategy.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferingStrategy.java new file mode 100644 index 0000000000000..0763e6f5add1a --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/BufferingStrategy.java @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.record_buffer; + +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import java.util.Optional; + +/** + * High-level interface used by + * {@link io.airbyte.integrations.destination.buffered_stream_consumer.BufferedStreamConsumer} + * + * A Record buffering strategy relies on the capacity available of underlying + * {@link SerializableBuffer} to determine what to do when consuming a new {@link AirbyteMessage} + * into the buffer. It also defines when to flush such buffers and how to empty them once they fill + * up. + * + */ +public interface BufferingStrategy extends AutoCloseable { + + /** + * Add a new message to the buffer while consuming streams, also handles when a buffer flush when + * buffer has been filled + * + * @param stream stream associated with record + * @param message {@link AirbyteMessage} to be added to the buffer + * @return an optional value if a flushed occur with the respective flush type, otherwise an empty + * value means only a record was added + * @throws Exception throw on failure + */ + Optional addRecord(AirbyteStreamNameNamespacePair stream, AirbyteMessage message) throws Exception; + + /** + * Flush buffered messages in a buffer from a particular stream + */ + void flushSingleBuffer(AirbyteStreamNameNamespacePair stream, SerializableBuffer buffer) throws Exception; + + /** + * Flush all buffers that were buffering message data so far. + */ + void flushAllBuffers() throws Exception; + + /** + * Removes all stream buffers. 
+ */ + void clear() throws Exception; + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/FileBuffer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/FileBuffer.java new file mode 100644 index 0000000000000..029877629beff --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/FileBuffer.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.record_buffer; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.util.UUID; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class FileBuffer implements BufferStorage { + + private static final Logger LOGGER = LoggerFactory.getLogger(FileBuffer.class); + + // The per stream size limit is following recommendations from: + // https://docs.snowflake.com/en/user-guide/data-load-considerations-prepare.html#general-file-sizing-recommendations + // "To optimize the number of parallel operations for a load, + // we recommend aiming to produce data files roughly 100-250 MB (or larger) in size compressed." + public static final long MAX_PER_STREAM_BUFFER_SIZE_BYTES = 200 * 1024 * 1024; // 200 MB + /* + * Other than the per-file size limit, we also limit the total size (which would limit how many + * concurrent streams we can buffer simultaneously too) Since this class is storing data on disk, + * the buffer size limits below are tied to the necessary disk storage space. + */ + public static final long MAX_TOTAL_BUFFER_SIZE_BYTES = 1024 * 1024 * 1024; // 1 GB + /* + * We limit number of stream being buffered simultaneously anyway (limit how many files are + * stored/open for writing) + * + * Note: This value can be tuned to increase performance with the tradeoff of increased memory usage + * (~31 MB per buffer). See {@link StreamTransferManager} + * + * For connections with interleaved data (e.g. Change Data Capture), having less buffers than the + * number of streams being synced will cause buffer thrashing where buffers will need to be flushed + * before another stream's buffer can be created. 
Increasing the default max will reduce likelihood + * of thrashing but not entirely eliminate unless number of buffers equals streams to be synced + */ + public static final int DEFAULT_MAX_CONCURRENT_STREAM_IN_BUFFER = 10; + public static final String FILE_BUFFER_COUNT_KEY = "file_buffer_count"; + // This max is subject to change as no proper load testing has been done to verify the side effects + public static final int MAX_CONCURRENT_STREAM_IN_BUFFER = 50; + /* + * Use this soft cap as a guidance for customers to not exceed the recommended number of buffers + * which is 1 GB (total buffer size) / 31 MB (rough size of each buffer) ~= 32 buffers + */ + public static final int SOFT_CAP_CONCURRENT_STREAM_IN_BUFFER = 20; + + private final String fileExtension; + private File tempFile; + private OutputStream outputStream; + private final int maxConcurrentStreams; + + public FileBuffer(final String fileExtension) { + this.fileExtension = fileExtension; + this.maxConcurrentStreams = DEFAULT_MAX_CONCURRENT_STREAM_IN_BUFFER; + tempFile = null; + outputStream = null; + } + + public FileBuffer(final String fileExtension, final int maxConcurrentStreams) { + this.fileExtension = fileExtension; + this.maxConcurrentStreams = maxConcurrentStreams; + tempFile = null; + outputStream = null; + } + + @Override + public OutputStream getOutputStream() throws IOException { + if (outputStream == null || tempFile == null) { + tempFile = Files.createTempFile(UUID.randomUUID().toString(), fileExtension).toFile(); + outputStream = new BufferedOutputStream(new FileOutputStream(tempFile)); + } + return outputStream; + } + + @Override + public String getFilename() throws IOException { + return getFile().getName(); + } + + @Override + public File getFile() throws IOException { + if (tempFile == null) { + getOutputStream(); + } + return tempFile; + } + + @Override + public InputStream convertToInputStream() throws IOException { + return new FileInputStream(getFile()); + } + + @Override + public void close() throws IOException { + outputStream.close(); + } + + @Override + public void deleteFile() throws IOException { + LOGGER.info("Deleting tempFile data {}", getFilename()); + Files.deleteIfExists(getFile().toPath()); + } + + @Override + public long getMaxTotalBufferSizeInBytes() { + return MAX_TOTAL_BUFFER_SIZE_BYTES; + } + + @Override + public long getMaxPerStreamBufferSizeInBytes() { + return MAX_PER_STREAM_BUFFER_SIZE_BYTES; + } + + @Override + public int getMaxConcurrentStreamsInBuffer() { + return maxConcurrentStreams; + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/FlushBufferFunction.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/FlushBufferFunction.java new file mode 100644 index 0000000000000..be43b75c55916 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/FlushBufferFunction.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
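Given the FileBuffer constants above, destinations that expose the optional file_buffer_count option typically clamp a user-supplied value against the hard maximum before constructing the buffer. The helper below is only a sketch of that pattern; the exact clamping policy is an assumption, not something this change prescribes.

static int getFileBufferCount(final JsonNode config) {
  // Fall back to the default, honour a user-provided file_buffer_count, and never exceed the hard max.
  int count = FileBuffer.DEFAULT_MAX_CONCURRENT_STREAM_IN_BUFFER;
  if (config.has(FileBuffer.FILE_BUFFER_COUNT_KEY)) {
    count = config.get(FileBuffer.FILE_BUFFER_COUNT_KEY).asInt(count);
  }
  return Math.min(count, FileBuffer.MAX_CONCURRENT_STREAM_IN_BUFFER);
}
// e.g. new FileBuffer(".csv.gz", getFileBufferCount(config));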
+ */ + +package io.airbyte.integrations.destination.record_buffer; + +import io.airbyte.commons.functional.CheckedBiConsumer; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; + +public interface FlushBufferFunction extends CheckedBiConsumer { + + @Override + void accept(AirbyteStreamNameNamespacePair stream, SerializableBuffer buffer) throws Exception; + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/InMemoryBuffer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/InMemoryBuffer.java new file mode 100644 index 0000000000000..d94a73dfd07e5 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/InMemoryBuffer.java @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.record_buffer; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.util.UUID; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Instead of storing buffered data on disk like the {@link FileBuffer}, this {@link BufferStorage} + * accumulates message data in-memory instead. Thus, a bigger heap size would be required. + */ +public class InMemoryBuffer implements BufferStorage { + + private static final Logger LOGGER = LoggerFactory.getLogger(InMemoryBuffer.class); + + // The per stream size limit is following recommendations from: + // https://docs.snowflake.com/en/user-guide/data-load-considerations-prepare.html#general-file-sizing-recommendations + // "To optimize the number of parallel operations for a load, + // we recommend aiming to produce data files roughly 100-250 MB (or larger) in size compressed." + public static final long MAX_PER_STREAM_BUFFER_SIZE_BYTES = 200 * 1024 * 1024; // 200 MB + // Other than the per-file size limit, we also limit the total size (which would limit how many + // concurrent streams we can buffer simultaneously too) + // Since this class is storing data in memory, the buffer size limits below are tied to the + // necessary RAM space. 
+ public static final long MAX_TOTAL_BUFFER_SIZE_BYTES = 1024 * 1024 * 1024; // 1 GB + // we limit number of stream being buffered simultaneously anyway + public static final int MAX_CONCURRENT_STREAM_IN_BUFFER = 100; + + private final String fileExtension; + private final ByteArrayOutputStream byteBuffer = new ByteArrayOutputStream(); + private File tempFile; + private String filename; + + public InMemoryBuffer(final String fileExtension) { + this.fileExtension = fileExtension; + tempFile = null; + filename = null; + } + + @Override + public OutputStream getOutputStream() { + return byteBuffer; + } + + @Override + public String getFilename() { + if (filename == null) { + filename = UUID.randomUUID().toString(); + } + return filename; + } + + @Override + public File getFile() throws IOException { + if (tempFile == null) { + tempFile = Files.createTempFile(getFilename(), fileExtension).toFile(); + } + return tempFile; + } + + @Override + public InputStream convertToInputStream() { + return new ByteArrayInputStream(byteBuffer.toByteArray()); + } + + @Override + public void close() throws IOException { + byteBuffer.close(); + } + + @Override + public void deleteFile() throws IOException { + if (tempFile != null) { + LOGGER.info("Deleting tempFile data {}", getFilename()); + Files.deleteIfExists(tempFile.toPath()); + } + } + + @Override + public long getMaxTotalBufferSizeInBytes() { + return MAX_TOTAL_BUFFER_SIZE_BYTES; + } + + @Override + public long getMaxPerStreamBufferSizeInBytes() { + return MAX_PER_STREAM_BUFFER_SIZE_BYTES; + } + + @Override + public int getMaxConcurrentStreamsInBuffer() { + return MAX_CONCURRENT_STREAM_IN_BUFFER; + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/InMemoryRecordBufferingStrategy.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/InMemoryRecordBufferingStrategy.java new file mode 100644 index 0000000000000..d16ef8dca1e1b --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/InMemoryRecordBufferingStrategy.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.record_buffer; + +import io.airbyte.integrations.destination.buffered_stream_consumer.CheckAndRemoveRecordWriter; +import io.airbyte.integrations.destination.buffered_stream_consumer.RecordSizeEstimator; +import io.airbyte.integrations.destination.buffered_stream_consumer.RecordWriter; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This is the default implementation of a {@link BufferStorage} to be backward compatible. Data is + * being buffered in a {@link List} as they are being consumed. + * + * This should be deprecated as we slowly move towards using {@link SerializedBufferingStrategy} + * instead. 
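A hedged construction sketch for the class below: the RecordWriter lambda stands in for a destination-specific bulk insert (sqlOperations is a hypothetical helper, not part of this change), and the byte threshold controls when addRecord() triggers flushAllBuffers().

// Hypothetical wiring of the legacy in-memory strategy.
final RecordWriter<AirbyteRecordMessage> recordWriter =
    (streamPair, records) -> sqlOperations.insertRecords(records, streamPair); // assumed destination helper
final BufferingStrategy strategy =
    new InMemoryRecordBufferingStrategy(recordWriter, 25 * 1024 * 1024);       // flush roughly every 25 MB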
+ */ +public class InMemoryRecordBufferingStrategy implements BufferingStrategy { + + private static final Logger LOGGER = LoggerFactory.getLogger(InMemoryRecordBufferingStrategy.class); + + private Map> streamBuffer = new HashMap<>(); + private final RecordWriter recordWriter; + private final CheckAndRemoveRecordWriter checkAndRemoveRecordWriter; + private String fileName; + + private final RecordSizeEstimator recordSizeEstimator; + private final long maxQueueSizeInBytes; + private long bufferSizeInBytes; + + public InMemoryRecordBufferingStrategy(final RecordWriter recordWriter, + final long maxQueueSizeInBytes) { + this(recordWriter, null, maxQueueSizeInBytes); + } + + public InMemoryRecordBufferingStrategy(final RecordWriter recordWriter, + final CheckAndRemoveRecordWriter checkAndRemoveRecordWriter, + final long maxQueueSizeInBytes) { + this.recordWriter = recordWriter; + this.checkAndRemoveRecordWriter = checkAndRemoveRecordWriter; + + this.maxQueueSizeInBytes = maxQueueSizeInBytes; + this.bufferSizeInBytes = 0; + this.recordSizeEstimator = new RecordSizeEstimator(); + } + + @Override + public Optional addRecord(final AirbyteStreamNameNamespacePair stream, final AirbyteMessage message) throws Exception { + Optional flushed = Optional.empty(); + + final long messageSizeInBytes = recordSizeEstimator.getEstimatedByteSize(message.getRecord()); + if (bufferSizeInBytes + messageSizeInBytes > maxQueueSizeInBytes) { + flushAllBuffers(); + flushed = Optional.of(BufferFlushType.FLUSH_ALL); + } + + final List bufferedRecords = streamBuffer.computeIfAbsent(stream, k -> new ArrayList<>()); + bufferedRecords.add(message.getRecord()); + bufferSizeInBytes += messageSizeInBytes; + + return flushed; + } + + @Override + public void flushSingleBuffer(final AirbyteStreamNameNamespacePair stream, final SerializableBuffer buffer) throws Exception { + LOGGER.info("Flushing single stream {}: {} records", stream.getName(), streamBuffer.get(stream).size()); + recordWriter.accept(stream, streamBuffer.get(stream)); + LOGGER.info("Flushing completed for {}", stream.getName()); + } + + @Override + public void flushAllBuffers() throws Exception { + for (final Map.Entry> entry : streamBuffer.entrySet()) { + LOGGER.info("Flushing {}: {} records ({})", entry.getKey().getName(), entry.getValue().size(), + FileUtils.byteCountToDisplaySize(bufferSizeInBytes)); + recordWriter.accept(entry.getKey(), entry.getValue()); + if (checkAndRemoveRecordWriter != null) { + fileName = checkAndRemoveRecordWriter.apply(entry.getKey(), fileName); + } + LOGGER.info("Flushing completed for {}", entry.getKey().getName()); + } + close(); + clear(); + bufferSizeInBytes = 0; + } + + @Override + public void clear() { + streamBuffer = new HashMap<>(); + } + + @Override + public void close() throws Exception {} + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/SerializableBuffer.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/SerializableBuffer.java new file mode 100644 index 0000000000000..2762ab055ecb9 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/SerializableBuffer.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.record_buffer; + +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; + +/** + * A {@link SerializableBuffer} is designed to be used as part of a + * {@link SerializedBufferingStrategy}. + * + *

+ * It encapsulates the actual implementation of a buffer: both the storage medium (usually defined + * as part of {@link BufferStorage}) and the format of the serialized data when it is written to the + * buffer. + *

+ * + *

+ * A {@link BaseSerializedBuffer} class is provided, and should be the expected class to derive from + * when implementing a new format of buffer. The storage aspects are normally provided through + * composition of {@link BufferStorage}. + *

+ * + */ +public interface SerializableBuffer extends AutoCloseable { + + /** + * Adds a {@link AirbyteRecordMessage} to the buffer and returns the size of the message in bytes + * + * @param record {@link AirbyteRecordMessage} to be added to buffer + * @return number of bytes written to the buffer + */ + long accept(AirbyteRecordMessage record) throws Exception; + + /** + * Flush a buffer implementation. + */ + void flush() throws Exception; + + /** + * The buffer implementation should be keeping track of how many bytes it accumulated so far. If any + * flush events were triggered, the amount of bytes accumulated would also have been decreased + * accordingly. This method @return such statistics. + */ + long getByteCount(); + + /** + * @return the filename representation of this buffer. + */ + String getFilename() throws IOException; + + /** + * @return a temporary representation as a file of this buffer. + */ + File getFile() throws IOException; + + /** + * @return the InputStream to read data back from this buffer once it is done adding messages to it. + */ + InputStream getInputStream() throws FileNotFoundException; + + /* + * Depending on the implementation of the storage, methods below defined reasonable thresholds + * associated with using this kind of buffer implementation. + * + * These could also be dynamically configured/tuned at runtime if needed (from user input for + * example?) + */ + + /** + * @return How much storage should be used overall by all buffers + */ + long getMaxTotalBufferSizeInBytes(); + + /** + * @return How much storage should be used for a particular stream at a time before flushing it + */ + long getMaxPerStreamBufferSizeInBytes(); + + /** + * @return How many concurrent buffers can be handled at once in parallel + */ + int getMaxConcurrentStreamsInBuffer(); + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/SerializedBufferingStrategy.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/SerializedBufferingStrategy.java new file mode 100644 index 0000000000000..d69451440e031 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/destination/record_buffer/SerializedBufferingStrategy.java @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.record_buffer; + +import io.airbyte.commons.string.Strings; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Optional; +import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Buffering Strategy used to convert {@link io.airbyte.protocol.models.AirbyteRecordMessage} into a + * stream of bytes to more readily save and transmit information + * + *

+ * This class is meant to be used in conjunction with {@link SerializableBuffer}. + *

+ */ +public class SerializedBufferingStrategy implements BufferingStrategy { + + private static final Logger LOGGER = LoggerFactory.getLogger(SerializedBufferingStrategy.class); + + private final BufferCreateFunction onCreateBuffer; + private final FlushBufferFunction onStreamFlush; + + private Map allBuffers = new HashMap<>(); + private long totalBufferSizeInBytes; + private final ConfiguredAirbyteCatalog catalog; + + /** + * Creates instance of Serialized Buffering Strategy used to handle the logic of flushing buffer + * with an associated buffer type + * + * @param onCreateBuffer type of buffer used upon creation + * @param catalog collection of {@link io.airbyte.protocol.models.ConfiguredAirbyteStream} + * @param onStreamFlush buffer flush logic used throughout the streaming of messages + */ + public SerializedBufferingStrategy(final BufferCreateFunction onCreateBuffer, + final ConfiguredAirbyteCatalog catalog, + final FlushBufferFunction onStreamFlush) { + this.onCreateBuffer = onCreateBuffer; + this.catalog = catalog; + this.onStreamFlush = onStreamFlush; + this.totalBufferSizeInBytes = 0; + } + + /** + * Handles both adding records and when buffer is full to also flush + * + * @param stream stream associated with record + * @param message {@link AirbyteMessage} to buffer + * @return Optional which contains a {@link BufferFlushType} if a flush occurred, otherwise empty) + * @throws Exception + */ + @Override + public Optional addRecord(final AirbyteStreamNameNamespacePair stream, final AirbyteMessage message) throws Exception { + Optional flushed = Optional.empty(); + + final SerializableBuffer buffer = getOrCreateBuffer(stream); + if (buffer == null) { + throw new RuntimeException(String.format("Failed to create/get buffer for stream %s.%s", stream.getNamespace(), stream.getName())); + } + + final long actualMessageSizeInBytes = buffer.accept(message.getRecord()); + totalBufferSizeInBytes += actualMessageSizeInBytes; + // Flushes buffer when either the buffer was completely filled or only a single stream was filled + if (totalBufferSizeInBytes >= buffer.getMaxTotalBufferSizeInBytes() + || allBuffers.size() >= buffer.getMaxConcurrentStreamsInBuffer()) { + flushAllBuffers(); + flushed = Optional.of(BufferFlushType.FLUSH_ALL); + } else if (buffer.getByteCount() >= buffer.getMaxPerStreamBufferSizeInBytes()) { + flushSingleBuffer(stream, buffer); + /* + * Note: This branch is needed to indicate to the {@link DefaultDestStateLifeCycleManager} that an + * individual stream was flushed, there is no guarantee that it will flush records in the same order + * that state messages were received. The outcome here is that records get flushed but our updating + * of which state messages have been flushed falls behind. + * + * This is not ideal from a checkpoint point of view, because it means in the case where there is a + * failure, we will not be able to report that those records that were flushed and committed were + * committed because there corresponding state messages weren't marked as flushed. Thus, it weakens + * checkpointing, but it does not cause a correctness issue. + * + * In non-failure cases, using this conditional branch relies on the state messages getting flushed + * by some other means. That can be caused by the previous branch in this conditional. It is + * guaranteed by the fact that we always flush all state messages at the end of a sync. 
+ */ + flushed = Optional.of(BufferFlushType.FLUSH_SINGLE_STREAM); + } + return flushed; + } + + /** + * Creates a new buffer for each stream if buffers do not already exist, else return already + * computed buffer + */ + private SerializableBuffer getOrCreateBuffer(final AirbyteStreamNameNamespacePair stream) { + return allBuffers.computeIfAbsent(stream, k -> { + LOGGER.info("Starting a new buffer for stream {} (current state: {} in {} buffers)", + stream.getName(), + FileUtils.byteCountToDisplaySize(totalBufferSizeInBytes), + allBuffers.size()); + try { + return onCreateBuffer.apply(stream, catalog); + } catch (final Exception e) { + LOGGER.error("Failed to create a new buffer for stream {}", stream.getName(), e); + throw new RuntimeException(e); + } + }); + } + + @Override + public void flushSingleBuffer(final AirbyteStreamNameNamespacePair stream, final SerializableBuffer buffer) throws Exception { + LOGGER.info("Flushing buffer of stream {} ({})", stream.getName(), FileUtils.byteCountToDisplaySize(buffer.getByteCount())); + onStreamFlush.accept(stream, buffer); + totalBufferSizeInBytes -= buffer.getByteCount(); + allBuffers.remove(stream); + LOGGER.info("Flushing completed for {}", stream.getName()); + } + + @Override + public void flushAllBuffers() throws Exception { + LOGGER.info("Flushing all {} current buffers ({} in total)", allBuffers.size(), FileUtils.byteCountToDisplaySize(totalBufferSizeInBytes)); + for (final Entry entry : allBuffers.entrySet()) { + final AirbyteStreamNameNamespacePair stream = entry.getKey(); + final SerializableBuffer buffer = entry.getValue(); + LOGGER.info("Flushing buffer of stream {} ({})", stream.getName(), FileUtils.byteCountToDisplaySize(buffer.getByteCount())); + onStreamFlush.accept(stream, buffer); + LOGGER.info("Flushing completed for {}", stream.getName()); + } + close(); + clear(); + totalBufferSizeInBytes = 0; + } + + @Override + public void clear() throws Exception { + LOGGER.debug("Reset all buffers"); + allBuffers = new HashMap<>(); + } + + @Override + public void close() throws Exception { + final List exceptionsThrown = new ArrayList<>(); + for (final Entry entry : allBuffers.entrySet()) { + try { + final AirbyteStreamNameNamespacePair stream = entry.getKey(); + LOGGER.info("Closing buffer for stream {}", stream.getName()); + final SerializableBuffer buffer = entry.getValue(); + buffer.close(); + } catch (final Exception e) { + exceptionsThrown.add(e); + LOGGER.error("Exception while closing stream buffer", e); + } + } + if (!exceptionsThrown.isEmpty()) { + throw new RuntimeException(String.format("Exceptions thrown while closing buffers: %s", Strings.join(exceptionsThrown, "\n"))); + } + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/ApmTraceUtils.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/ApmTraceUtils.java new file mode 100644 index 0000000000000..555c7d4dd6c86 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/ApmTraceUtils.java @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
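Tying the pieces of SerializedBufferingStrategy together, the sketch below shows one plausible wiring; createCompressedCsvBuffer() and stagingOperations.uploadBuffer() are placeholders for destination-specific code, and catalog, streamPair, and recordMessage are assumed to come from the surrounding consumer. The branch on the returned BufferFlushType mirrors the checkpointing note above: FLUSH_ALL is the point at which all state messages received so far can be treated as committed.

// Hypothetical wiring; buffer factory and staging upload are not APIs defined in this change.
final BufferCreateFunction onCreateBuffer =
    (stream, configuredCatalog) -> createCompressedCsvBuffer(stream, configuredCatalog, new FileBuffer(".csv.gz"));
final FlushBufferFunction onStreamFlush =
    (stream, buffer) -> stagingOperations.uploadBuffer(stream, buffer);
final BufferingStrategy strategy = new SerializedBufferingStrategy(onCreateBuffer, catalog, onStreamFlush);

final Optional<BufferFlushType> flushed = strategy.addRecord(streamPair, recordMessage);
if (flushed.isPresent() && flushed.get() == BufferFlushType.FLUSH_ALL) {
  // every buffer was flushed, so all pending state messages can safely be marked as committed
}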
+ */ + +package io.airbyte.integrations.util; + +import datadog.trace.api.DDTags; +import datadog.trace.api.interceptor.MutableSpan; +import io.opentracing.Span; +import io.opentracing.log.Fields; +import io.opentracing.tag.Tags; +import io.opentracing.util.GlobalTracer; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.Map; + +/** + * Collection of utility methods to help with performance tracing. + */ +public class ApmTraceUtils { + + /** + * String format for the name of tags added to spans. + */ + public static final String TAG_FORMAT = "airbyte.%s.%s"; + + /** + * Standard prefix for tags added to spans. + */ + public static final String TAG_PREFIX = "metadata"; + + /** + * Adds all the provided tags to the currently active span, if one exists.
+ * All tags added via this method will use the default {@link #TAG_PREFIX} namespace. + * + * @param tags A map of tags to be added to the currently active span. + */ + public static void addTagsToTrace(final Map tags) { + addTagsToTrace(tags, TAG_PREFIX); + } + + /** + * Adds all provided tags to the currently active span, if one exists, under the provided tag name + * namespace. + * + * @param tags A map of tags to be added to the currently active span. + * @param tagPrefix The prefix to be added to each custom tag name. + */ + public static void addTagsToTrace(final Map tags, final String tagPrefix) { + addTagsToTrace(GlobalTracer.get().activeSpan(), tags, tagPrefix); + } + + /** + * Adds all the provided tags to the provided span, if one exists. + * + * @param span The {@link Span} that will be associated with the tags. + * @param tags A map of tags to be added to the currently active span. + * @param tagPrefix The prefix to be added to each custom tag name. + */ + public static void addTagsToTrace(final Span span, final Map tags, final String tagPrefix) { + if (span != null) { + tags.entrySet().forEach(entry -> { + span.setTag(formatTag(entry.getKey(), tagPrefix), entry.getValue().toString()); + }); + } + } + + /** + * Adds an exception to the currently active span, if one exists. + * + * @param t The {@link Throwable} to be added to the currently active span. + */ + public static void addExceptionToTrace(final Throwable t) { + addExceptionToTrace(GlobalTracer.get().activeSpan(), t); + } + + /** + * Adds an exception to the provided span, if one exists. + * + * @param span The {@link Span} that will be associated with the exception. + * @param t The {@link Throwable} to be added to the provided span. + */ + public static void addExceptionToTrace(final Span span, final Throwable t) { + if (span != null) { + span.setTag(Tags.ERROR, true); + span.log(Map.of(Fields.ERROR_OBJECT, t)); + } + } + + /** + * Adds all the provided tags to the root span. + * + * @param tags A map of tags to be added to the root span. + */ + public static void addTagsToRootSpan(final Map tags) { + final Span activeSpan = GlobalTracer.get().activeSpan(); + if (activeSpan instanceof MutableSpan) { + final MutableSpan localRootSpan = ((MutableSpan) activeSpan).getLocalRootSpan(); + tags.entrySet().forEach(entry -> { + localRootSpan.setTag(formatTag(entry.getKey(), TAG_PREFIX), entry.getValue().toString()); + }); + } + } + + /** + * Adds an exception to the root span, if an active one exists. + * + * @param t The {@link Throwable} to be added to the provided span. + */ + public static void recordErrorOnRootSpan(final Throwable t) { + final Span activeSpan = GlobalTracer.get().activeSpan(); + if (activeSpan != null) { + activeSpan.setTag(Tags.ERROR, true); + activeSpan.log(Map.of(Fields.ERROR_OBJECT, t)); + } + if (activeSpan instanceof MutableSpan) { + final MutableSpan localRootSpan = ((MutableSpan) activeSpan).getLocalRootSpan(); + localRootSpan.setError(true); + localRootSpan.setTag(DDTags.ERROR_MSG, t.getMessage()); + localRootSpan.setTag(DDTags.ERROR_TYPE, t.getClass().getName()); + final StringWriter errorString = new StringWriter(); + t.printStackTrace(new PrintWriter(errorString)); + localRootSpan.setTag(DDTags.ERROR_STACK, errorString.toString()); + } + } + + /** + * Formats the tag key using {@link #TAG_FORMAT} provided by this utility, using the default tag + * prefix {@link #TAG_PREFIX}. + * + * @param tagKey The tag key to format. + * @return The formatted tag key. 
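A short, hedged usage sketch for these helpers: the tag names and values are illustrative, doWork() is a placeholder for the traced unit of work, and the calls only have an effect when a Datadog tracer is active.

final Map<String, Object> tags = Map.of("destination", "snowflake", "stream_count", 3); // example values
ApmTraceUtils.addTagsToTrace(tags); // recorded as airbyte.metadata.destination / airbyte.metadata.stream_count
try {
  doWork(); // hypothetical traced operation
} catch (final RuntimeException e) {
  ApmTraceUtils.addExceptionToTrace(e);    // marks the active span as errored
  ApmTraceUtils.recordErrorOnRootSpan(e);  // also surfaces the error on the local root span
  throw e;
}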
+ */ + public static String formatTag(final String tagKey) { + return formatTag(tagKey, TAG_PREFIX); + } + + /** + * Formats the tag key using {@link #TAG_FORMAT} provided by this utility with the provided tag + * prefix. + * + * @param tagKey The tag key to format. + * @param tagPrefix The prefix to be added to each custom tag name. + * @return The formatted tag key. + */ + public static String formatTag(final String tagKey, final String tagPrefix) { + return String.format(TAG_FORMAT, tagPrefix, tagKey); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/ConnectorExceptionUtil.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/ConnectorExceptionUtil.java new file mode 100644 index 0000000000000..65d6428fcdc31 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/ConnectorExceptionUtil.java @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.util; + +import com.google.common.collect.ImmutableList; +import io.airbyte.commons.exceptions.ConfigErrorException; +import io.airbyte.commons.exceptions.ConnectionErrorException; +import io.airbyte.integrations.base.errors.messages.ErrorMessage; +import java.sql.SQLException; +import java.sql.SQLSyntaxErrorException; +import java.util.List; +import java.util.Locale; +import java.util.function.Predicate; + +/** + * Utility class defining methods for handling configuration exceptions in connectors. + */ +public class ConnectorExceptionUtil { + + public static final String COMMON_EXCEPTION_MESSAGE_TEMPLATE = "Could not connect with provided configuration. Error: %s"; + static final String RECOVERY_CONNECTION_ERROR_MESSAGE = + "We're having issues syncing from a Postgres replica that is configured as a hot standby server. " + + "Please see https://docs.airbyte.com/integrations/sources/postgres/#sync-data-from-postgres-hot-standby-server for options and workarounds"; + + public static final List HTTP_AUTHENTICATION_ERROR_CODES = ImmutableList.of(401, 403); + private static final List> configErrorPredicates = + List.of(getConfigErrorPredicate(), getConnectionErrorPredicate(), + isRecoveryConnectionExceptionPredicate(), isUnknownColumnInFieldListException()); + + public static boolean isConfigError(final Throwable e) { + return configErrorPredicates.stream().anyMatch(predicate -> predicate.test(e)); + } + + public static String getDisplayMessage(final Throwable e) { + if (e instanceof ConfigErrorException) { + return ((ConfigErrorException) e).getDisplayMessage(); + } else if (e instanceof ConnectionErrorException) { + final ConnectionErrorException connEx = (ConnectionErrorException) e; + return ErrorMessage.getErrorMessage(connEx.getStateCode(), connEx.getErrorCode(), connEx.getExceptionMessage(), connEx); + } else if (isRecoveryConnectionExceptionPredicate().test(e)) { + return RECOVERY_CONNECTION_ERROR_MESSAGE; + } else if (isUnknownColumnInFieldListException().test(e)) { + return e.getMessage(); + } else { + return String.format(COMMON_EXCEPTION_MESSAGE_TEMPLATE, e.getMessage() != null ? e.getMessage() : ""); + } + } + + /** + * Returns the first instance of an exception associated with a configuration error (if it exists). + * Otherwise, the original exception is returned. 
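These helpers are typically applied once, at the top level of a connector run, to decide whether a failure should be reported as a configuration problem or as a system error. The sketch below is illustrative; runSync() is a placeholder, and the trace emission mirrors the AirbyteTraceMessageUtility calls exercised by the tests later in this change.

try {
  runSync(); // placeholder for the connector's actual entry point
} catch (final Exception e) {
  final Throwable rootError = ConnectorExceptionUtil.getRootConfigError(e);
  final String displayMessage = ConnectorExceptionUtil.getDisplayMessage(rootError);
  if (ConnectorExceptionUtil.isConfigError(rootError)) {
    AirbyteTraceMessageUtility.emitConfigErrorTrace(rootError, displayMessage); // user-fixable problem
  } else {
    AirbyteTraceMessageUtility.emitSystemErrorTrace(rootError, displayMessage); // likely a connector bug
  }
  throw e;
}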
+ */ + public static Throwable getRootConfigError(final Exception e) { + Throwable current = e; + while (current != null) { + if (ConnectorExceptionUtil.isConfigError(current)) { + return current; + } else { + current = current.getCause(); + } + } + return e; + } + + private static Predicate getConfigErrorPredicate() { + return e -> e instanceof ConfigErrorException; + } + + private static Predicate getConnectionErrorPredicate() { + return e -> e instanceof ConnectionErrorException; + } + + private static Predicate isRecoveryConnectionExceptionPredicate() { + return e -> e instanceof SQLException && e.getMessage() + .toLowerCase(Locale.ROOT) + .contains("due to conflict with recovery"); + } + + private static Predicate isUnknownColumnInFieldListException() { + return e -> e instanceof SQLSyntaxErrorException + && e.getMessage() + .toLowerCase(Locale.ROOT) + .contains("unknown column") + && e.getMessage() + .toLowerCase(Locale.ROOT) + .contains("in 'field list'"); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/HostPortResolver.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/HostPortResolver.java new file mode 100644 index 0000000000000..89eaa857a916e --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/HostPortResolver.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.util; + +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.Objects; +import org.testcontainers.containers.GenericContainer; + +public class HostPortResolver { + + public static String resolveHost(GenericContainer container) { + return System.getProperty("os.name").toLowerCase().startsWith("mac") + ? getIpAddress(container) + : container.getHost(); + } + + public static int resolvePort(GenericContainer container) { + return System.getProperty("os.name").toLowerCase().startsWith("mac") ? (Integer) container.getExposedPorts().get(0) + : container.getFirstMappedPort(); + } + + public static String resolveIpAddress(GenericContainer container) { + return getIpAddress(container); + } + + public static String encodeValue(final String value) { + if (value != null) { + return URLEncoder.encode(value, StandardCharsets.UTF_8); + } + return null; + } + + private static String getIpAddress(GenericContainer container) { + return Objects.requireNonNull(container.getContainerInfo() + .getNetworkSettings() + .getNetworks() + .entrySet().stream() + .findFirst() + .get().getValue().getIpAddress()); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/PostgresSslConnectionUtils.java b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/PostgresSslConnectionUtils.java new file mode 100644 index 0000000000000..d5ab7f8c7a70f --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/java/io/airbyte/integrations/util/PostgresSslConnectionUtils.java @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
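HostPortResolver above is mainly aimed at acceptance tests that reach testcontainers from macOS, where the container IP is used instead of the mapped host port. A hedged test-side sketch (the PostgreSQLContainer image and the JDBC URL shape are illustrative, not mandated by this change):

final PostgreSQLContainer<?> db = new PostgreSQLContainer<>("postgres:13-alpine");
db.start();
final String jdbcUrl = String.format("jdbc:postgresql://%s:%d/%s",
    HostPortResolver.resolveHost(db),   // container IP on macOS, docker host otherwise
    HostPortResolver.resolvePort(db),   // exposed port on macOS, mapped port otherwise
    db.getDatabaseName());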
+ */ + +package io.airbyte.integrations.util; + +import com.fasterxml.jackson.databind.JsonNode; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.RandomStringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class PostgresSslConnectionUtils { + + private static final Logger LOGGER = LoggerFactory.getLogger(PostgresSslConnectionUtils.class); + private static final String CA_CERTIFICATE = "ca.crt"; + private static final String CLIENT_CERTIFICATE = "client.crt"; + private static final String CLIENT_KEY = "client.key"; + private static final String CLIENT_ENCRYPTED_KEY = "client.pk8"; + + public static final String PARAM_MODE = "mode"; + public static final String PARAM_SSL = "ssl"; + public static final String PARAM_SSL_MODE = "ssl_mode"; + public static final String PARAM_SSLMODE = "sslmode"; + public static final String PARAM_CLIENT_KEY_PASSWORD = "client_key_password"; + public static final String PARAM_CA_CERTIFICATE = "ca_certificate"; + public static final String PARAM_CLIENT_CERTIFICATE = "client_certificate"; + public static final String PARAM_CLIENT_KEY = "client_key"; + + public static final String VERIFY_CA = "verify-ca"; + public static final String VERIFY_FULL = "verify-full"; + public static final String DISABLE = "disable"; + public static final String TRUE_STRING_VALUE = "true"; + public static final String ENCRYPT_FILE_NAME = "encrypt"; + public static final String FACTORY_VALUE = "org.postgresql.ssl.DefaultJavaSSLFactory"; + + public static Map obtainConnectionOptions(final JsonNode encryption) { + final Map additionalParameters = new HashMap<>(); + if (!encryption.isNull()) { + final var method = encryption.get(PARAM_MODE).asText(); + var keyStorePassword = checkOrCreatePassword(encryption); + switch (method) { + case VERIFY_CA -> { + additionalParameters.putAll(obtainConnectionCaOptions(encryption, method, keyStorePassword)); + } + case VERIFY_FULL -> { + additionalParameters.putAll(obtainConnectionFullOptions(encryption, method, keyStorePassword)); + } + default -> { + additionalParameters.put(PARAM_SSL, TRUE_STRING_VALUE); + additionalParameters.put(PARAM_SSLMODE, method); + } + } + } + return additionalParameters; + } + + private static String checkOrCreatePassword(final JsonNode encryption) { + String sslPassword = encryption.has(PARAM_CLIENT_KEY_PASSWORD) ? 
encryption.get(PARAM_CLIENT_KEY_PASSWORD).asText() : ""; + var keyStorePassword = RandomStringUtils.randomAlphanumeric(10); + if (sslPassword.isEmpty()) { + var file = new File(ENCRYPT_FILE_NAME); + if (file.exists()) { + keyStorePassword = readFile(file); + } else { + try { + createCertificateFile(ENCRYPT_FILE_NAME, keyStorePassword); + } catch (final IOException e) { + throw new RuntimeException("Failed to create encryption file "); + } + } + } else { + keyStorePassword = sslPassword; + } + return keyStorePassword; + } + + private static String readFile(final File file) { + try { + BufferedReader reader = new BufferedReader(new FileReader(file, StandardCharsets.UTF_8)); + String currentLine = reader.readLine(); + reader.close(); + return currentLine; + } catch (final IOException e) { + throw new RuntimeException("Failed to read file with encryption"); + } + } + + private static Map obtainConnectionFullOptions(final JsonNode encryption, + final String method, + final String clientKeyPassword) { + final Map additionalParameters = new HashMap<>(); + try { + convertAndImportFullCertificate(encryption.get(PARAM_CA_CERTIFICATE).asText(), + encryption.get(PARAM_CLIENT_CERTIFICATE).asText(), encryption.get(PARAM_CLIENT_KEY).asText(), clientKeyPassword); + } catch (final IOException | InterruptedException e) { + throw new RuntimeException("Failed to import certificate into Java Keystore"); + } + additionalParameters.put("ssl", TRUE_STRING_VALUE); + additionalParameters.put("sslmode", method); + additionalParameters.put("sslrootcert", CA_CERTIFICATE); + additionalParameters.put("sslcert", CLIENT_CERTIFICATE); + additionalParameters.put("sslkey", CLIENT_ENCRYPTED_KEY); + additionalParameters.put("sslfactory", FACTORY_VALUE); + return additionalParameters; + } + + private static Map obtainConnectionCaOptions(final JsonNode encryption, + final String method, + final String clientKeyPassword) { + final Map additionalParameters = new HashMap<>(); + try { + convertAndImportCaCertificate(encryption.get(PARAM_CA_CERTIFICATE).asText(), clientKeyPassword); + } catch (final IOException | InterruptedException e) { + throw new RuntimeException("Failed to import certificate into Java Keystore"); + } + additionalParameters.put("ssl", TRUE_STRING_VALUE); + additionalParameters.put("sslmode", method); + additionalParameters.put("sslrootcert", CA_CERTIFICATE); + additionalParameters.put("sslfactory", FACTORY_VALUE); + return additionalParameters; + } + + private static void convertAndImportFullCertificate(final String caCertificate, + final String clientCertificate, + final String clientKey, + final String clientKeyPassword) + throws IOException, InterruptedException { + final Runtime run = Runtime.getRuntime(); + createCaCertificate(caCertificate, clientKeyPassword, run); + createCertificateFile(CLIENT_CERTIFICATE, clientCertificate); + createCertificateFile(CLIENT_KEY, clientKey); + // add client certificate to the custom keystore + runProcess("keytool -alias client-certificate -keystore customkeystore" + + " -import -file " + CLIENT_CERTIFICATE + " -storepass " + clientKeyPassword + " -noprompt", run); + // convert client.key to client.pk8 based on the documentation + runProcess("openssl pkcs8 -topk8 -inform PEM -in " + CLIENT_KEY + " -outform DER -out " + + CLIENT_ENCRYPTED_KEY + " -nocrypt", run); + runProcess("rm " + CLIENT_KEY, run); + + updateTrustStoreSystemProperty(clientKeyPassword); + } + + private static void convertAndImportCaCertificate(final String caCertificate, + final String clientKeyPassword) + 
throws IOException, InterruptedException { + final Runtime run = Runtime.getRuntime(); + createCaCertificate(caCertificate, clientKeyPassword, run); + updateTrustStoreSystemProperty(clientKeyPassword); + } + + private static void createCaCertificate(final String caCertificate, + final String clientKeyPassword, + final Runtime run) + throws IOException, InterruptedException { + createCertificateFile(CA_CERTIFICATE, caCertificate); + // add CA certificate to the custom keystore + runProcess("keytool -import -alias rds-root -keystore customkeystore" + + " -file " + CA_CERTIFICATE + " -storepass " + clientKeyPassword + " -noprompt", run); + } + + private static void updateTrustStoreSystemProperty(final String clientKeyPassword) { + String result = System.getProperty("user.dir") + "/customkeystore"; + System.setProperty("javax.net.ssl.trustStore", result); + System.setProperty("javax.net.ssl.trustStorePassword", clientKeyPassword); + } + + private static void createCertificateFile(String fileName, String fileValue) throws IOException { + try (final PrintWriter out = new PrintWriter(fileName, StandardCharsets.UTF_8)) { + out.print(fileValue); + } + } + + private static void runProcess(final String cmd, final Runtime run) throws IOException, InterruptedException { + final Process pr = run.exec(cmd); + if (!pr.waitFor(30, TimeUnit.SECONDS)) { + pr.destroy(); + throw new RuntimeException("Timeout while executing: " + cmd); + } + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/resources/AirbyteLogMessageTemplate.json b/airbyte-integrations/bases/base-java-async/src/main/resources/AirbyteLogMessageTemplate.json new file mode 100644 index 0000000000000..ea1e0f9f7b402 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/resources/AirbyteLogMessageTemplate.json @@ -0,0 +1,21 @@ +{ + "type": "LOG", + "log": { + "level": { + "$resolver": "level", + "field": "name" + }, + "message": { + "$resolver": "pattern", + "pattern": "%level %C{1.}(%M):%L %m", + "stringified": true + }, + "stack_trace": { + "$resolver": "exception", + "field": "stackTrace", + "stackTrace": { + "stringified": true + } + } + } +} diff --git a/airbyte-integrations/bases/base-java-async/src/main/resources/bastion/Dockerfile b/airbyte-integrations/bases/base-java-async/src/main/resources/bastion/Dockerfile new file mode 100644 index 0000000000000..e50bfde6aed9d --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/resources/bastion/Dockerfile @@ -0,0 +1,25 @@ +FROM ubuntu:18.04 + +RUN apt-get update && apt-get install -y openssh-server +RUN apt-get install -y apt-utils +RUN mkdir /var/run/sshd +RUN sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config +RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config + +RUN useradd -m -s /bin/bash sshuser +RUN echo "sshuser:secret" | chpasswd + +RUN mkdir /var/bastion +RUN ssh-keygen -m PEM -t rsa -b 4096 -C "test-container-bastion" -P "" -f /var/bastion/id_rsa -q +RUN install -D /var/bastion/id_rsa.pub /home/sshuser/.ssh/authorized_keys + +RUN chown -R sshuser:sshuser /home/sshuser/.ssh +RUN chmod 600 /home/sshuser/.ssh/authorized_keys + +RUN mkdir /root/.ssh + +RUN apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +EXPOSE 22 + +CMD ["/usr/sbin/sshd", "-D"] diff --git a/airbyte-integrations/bases/base-java-async/src/main/resources/log4j2.xml b/airbyte-integrations/bases/base-java-async/src/main/resources/log4j2.xml new file mode 100644 index 0000000000000..81e76194de838 --- 
/dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/resources/log4j2.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/airbyte-integrations/bases/base-java-async/src/main/resources/ssh-tunnel-spec.json b/airbyte-integrations/bases/base-java-async/src/main/resources/ssh-tunnel-spec.json new file mode 100644 index 0000000000000..4597f533a0341 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/main/resources/ssh-tunnel-spec.json @@ -0,0 +1,114 @@ +{ + "type": "object", + "title": "SSH Tunnel Method", + "description": "Whether to initiate an SSH tunnel before connecting to the database, and if so, which kind of authentication to use.", + "oneOf": [ + { + "title": "No Tunnel", + "required": ["tunnel_method"], + "properties": { + "tunnel_method": { + "description": "No ssh tunnel needed to connect to database", + "type": "string", + "const": "NO_TUNNEL", + "order": 0 + } + } + }, + { + "title": "SSH Key Authentication", + "required": [ + "tunnel_method", + "tunnel_host", + "tunnel_port", + "tunnel_user", + "ssh_key" + ], + "properties": { + "tunnel_method": { + "description": "Connect through a jump server tunnel host using username and ssh key", + "type": "string", + "const": "SSH_KEY_AUTH", + "order": 0 + }, + "tunnel_host": { + "title": "SSH Tunnel Jump Server Host", + "description": "Hostname of the jump server host that allows inbound ssh tunnel.", + "type": "string", + "order": 1 + }, + "tunnel_port": { + "title": "SSH Connection Port", + "description": "Port on the proxy/jump server that accepts inbound ssh connections.", + "type": "integer", + "minimum": 0, + "maximum": 65536, + "default": 22, + "examples": ["22"], + "order": 2 + }, + "tunnel_user": { + "title": "SSH Login Username", + "description": "OS-level username for logging into the jump server host.", + "type": "string", + "order": 3 + }, + "ssh_key": { + "title": "SSH Private Key", + "description": "OS-level user account ssh key credentials in RSA PEM format ( created with ssh-keygen -t rsa -m PEM -f myuser_rsa )", + "type": "string", + "airbyte_secret": true, + "multiline": true, + "order": 4 + } + } + }, + { + "title": "Password Authentication", + "required": [ + "tunnel_method", + "tunnel_host", + "tunnel_port", + "tunnel_user", + "tunnel_user_password" + ], + "properties": { + "tunnel_method": { + "description": "Connect through a jump server tunnel host using username and password authentication", + "type": "string", + "const": "SSH_PASSWORD_AUTH", + "order": 0 + }, + "tunnel_host": { + "title": "SSH Tunnel Jump Server Host", + "description": "Hostname of the jump server host that allows inbound ssh tunnel.", + "type": "string", + "order": 1 + }, + "tunnel_port": { + "title": "SSH Connection Port", + "description": "Port on the proxy/jump server that accepts inbound ssh connections.", + "type": "integer", + "minimum": 0, + "maximum": 65536, + "default": 22, + "examples": ["22"], + "order": 2 + }, + "tunnel_user": { + "title": "SSH Login Username", + "description": "OS-level username for logging into the jump server host", + "type": "string", + "order": 3 + }, + "tunnel_user_password": { + "title": "Password", + "description": "OS-level password for logging into the jump server host", + "type": "string", + "airbyte_secret": true, + "order": 4 + } + } + } + ] +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/AirbyteExceptionHandlerTest.java 
b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/AirbyteExceptionHandlerTest.java new file mode 100644 index 0000000000000..8729bca1f8d98 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/AirbyteExceptionHandlerTest.java @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.spy; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.json.Jsons; +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import lombok.SneakyThrows; +import org.junit.After; +import org.junit.Before; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; +import org.slf4j.LoggerFactory; + +public class AirbyteExceptionHandlerTest { + + PrintStream originalOut = System.out; + private volatile ByteArrayOutputStream outContent = new ByteArrayOutputStream(); + + @Before + public void setUpOut() { + System.setOut(new PrintStream(outContent, true, StandardCharsets.UTF_8)); + } + + @Test + void testTraceMessageEmission() throws Exception { + // mocking terminate() method in AirbyteExceptionHandler, so we don't kill the JVM + AirbyteExceptionHandler airbyteExceptionHandler = spy(new AirbyteExceptionHandler()); + doNothing().when(airbyteExceptionHandler).terminate(); + + // have to spawn a new thread to test the uncaught exception handling, + // because junit catches any exceptions in main thread, i.e. they're not 'uncaught' + Thread thread = new Thread() { + + @SneakyThrows + public void run() { + setUpOut(); + final IntegrationRunner runner = Mockito.mock(IntegrationRunner.class); + doThrow(new RuntimeException("error")).when(runner).run(new String[] {"write"}); + runner.run(new String[] {"write"}); + } + + }; + thread.setUncaughtExceptionHandler(airbyteExceptionHandler); + thread.start(); + thread.join(); + System.out.flush(); + revertOut(); + + // now we turn the std out from the thread into json and check it's the expected TRACE message + JsonNode traceMsgJson = Jsons.deserialize(outContent.toString(StandardCharsets.UTF_8)); + LoggerFactory.getLogger(AirbyteExceptionHandlerTest.class).debug(traceMsgJson.toString()); + Assertions.assertEquals("TRACE", traceMsgJson.get("type").asText()); + Assertions.assertEquals("ERROR", traceMsgJson.get("trace").get("type").asText()); + Assertions.assertEquals(AirbyteExceptionHandler.logMessage, traceMsgJson.get("trace").get("error").get("message").asText()); + Assertions.assertEquals("system_error", traceMsgJson.get("trace").get("error").get("failure_type").asText()); + } + + @After + public void revertOut() { + System.setOut(originalOut); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/AirbyteLogMessageTemplateTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/AirbyteLogMessageTemplateTest.java new file mode 100644 index 0000000000000..6862221f3d8ed --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/AirbyteLogMessageTemplateTest.java @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.json.Jsons; +import io.airbyte.protocol.models.AirbyteLogMessage; +import io.airbyte.protocol.models.AirbyteMessage; +import io.airbyte.protocol.models.AirbyteMessage.Type; +import java.io.ByteArrayOutputStream; +import java.nio.charset.StandardCharsets; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.LoggerContext; +import org.apache.logging.log4j.core.appender.OutputStreamAppender; +import org.apache.logging.log4j.core.config.Configuration; +import org.apache.logging.log4j.core.config.LoggerConfig; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.platform.commons.util.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class AirbyteLogMessageTemplateTest { + + private static final ByteArrayOutputStream outputContent = new ByteArrayOutputStream(); + private static final Logger LOGGER = LoggerFactory.getLogger(AirbyteLogMessageTemplateTest.class); + public static final String OUTPUT_STREAM_APPENDER = "OutputStreamAppender"; + public static final String CONSOLE_JSON_APPENDER = "ConsoleJSONAppender"; + private static OutputStreamAppender outputStreamAppender; + private static LoggerConfig rootLoggerConfig; + + @BeforeAll + static void init() { + // We are creating a log appender with the same output pattern + // as the console json appender defined in this project's log4j2.xml file. + // We then attach this log appender with the LOGGER instance so that we can validate the logs + // produced by code and assert that it matches the expected format. + final LoggerContext loggerContext = (LoggerContext) LogManager.getContext(false); + final Configuration configuration = loggerContext.getConfiguration(); + rootLoggerConfig = configuration.getLoggerConfig(""); + + outputStreamAppender = OutputStreamAppender.createAppender( + rootLoggerConfig.getAppenders().get(CONSOLE_JSON_APPENDER).getLayout(), + null, outputContent, OUTPUT_STREAM_APPENDER, false, true); + outputStreamAppender.start(); + + rootLoggerConfig.addAppender(outputStreamAppender, Level.ALL, null); + } + + @BeforeEach + void setup() { + outputContent.reset(); + } + + @AfterAll + static void cleanUp() { + outputStreamAppender.stop(); + rootLoggerConfig.removeAppender(OUTPUT_STREAM_APPENDER); + } + + @Test + public void testAirbyteLogMessageFormat() throws java.io.IOException { + LOGGER.info("hello"); + + outputContent.flush(); + final String logMessage = outputContent.toString(StandardCharsets.UTF_8); + final AirbyteMessage airbyteMessage = validateLogIsAirbyteMessage(logMessage); + final AirbyteLogMessage airbyteLogMessage = validateAirbyteMessageIsLog(airbyteMessage); + + final String connectorLogMessage = airbyteLogMessage.getMessage(); + // validate that the message inside AirbyteLogMessage matches the pattern. 
+ // pattern to check for is: LOG_LEVEL className(methodName):LineNumber logMessage + final String connectorLogMessageRegex = + "^INFO [\\w+.]*.AirbyteLogMessageTemplateTest\\(testAirbyteLogMessageFormat\\):\\d+ hello$"; + final Pattern pattern = Pattern.compile(connectorLogMessageRegex); + + final Matcher matcher = pattern.matcher(connectorLogMessage); + assertTrue(matcher.matches(), connectorLogMessage); + } + + private AirbyteMessage validateLogIsAirbyteMessage(final String logMessage) { + final Optional jsonLine = Jsons.tryDeserialize(logMessage); + assertFalse(jsonLine.isEmpty()); + + final Optional m = Jsons.tryObject(jsonLine.get(), AirbyteMessage.class); + assertFalse(m.isEmpty()); + return m.get(); + } + + private AirbyteLogMessage validateAirbyteMessageIsLog(final AirbyteMessage airbyteMessage) { + assertEquals(Type.LOG, airbyteMessage.getType()); + assertNotNull(airbyteMessage.getLog()); + assertFalse(StringUtils.isBlank(airbyteMessage.getLog().getMessage())); + return airbyteMessage.getLog(); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/AirbyteTraceMessageUtilityTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/AirbyteTraceMessageUtilityTest.java new file mode 100644 index 0000000000000..c5f7db19131a4 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/AirbyteTraceMessageUtilityTest.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.json.Jsons; +import io.airbyte.protocol.models.v0.AirbyteErrorTraceMessage.FailureType; +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +public class AirbyteTraceMessageUtilityTest { + + PrintStream originalOut = System.out; + private final ByteArrayOutputStream outContent = new ByteArrayOutputStream(); + + @BeforeEach + public void setUpOut() { + System.setOut(new PrintStream(outContent, true, StandardCharsets.UTF_8)); + } + + private void assertJsonNodeIsTraceMessage(JsonNode jsonNode) { + // todo: this check could be better by actually trying to convert the JsonNode to an + // AirbyteTraceMessage instance + Assertions.assertEquals("TRACE", jsonNode.get("type").asText()); + Assertions.assertNotNull(jsonNode.get("trace")); + } + + @Test + void testEmitSystemErrorTrace() { + AirbyteTraceMessageUtility.emitSystemErrorTrace(Mockito.mock(RuntimeException.class), "this is a system error"); + JsonNode outJson = Jsons.deserialize(outContent.toString(StandardCharsets.UTF_8)); + assertJsonNodeIsTraceMessage(outJson); + Assertions.assertEquals("system_error", outJson.get("trace").get("error").get("failure_type").asText()); + } + + @Test + void testEmitConfigErrorTrace() { + AirbyteTraceMessageUtility.emitConfigErrorTrace(Mockito.mock(RuntimeException.class), "this is a config error"); + JsonNode outJson = Jsons.deserialize(outContent.toString(StandardCharsets.UTF_8)); + assertJsonNodeIsTraceMessage(outJson); + Assertions.assertEquals("config_error", outJson.get("trace").get("error").get("failure_type").asText()); + } + + @Test + void testEmitErrorTrace() { + 
AirbyteTraceMessageUtility.emitErrorTrace(Mockito.mock(RuntimeException.class), "this is an error", FailureType.SYSTEM_ERROR); + assertJsonNodeIsTraceMessage(Jsons.deserialize(outContent.toString(StandardCharsets.UTF_8))); + } + + @Test + void testCorrectStacktraceFormat() { + try { + int x = 1 / 0; + } catch (Exception e) { + AirbyteTraceMessageUtility.emitSystemErrorTrace(e, "you exploded the universe"); + } + JsonNode outJson = Jsons.deserialize(outContent.toString(StandardCharsets.UTF_8)); + Assertions.assertTrue(outJson.get("trace").get("error").get("stack_trace").asText().contains("\n\tat")); + } + + @AfterEach + public void revertOut() { + System.setOut(originalOut); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/FailureTrackingAirbyteMessageConsumerTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/FailureTrackingAirbyteMessageConsumerTest.java new file mode 100644 index 0000000000000..dba9eb0483a41 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/FailureTrackingAirbyteMessageConsumerTest.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import org.junit.jupiter.api.Test; + +class FailureTrackingAirbyteMessageConsumerTest { + + @Test + void testStartNoFailure() throws Exception { + final TestConsumer consumer = spy(new TestConsumer()); + consumer.start(); + consumer.close(); + + verify(consumer).close(false); + } + + @Test + void testStartWithFailure() throws Exception { + final TestConsumer consumer = spy(new TestConsumer()); + doThrow(new RuntimeException()).when(consumer).startTracked(); + + // verify the exception still gets thrown. + assertThrows(RuntimeException.class, consumer::start); + consumer.close(); + + verify(consumer).close(true); + } + + @Test + void testAcceptNoFailure() throws Exception { + final TestConsumer consumer = spy(new TestConsumer()); + + final AirbyteMessage msg = mock(AirbyteMessage.class); + consumer.accept(msg); + consumer.close(); + + verify(consumer).close(false); + } + + @Test + void testAcceptWithFailure() throws Exception { + final TestConsumer consumer = spy(new TestConsumer()); + final AirbyteMessage msg = mock(AirbyteMessage.class); + when(msg.getType()).thenReturn(Type.RECORD); + doThrow(new RuntimeException()).when(consumer).acceptTracked(any()); + + // verify the exception still gets thrown. 
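+ // the consumer should also remember that a failure occurred, so close(true) is expected below.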
+ assertThrows(RuntimeException.class, () -> consumer.accept(msg)); + consumer.close(); + + verify(consumer).close(true); + } + + static class TestConsumer extends FailureTrackingAirbyteMessageConsumer { + + @Override + protected void startTracked() { + + } + + @Override + protected void acceptTracked(final AirbyteMessage s) { + + } + + @Override + protected void close(final boolean hasFailed) { + + } + + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/IntegrationCliParserTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/IntegrationCliParserTest.java new file mode 100644 index 0000000000000..384e13347fdeb --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/IntegrationCliParserTest.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.nio.file.Path; +import org.junit.jupiter.api.Test; + +class IntegrationCliParserTest { + + private static final String CONFIG_FILENAME = "config.json"; + private static final String CATALOG_FILENAME = "catalog.json"; + private static final String STATE_FILENAME = "state.json"; + + @Test + void testSpec() { + final String[] args = new String[] {"--spec"}; + final IntegrationConfig actual = new IntegrationCliParser().parse(args); + assertEquals(IntegrationConfig.spec(), actual); + } + + @Test + void testCheck() { + final String[] args = new String[] {"--check", "--config", CONFIG_FILENAME}; + final IntegrationConfig actual = new IntegrationCliParser().parse(args); + assertEquals(IntegrationConfig.check(Path.of(CONFIG_FILENAME)), actual); + } + + @Test + void testDiscover() { + final String[] args = new String[] {"--discover", "--config", CONFIG_FILENAME}; + final IntegrationConfig actual = new IntegrationCliParser().parse(args); + assertEquals(IntegrationConfig.discover(Path.of(CONFIG_FILENAME)), actual); + } + + @Test + void testWrite() { + final String[] args = new String[] {"--write", "--config", CONFIG_FILENAME, "--catalog", CATALOG_FILENAME}; + final IntegrationConfig actual = new IntegrationCliParser().parse(args); + assertEquals(IntegrationConfig.write(Path.of(CONFIG_FILENAME), Path.of(CATALOG_FILENAME)), actual); + } + + @Test + void testReadWithoutState() { + final String[] args = new String[] {"--read", "--config", CONFIG_FILENAME, "--catalog", CATALOG_FILENAME}; + final IntegrationConfig actual = new IntegrationCliParser().parse(args); + assertEquals(IntegrationConfig.read(Path.of(CONFIG_FILENAME), Path.of(CATALOG_FILENAME), null), actual); + } + + @Test + void testReadWithState() { + final String[] args = new String[] {"--read", "--config", CONFIG_FILENAME, "--catalog", CATALOG_FILENAME, "--state", STATE_FILENAME}; + final IntegrationConfig actual = new IntegrationCliParser().parse(args); + assertEquals(IntegrationConfig.read(Path.of(CONFIG_FILENAME), Path.of(CATALOG_FILENAME), Path.of(STATE_FILENAME)), actual); + } + + @Test + void testFailsOnUnknownArg() { + final String[] args = new String[] {"--check", "--config", CONFIG_FILENAME, "--random", "garbage"}; + assertThrows(IllegalArgumentException.class, () -> new IntegrationCliParser().parse(args)); + } + +} diff --git 
a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/IntegrationConfigTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/IntegrationConfigTest.java new file mode 100644 index 0000000000000..926fca719906b --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/IntegrationConfigTest.java @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.nio.file.Path; +import java.util.Optional; +import org.junit.jupiter.api.Test; + +class IntegrationConfigTest { + + private static final Path CONFIG_PATH = Path.of("config.json"); + private static final Path CATALOG_PATH = Path.of("catalog.json"); + private static final Path STATE_PATH = Path.of("state.json"); + + @Test + void testSpec() { + final IntegrationConfig config = IntegrationConfig.spec(); + assertEquals(Command.SPEC, config.getCommand()); + assertThrows(IllegalStateException.class, config::getConfigPath); + assertThrows(IllegalStateException.class, config::getCatalogPath); + assertThrows(IllegalStateException.class, config::getStatePath); + } + + @Test + void testCheck() { + assertThrows(NullPointerException.class, () -> IntegrationConfig.check(null)); + + final IntegrationConfig config = IntegrationConfig.check(CONFIG_PATH); + assertEquals(Command.CHECK, config.getCommand()); + assertEquals(CONFIG_PATH, config.getConfigPath()); + assertThrows(IllegalStateException.class, config::getCatalogPath); + assertThrows(IllegalStateException.class, config::getStatePath); + } + + @Test + void testDiscover() { + assertThrows(NullPointerException.class, () -> IntegrationConfig.discover(null)); + + final IntegrationConfig config = IntegrationConfig.discover(CONFIG_PATH); + assertEquals(Command.DISCOVER, config.getCommand()); + assertEquals(CONFIG_PATH, config.getConfigPath()); + assertThrows(IllegalStateException.class, config::getCatalogPath); + assertThrows(IllegalStateException.class, config::getStatePath); + } + + @Test + void testWrite() { + assertThrows(NullPointerException.class, () -> IntegrationConfig.write(null, CATALOG_PATH)); + assertThrows(NullPointerException.class, () -> IntegrationConfig.write(CONFIG_PATH, null)); + + final IntegrationConfig config = IntegrationConfig.write(CONFIG_PATH, CATALOG_PATH); + assertEquals(Command.WRITE, config.getCommand()); + assertEquals(CONFIG_PATH, config.getConfigPath()); + assertEquals(CATALOG_PATH, config.getCatalogPath()); + assertThrows(IllegalStateException.class, config::getStatePath); + } + + @Test + void testReadWithState() { + assertThrows(NullPointerException.class, () -> IntegrationConfig.read(null, CATALOG_PATH, STATE_PATH)); + assertThrows(NullPointerException.class, () -> IntegrationConfig.read(CONFIG_PATH, null, STATE_PATH)); + + final IntegrationConfig config = IntegrationConfig.read(CONFIG_PATH, CATALOG_PATH, STATE_PATH); + assertEquals(Command.READ, config.getCommand()); + assertEquals(CONFIG_PATH, config.getConfigPath()); + assertEquals(CATALOG_PATH, config.getCatalogPath()); + assertEquals(Optional.of(STATE_PATH), config.getStatePath()); + } + + @Test + void testReadWithoutState() { + assertThrows(NullPointerException.class, () -> IntegrationConfig.read(null, CATALOG_PATH, null)); + assertThrows(NullPointerException.class, () -> 
IntegrationConfig.read(CONFIG_PATH, null, null)); + + final IntegrationConfig config = IntegrationConfig.read(CONFIG_PATH, CATALOG_PATH, null); + assertEquals(Command.READ, config.getCommand()); + assertEquals(CONFIG_PATH, config.getConfigPath()); + assertEquals(CATALOG_PATH, config.getCatalogPath()); + assertEquals(Optional.empty(), config.getStatePath()); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/IntegrationRunnerTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/IntegrationRunnerTest.java new file mode 100644 index 0000000000000..866bf8e07aa17 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/IntegrationRunnerTest.java @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import static io.airbyte.integrations.util.ConnectorExceptionUtil.COMMON_EXCEPTION_MESSAGE_TEMPLATE; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; +import static org.assertj.core.api.AssertionsForClassTypes.catchThrowable; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.inOrder; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import io.airbyte.commons.exceptions.ConfigErrorException; +import io.airbyte.commons.io.IOs; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.util.AutoCloseableIterators; +import io.airbyte.commons.util.MoreIterators; +import io.airbyte.protocol.models.v0.AirbyteCatalog; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus.Status; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.AirbyteStateMessage; +import io.airbyte.protocol.models.v0.AirbyteStream; +import io.airbyte.protocol.models.v0.CatalogHelpers; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.v0.ConnectorSpecification; +import io.airbyte.validation.json.JsonSchemaValidator; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.net.URI; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import org.apache.commons.lang3.ThreadUtils; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.InOrder; +import org.mockito.Mockito; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class IntegrationRunnerTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(IntegrationRunnerTest.class); + + private static final String CONFIG_FILE_NAME = "config.json"; + private static final String CONFIGURED_CATALOG_FILE_NAME = "configured_catalog.json"; + private static final String STATE_FILE_NAME = "state.json"; + + private static final String[] ARGS = new String[] {"args"}; + + private static final String CONFIG_STRING = "{ \"username\": \"airbyte\" }"; + private static final JsonNode CONFIG = Jsons.deserialize(CONFIG_STRING); + private static final String STREAM_NAME = "users"; + private static final Long EMITTED_AT = Instant.now().toEpochMilli(); + private static final Path TEST_ROOT = Path.of("/tmp/airbyte_tests"); + + private static final AirbyteCatalog CATALOG = new AirbyteCatalog().withStreams(Lists.newArrayList(new AirbyteStream().withName(STREAM_NAME))); + private static final ConfiguredAirbyteCatalog CONFIGURED_CATALOG = CatalogHelpers.toDefaultConfiguredCatalog(CATALOG); + private static final JsonNode STATE = Jsons.jsonNode(ImmutableMap.of("checkpoint", "05/08/1945")); + + private IntegrationCliParser cliParser; + private Consumer stdoutConsumer; + private Destination destination; + private Source source; + private Path configPath; + private Path configuredCatalogPath; + private Path statePath; + + @SuppressWarnings("unchecked") + @BeforeEach + void setup() throws IOException { + cliParser = mock(IntegrationCliParser.class); + stdoutConsumer = Mockito.mock(Consumer.class); + destination = mock(Destination.class); + source = mock(Source.class); + final Path configDir = Files.createTempDirectory(Files.createDirectories(TEST_ROOT), "test"); + + configPath = IOs.writeFile(configDir, CONFIG_FILE_NAME, CONFIG_STRING); + configuredCatalogPath = IOs.writeFile(configDir, CONFIGURED_CATALOG_FILE_NAME, Jsons.serialize(CONFIGURED_CATALOG)); + statePath = IOs.writeFile(configDir, STATE_FILE_NAME, Jsons.serialize(STATE)); + + final String testName = Thread.currentThread().getName(); + ThreadUtils.getAllThreads() + .stream() + .filter(runningThread -> !runningThread.isDaemon()) + .forEach(runningThread -> runningThread.setName(testName)); + } + + @Test + void testSpecSource() throws Exception { + final IntegrationConfig intConfig = IntegrationConfig.spec(); + final ConnectorSpecification output = new ConnectorSpecification().withDocumentationUrl(new URI("https://docs.airbyte.io/")); + + when(cliParser.parse(ARGS)).thenReturn(intConfig); + when(source.spec()).thenReturn(output); + + new IntegrationRunner(cliParser, stdoutConsumer, null, source).run(ARGS); + + verify(source).spec(); + verify(stdoutConsumer).accept(new AirbyteMessage().withType(Type.SPEC).withSpec(output)); + } + + @Test + void testSpecDestination() throws Exception { + final IntegrationConfig intConfig = IntegrationConfig.spec(); + final ConnectorSpecification output = new ConnectorSpecification().withDocumentationUrl(new URI("https://docs.airbyte.io/")); + + when(cliParser.parse(ARGS)).thenReturn(intConfig); + when(destination.spec()).thenReturn(output); + + new IntegrationRunner(cliParser, stdoutConsumer, destination, null).run(ARGS); + + verify(destination).spec(); + verify(stdoutConsumer).accept(new AirbyteMessage().withType(Type.SPEC).withSpec(output)); + } + + @Test + void testCheckSource() throws Exception { + final IntegrationConfig intConfig = IntegrationConfig.check(configPath); + final AirbyteConnectionStatus output = new 
AirbyteConnectionStatus().withStatus(Status.FAILED).withMessage("it failed"); + + when(cliParser.parse(ARGS)).thenReturn(intConfig); + when(source.check(CONFIG)).thenReturn(output); + + final ConnectorSpecification expectedConnSpec = mock(ConnectorSpecification.class); + when(source.spec()).thenReturn(expectedConnSpec); + when(expectedConnSpec.getConnectionSpecification()).thenReturn(CONFIG); + final JsonSchemaValidator jsonSchemaValidator = mock(JsonSchemaValidator.class); + new IntegrationRunner(cliParser, stdoutConsumer, null, source, jsonSchemaValidator).run(ARGS); + + verify(source).check(CONFIG); + verify(stdoutConsumer).accept(new AirbyteMessage().withType(Type.CONNECTION_STATUS).withConnectionStatus(output)); + verify(jsonSchemaValidator).validate(any(), any()); + } + + @Test + void testCheckDestination() throws Exception { + final IntegrationConfig intConfig = IntegrationConfig.check(configPath); + final AirbyteConnectionStatus output = new AirbyteConnectionStatus().withStatus(Status.FAILED).withMessage("it failed"); + + when(cliParser.parse(ARGS)).thenReturn(intConfig); + when(destination.check(CONFIG)).thenReturn(output); + + final ConnectorSpecification expectedConnSpec = mock(ConnectorSpecification.class); + when(destination.spec()).thenReturn(expectedConnSpec); + when(expectedConnSpec.getConnectionSpecification()).thenReturn(CONFIG); + + final JsonSchemaValidator jsonSchemaValidator = mock(JsonSchemaValidator.class); + + new IntegrationRunner(cliParser, stdoutConsumer, destination, null, jsonSchemaValidator).run(ARGS); + + verify(destination).check(CONFIG); + verify(stdoutConsumer).accept(new AirbyteMessage().withType(Type.CONNECTION_STATUS).withConnectionStatus(output)); + verify(jsonSchemaValidator).validate(any(), any()); + } + + @Test + void testDiscover() throws Exception { + final IntegrationConfig intConfig = IntegrationConfig.discover(configPath); + final AirbyteCatalog output = new AirbyteCatalog() + .withStreams(Lists.newArrayList(new AirbyteStream().withName("oceans"))); + + when(cliParser.parse(ARGS)).thenReturn(intConfig); + when(source.discover(CONFIG)).thenReturn(output); + + final ConnectorSpecification expectedConnSpec = mock(ConnectorSpecification.class); + when(source.spec()).thenReturn(expectedConnSpec); + when(expectedConnSpec.getConnectionSpecification()).thenReturn(CONFIG); + + final JsonSchemaValidator jsonSchemaValidator = mock(JsonSchemaValidator.class); + new IntegrationRunner(cliParser, stdoutConsumer, null, source, jsonSchemaValidator).run(ARGS); + + verify(source).discover(CONFIG); + verify(stdoutConsumer).accept(new AirbyteMessage().withType(Type.CATALOG).withCatalog(output)); + verify(jsonSchemaValidator).validate(any(), any()); + } + + @Test + void testRead() throws Exception { + final IntegrationConfig intConfig = IntegrationConfig.read(configPath, configuredCatalogPath, + statePath); + final AirbyteMessage message1 = new AirbyteMessage().withType(Type.RECORD) + .withRecord(new AirbyteRecordMessage().withData(Jsons.jsonNode(ImmutableMap.of("names", "byron")))); + final AirbyteMessage message2 = new AirbyteMessage().withType(Type.RECORD).withRecord(new AirbyteRecordMessage() + .withData(Jsons.jsonNode(ImmutableMap.of("names", "reginald")))); + + when(cliParser.parse(ARGS)).thenReturn(intConfig); + when(source.read(CONFIG, CONFIGURED_CATALOG, STATE)) + .thenReturn(AutoCloseableIterators.fromIterator(MoreIterators.of(message1, message2))); + + final ConnectorSpecification expectedConnSpec = mock(ConnectorSpecification.class); + 
when(source.spec()).thenReturn(expectedConnSpec); + when(expectedConnSpec.getConnectionSpecification()).thenReturn(CONFIG); + + final JsonSchemaValidator jsonSchemaValidator = mock(JsonSchemaValidator.class); + new IntegrationRunner(cliParser, stdoutConsumer, null, source, jsonSchemaValidator).run(ARGS); + + verify(source).read(CONFIG, CONFIGURED_CATALOG, STATE); + verify(stdoutConsumer).accept(message1); + verify(stdoutConsumer).accept(message2); + verify(jsonSchemaValidator).validate(any(), any()); + } + + @Test + void testReadException() throws Exception { + final IntegrationConfig intConfig = IntegrationConfig.read(configPath, configuredCatalogPath, + statePath); + final ConfigErrorException configErrorException = new ConfigErrorException("Invalid configuration"); + + when(cliParser.parse(ARGS)).thenReturn(intConfig); + when(source.read(CONFIG, CONFIGURED_CATALOG, STATE)).thenThrow(configErrorException); + + final ConnectorSpecification expectedConnSpec = mock(ConnectorSpecification.class); + when(source.spec()).thenReturn(expectedConnSpec); + when(expectedConnSpec.getConnectionSpecification()).thenReturn(CONFIG); + + final JsonSchemaValidator jsonSchemaValidator = mock(JsonSchemaValidator.class); + final Throwable throwable = catchThrowable(() -> new IntegrationRunner(cliParser, stdoutConsumer, null, source, jsonSchemaValidator).run(ARGS)); + + assertThat(throwable).isInstanceOf(ConfigErrorException.class); + verify(source).read(CONFIG, CONFIGURED_CATALOG, STATE); + } + + @Test + void testCheckNestedException() throws Exception { + final IntegrationConfig intConfig = IntegrationConfig.check(configPath); + final AirbyteConnectionStatus output = new AirbyteConnectionStatus().withStatus(Status.FAILED).withMessage("Invalid configuration"); + final ConfigErrorException configErrorException = new ConfigErrorException("Invalid configuration"); + final RuntimeException runtimeException = new RuntimeException(new RuntimeException(configErrorException)); + + when(cliParser.parse(ARGS)).thenReturn(intConfig); + when(source.check(CONFIG)).thenThrow(runtimeException); + + final ConnectorSpecification expectedConnSpec = mock(ConnectorSpecification.class); + when(source.spec()).thenReturn(expectedConnSpec); + when(expectedConnSpec.getConnectionSpecification()).thenReturn(CONFIG); + final JsonSchemaValidator jsonSchemaValidator = mock(JsonSchemaValidator.class); + new IntegrationRunner(cliParser, stdoutConsumer, null, source, jsonSchemaValidator).run(ARGS); + + verify(source).check(CONFIG); + verify(stdoutConsumer).accept(new AirbyteMessage().withType(Type.CONNECTION_STATUS).withConnectionStatus(output)); + verify(jsonSchemaValidator).validate(any(), any()); + } + + @Test + void testCheckRuntimeException() throws Exception { + final IntegrationConfig intConfig = IntegrationConfig.check(configPath); + final AirbyteConnectionStatus output = + new AirbyteConnectionStatus().withStatus(Status.FAILED).withMessage(String.format(COMMON_EXCEPTION_MESSAGE_TEMPLATE, "Runtime Error")); + final RuntimeException runtimeException = new RuntimeException("Runtime Error"); + + when(cliParser.parse(ARGS)).thenReturn(intConfig); + when(source.check(CONFIG)).thenThrow(runtimeException); + + final ConnectorSpecification expectedConnSpec = mock(ConnectorSpecification.class); + when(source.spec()).thenReturn(expectedConnSpec); + when(expectedConnSpec.getConnectionSpecification()).thenReturn(CONFIG); + final JsonSchemaValidator jsonSchemaValidator = mock(JsonSchemaValidator.class); + new IntegrationRunner(cliParser, 
stdoutConsumer, null, source, jsonSchemaValidator).run(ARGS); + + verify(source).check(CONFIG); + verify(stdoutConsumer).accept(new AirbyteMessage().withType(Type.CONNECTION_STATUS).withConnectionStatus(output)); + verify(jsonSchemaValidator).validate(any(), any()); + } + + @Test + void testWrite() throws Exception { + final IntegrationConfig intConfig = IntegrationConfig.write(configPath, configuredCatalogPath); + final AirbyteMessageConsumer airbyteMessageConsumerMock = mock(AirbyteMessageConsumer.class); + when(cliParser.parse(ARGS)).thenReturn(intConfig); + when(destination.getConsumer(CONFIG, CONFIGURED_CATALOG, stdoutConsumer)).thenReturn(airbyteMessageConsumerMock); + + final ConnectorSpecification expectedConnSpec = mock(ConnectorSpecification.class); + when(destination.spec()).thenReturn(expectedConnSpec); + when(expectedConnSpec.getConnectionSpecification()).thenReturn(CONFIG); + + final JsonSchemaValidator jsonSchemaValidator = mock(JsonSchemaValidator.class); + + final IntegrationRunner runner = spy(new IntegrationRunner(cliParser, stdoutConsumer, destination, null, jsonSchemaValidator)); + runner.run(ARGS); + + verify(destination).getConsumer(CONFIG, CONFIGURED_CATALOG, stdoutConsumer); + verify(jsonSchemaValidator).validate(any(), any()); + } + + @Test + void testDestinationConsumerLifecycleSuccess() throws Exception { + final AirbyteMessage message1 = new AirbyteMessage() + .withType(AirbyteMessage.Type.RECORD) + .withRecord(new AirbyteRecordMessage() + .withData(Jsons.deserialize("{ \"color\": \"blue\" }")) + .withStream(STREAM_NAME) + .withEmittedAt(EMITTED_AT)); + final AirbyteMessage message2 = new AirbyteMessage() + .withType(AirbyteMessage.Type.RECORD) + .withRecord(new AirbyteRecordMessage() + .withData(Jsons.deserialize("{ \"color\": \"yellow\" }")) + .withStream(STREAM_NAME) + .withEmittedAt(EMITTED_AT)); + final AirbyteMessage stateMessage = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage() + .withData(Jsons.deserialize("{ \"checkpoint\": \"1\" }"))); + System.setIn(new ByteArrayInputStream((Jsons.serialize(message1) + "\n" + + Jsons.serialize(message2) + "\n" + + Jsons.serialize(stateMessage)).getBytes(StandardCharsets.UTF_8))); + + try (final AirbyteMessageConsumer airbyteMessageConsumerMock = mock(AirbyteMessageConsumer.class)) { + IntegrationRunner.consumeWriteStream(airbyteMessageConsumerMock); + final InOrder inOrder = inOrder(airbyteMessageConsumerMock); + inOrder.verify(airbyteMessageConsumerMock).accept(message1); + inOrder.verify(airbyteMessageConsumerMock).accept(message2); + inOrder.verify(airbyteMessageConsumerMock).accept(stateMessage); + } + } + + @Test + void testDestinationConsumerLifecycleFailure() throws Exception { + final AirbyteMessage message1 = new AirbyteMessage() + .withType(AirbyteMessage.Type.RECORD) + .withRecord(new AirbyteRecordMessage() + .withData(Jsons.deserialize("{ \"color\": \"blue\" }")) + .withStream(STREAM_NAME) + .withEmittedAt(EMITTED_AT)); + final AirbyteMessage message2 = new AirbyteMessage() + .withType(AirbyteMessage.Type.RECORD) + .withRecord(new AirbyteRecordMessage() + .withData(Jsons.deserialize("{ \"color\": \"yellow\" }")) + .withStream(STREAM_NAME) + .withEmittedAt(EMITTED_AT)); + System.setIn(new ByteArrayInputStream((Jsons.serialize(message1) + "\n" + Jsons.serialize(message2)).getBytes(StandardCharsets.UTF_8))); + + try (final AirbyteMessageConsumer airbyteMessageConsumerMock = mock(AirbyteMessageConsumer.class)) { + doThrow(new 
IOException("error")).when(airbyteMessageConsumerMock).accept(message1); + assertThrows(IOException.class, () -> IntegrationRunner.consumeWriteStream(airbyteMessageConsumerMock)); + final InOrder inOrder = inOrder(airbyteMessageConsumerMock); + inOrder.verify(airbyteMessageConsumerMock).accept(message1); + inOrder.verifyNoMoreInteractions(); + } + } + + @Test + void testInterruptOrphanThreadFailure() { + final String testName = Thread.currentThread().getName(); + final List caughtExceptions = new ArrayList<>(); + startSleepingThread(caughtExceptions, false); + assertThrows(IOException.class, () -> IntegrationRunner.watchForOrphanThreads( + () -> { + throw new IOException("random error"); + }, + Assertions::fail, + 3, TimeUnit.SECONDS, + 10, TimeUnit.SECONDS)); + try { + TimeUnit.SECONDS.sleep(15); + } catch (final Exception e) { + throw new RuntimeException(e); + } + final List runningThreads = ThreadUtils.getAllThreads().stream() + .filter(runningThread -> !runningThread.isDaemon() && !runningThread.getName().equals(testName)) + .collect(Collectors.toList()); + // all threads should be interrupted + assertEquals(List.of(), runningThreads); + assertEquals(1, caughtExceptions.size()); + } + + @Test + void testNoInterruptOrphanThreadFailure() { + final String testName = Thread.currentThread().getName(); + final List caughtExceptions = new ArrayList<>(); + final AtomicBoolean exitCalled = new AtomicBoolean(false); + startSleepingThread(caughtExceptions, true); + assertThrows(IOException.class, () -> IntegrationRunner.watchForOrphanThreads( + () -> { + throw new IOException("random error"); + }, + () -> exitCalled.set(true), + 3, TimeUnit.SECONDS, + 10, TimeUnit.SECONDS)); + try { + TimeUnit.SECONDS.sleep(15); + } catch (final Exception e) { + throw new RuntimeException(e); + } + final List runningThreads = ThreadUtils.getAllThreads().stream() + .filter(runningThread -> !runningThread.isDaemon() && !runningThread.getName().equals(testName)) + .collect(Collectors.toList()); + // a thread that refuses to be interrupted should remain + assertEquals(1, runningThreads.size()); + assertEquals(1, caughtExceptions.size()); + assertTrue(exitCalled.get()); + } + + private void startSleepingThread(final List caughtExceptions, final boolean ignoreInterrupt) { + final ExecutorService executorService = Executors.newFixedThreadPool(1); + executorService.submit(() -> { + for (int tries = 0; tries < 3; tries++) { + try { + TimeUnit.MINUTES.sleep(5); + } catch (final Exception e) { + LOGGER.info("Caught Exception", e); + caughtExceptions.add(e); + if (!ignoreInterrupt) { + executorService.shutdownNow(); + break; + } + } + } + }); + } + + @Test + void testParseConnectorImage() { + assertEquals("unknown", IntegrationRunner.parseConnectorVersion(null)); + assertEquals("unknown", IntegrationRunner.parseConnectorVersion("")); + assertEquals("1.0.1-alpha", IntegrationRunner.parseConnectorVersion("airbyte/destination-test:1.0.1-alpha")); + assertEquals("dev", IntegrationRunner.parseConnectorVersion("airbyte/destination-test:dev")); + assertEquals("1.0.1-alpha", IntegrationRunner.parseConnectorVersion("destination-test:1.0.1-alpha")); + assertEquals("1.0.1-alpha", IntegrationRunner.parseConnectorVersion(":1.0.1-alpha")); + } + + @Test + void testConsumptionOfInvalidStateMessage() { + final String invalidStateMessage = """ + { + "type" : "STATE", + "state" : { + "type": "NOT_RECOGNIZED", + "global": { + "streamStates": { + "foo" : "bar" + } + } + } + } + """; + + Assertions.assertThrows(IllegalStateException.class, () 
-> { + try (final AirbyteMessageConsumer consumer = mock(AirbyteMessageConsumer.class)) { + IntegrationRunner.consumeMessage(consumer, invalidStateMessage); + } + }); + } + + @Test + void testConsumptionOfInvalidNonStateMessage() { + final String invalidNonStateMessage = """ + { + "type" : "NOT_RECOGNIZED", + "record" : { + "namespace": "namespace", + "stream": "stream", + "emittedAt": 123456789 + } + } + """; + + Assertions.assertDoesNotThrow(() -> { + try (final AirbyteMessageConsumer consumer = mock(AirbyteMessageConsumer.class)) { + IntegrationRunner.consumeMessage(consumer, invalidNonStateMessage); + verify(consumer, times(0)).accept(any(AirbyteMessage.class)); + } + }); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/NameTransformerTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/NameTransformerTest.java new file mode 100644 index 0000000000000..10e7794713119 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/NameTransformerTest.java @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import io.airbyte.integrations.destination.NamingConventionTransformer; +import io.airbyte.integrations.destination.StandardNameTransformer; +import org.junit.jupiter.api.Test; + +class NameTransformerTest { + + @Test + void testStandardSQLNaming() { + final NamingConventionTransformer namingResolver = new StandardNameTransformer(); + assertEquals("identifier_name", namingResolver.getIdentifier("identifier_name")); + assertEquals("iDenTiFieR_name", namingResolver.getIdentifier("iDenTiFieR_name")); + assertEquals("__identifier_name", namingResolver.getIdentifier("__identifier_name")); + assertEquals("IDENTIFIER_NAME", namingResolver.getIdentifier("IDENTIFIER_NAME")); + assertEquals("123identifier_name", namingResolver.getIdentifier("123identifier_name")); + assertEquals("i0d0e0n0t0i0f0i0e0r0n0a0m0e", namingResolver.getIdentifier("i0d0e0n0t0i0f0i0e0r0n0a0m0e")); + assertEquals("_identifier_name", namingResolver.getIdentifier(",identifier+name")); + assertEquals("identifier_name", namingResolver.getIdentifier("identifiêr name")); + assertEquals("a_unicode_name__", namingResolver.getIdentifier("a_unicode_name_文")); + assertEquals("identifier__name__", namingResolver.getIdentifier("identifier__name__")); + assertEquals("identifier_name_weee", namingResolver.getIdentifier("identifier-name.weee")); + assertEquals("_identifier_name_", namingResolver.getIdentifier("\"identifier name\"")); + assertEquals("identifier_name", namingResolver.getIdentifier("identifier name")); + assertEquals("identifier_", namingResolver.getIdentifier("identifier%")); + assertEquals("_identifier_", namingResolver.getIdentifier("`identifier`")); + + assertEquals("_airbyte_raw_identifier_name", namingResolver.getRawTableName("identifier_name")); + } + + // Temporarily disabling the behavior of the StandardNameTransformer, see (issue #1785) + // @Test + void testExtendedSQLNaming() { + final NamingConventionTransformer namingResolver = new StandardNameTransformer(); + assertEquals("identifier_name", namingResolver.getIdentifier("identifier_name")); + assertEquals("iDenTiFieR_name", namingResolver.getIdentifier("iDenTiFieR_name")); + assertEquals("__identifier_name", namingResolver.getIdentifier("__identifier_name")); + 
assertEquals("IDENTIFIER_NAME", namingResolver.getIdentifier("IDENTIFIER_NAME")); + assertEquals("\"123identifier_name\"", namingResolver.getIdentifier("123identifier_name")); + assertEquals("i0d0e0n0t0i0f0i0e0r0n0a0m0e", namingResolver.getIdentifier("i0d0e0n0t0i0f0i0e0r0n0a0m0e")); + assertEquals("\",identifier+name\"", namingResolver.getIdentifier(",identifier+name")); + assertEquals("\"identifiêr name\"", namingResolver.getIdentifier("identifiêr name")); + assertEquals("\"a_unicode_name_文\"", namingResolver.getIdentifier("a_unicode_name_文")); + assertEquals("identifier__name__", namingResolver.getIdentifier("identifier__name__")); + assertEquals("\"identifier-name.weee\"", namingResolver.getIdentifier("identifier-name.weee")); + assertEquals("\"\"identifier name\"\"", namingResolver.getIdentifier("\"identifier name\"")); + assertEquals("\"identifier name\"", namingResolver.getIdentifier("identifier name")); + assertEquals("\"identifier%\"", namingResolver.getIdentifier("identifier%")); + assertEquals("\"`identifier`\"", namingResolver.getIdentifier("`identifier`")); + + assertEquals("_airbyte_raw_identifier_name", namingResolver.getRawTableName("identifier_name")); + assertEquals("\"_airbyte_raw_identifiêr name\"", namingResolver.getRawTableName("identifiêr name")); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/normalization/NormalizationLogParserTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/normalization/NormalizationLogParserTest.java new file mode 100644 index 0000000000000..44c9dc74f5856 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/normalization/NormalizationLogParserTest.java @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.base.normalization; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import io.airbyte.integrations.destination.normalization.NormalizationLogParser; +import io.airbyte.protocol.models.AirbyteErrorTraceMessage; +import io.airbyte.protocol.models.AirbyteErrorTraceMessage.FailureType; +import io.airbyte.protocol.models.AirbyteLogMessage; +import io.airbyte.protocol.models.AirbyteLogMessage.Level; +import io.airbyte.protocol.models.AirbyteMessage; +import io.airbyte.protocol.models.AirbyteMessage.Type; +import io.airbyte.protocol.models.AirbyteTraceMessage; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.List; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class NormalizationLogParserTest { + + private NormalizationLogParser parser; + + @BeforeEach + void setup() { + parser = new NormalizationLogParser(); + } + + @Test + void testWrapNonJsonLogs() { + runTest( + """ + foo + bar + [error] oh no + asdf + [error] qwer + """, + List.of( + logMessage(Level.INFO, "foo"), + logMessage(Level.INFO, "bar"), + logMessage(Level.INFO, "[error] oh no"), + logMessage(Level.INFO, "asdf"), + logMessage(Level.INFO, "[error] qwer")), + List.of( + "[error] oh no", + "[error] qwer")); + } + + @Test + void testWrapJsonLogs() { + runTest( + """ + {"code": "A001", "data": {"v": "=1.0.9"}, "invocation_id": "ed2017da-965d-406b-8fa1-07fb7c19fd14", "level": "info", "log_version": 1, "msg": "Running with dbt=1.0.9", "node_info": {}, "pid": 55, "thread_name": "MainThread", "ts": "2023-04-11T16:08:54.781886Z", "type": "log_line"} + {"code": "A001", "data": {"v": "=1.0.9"}, "invocation_id": "ed2017da-965d-406b-8fa1-07fb7c19fd14", "level": "error", "log_version": 1, "msg": "oh no", "node_info": {}, "pid": 55, "thread_name": "MainThread", "ts": "2023-04-11T16:08:54.781886Z", "type": "log_line"} + {"type": "TRACE", "trace": {"type": "ERROR", "emitted_at": 1.681766805198E12, "error": {"failure_type": "system_error", "message": "uh oh", "stack_trace": "normalization blew up", "internal_message": "normalization blew up with more detail"}}} + """, + List.of( + logMessage(Level.INFO, "Running with dbt=1.0.9"), + logMessage(Level.ERROR, "oh no"), + new AirbyteMessage() + .withType(Type.TRACE) + .withTrace(new AirbyteTraceMessage() + .withType(AirbyteTraceMessage.Type.ERROR) + .withEmittedAt(1.681766805198E12) + .withError(new AirbyteErrorTraceMessage() + .withFailureType(FailureType.SYSTEM_ERROR) + .withMessage("uh oh") + .withStackTrace("normalization blew up") + .withInternalMessage("normalization blew up with more detail")))), + List.of( + "oh no")); + } + + @Test + void testWeirdLogs() { + runTest( + """ + null + "null" + {"msg": "message with no level", "type": "log_line"} + {"level": "info", "type": "log_line"} + {"level": "error", "type": "log_line"} + """, + List.of( + logMessage(Level.INFO, "null"), + logMessage(Level.INFO, "\"null\""), + logMessage(Level.INFO, "{\n \"msg\" : \"message with no level\",\n \"type\" : \"log_line\"\n}"), + logMessage(Level.INFO, ""), + logMessage(Level.ERROR, "")), + List.of( + "")); + } + + private void runTest(String rawLogs, List expectedMessages, List expectedDbtErrors) { + final List messages = parser.create(new BufferedReader( + new InputStreamReader( + new ByteArrayInputStream( + rawLogs.getBytes(StandardCharsets.UTF_8)), + StandardCharsets.UTF_8))) + .toList(); + + assertEquals( + 
expectedMessages, + messages); + assertEquals(expectedDbtErrors, parser.getDbtErrors()); + } + + private AirbyteMessage logMessage(Level level, String message) { + return new AirbyteMessage() + .withType(Type.LOG) + .withLog(new AirbyteLogMessage() + .withLevel(level) + .withMessage(message)); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/ssh/SshTunnelTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/ssh/SshTunnelTest.java new file mode 100644 index 0000000000000..8f5f1a003ecc8 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/base/ssh/SshTunnelTest.java @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.base.ssh; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.base.ssh.SshTunnel.TunnelMethod; +import java.nio.charset.StandardCharsets; +import java.security.KeyPair; +import java.security.PrivateKey; +import java.security.PublicKey; +import java.util.Arrays; +import org.apache.sshd.client.SshClient; +import org.apache.sshd.client.session.ClientSession; +import org.apache.sshd.common.util.security.SecurityUtils; +import org.apache.sshd.common.util.security.eddsa.EdDSASecurityProviderRegistrar; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class SshTunnelTest { + + private static final String SSH_ED25519_PRIVATE_KEY = "-----BEGIN OPENSSH PRIVATE KEY-----\\n" + + "b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW\\n" + + "QyNTUxOQAAACDbBP+5jmEtjh1JvhzVQsvvTC2IQrX6P68XzrV7ZbnGsQAAAKBgtw9/YLcP\\n" + + "fwAAAAtzc2gtZWQyNTUxOQAAACDbBP+5jmEtjh1JvhzVQsvvTC2IQrX6P68XzrV7ZbnGsQ\\n" + + "AAAEAaKYn22N1O78HfdG22C7hcG2HiezKMzlq4JTdgYG1DstsE/7mOYS2OHUm+HNVCy+9M\\n" + + "LYhCtfo/rxfOtXtlucaxAAAAHHRmbG9yZXNfZHQwMUB0ZmxvcmVzX2R0MDEtUEMB\\n" + + "-----END OPENSSH PRIVATE KEY-----"; + private static final String SSH_RSA_PRIVATE_KEY = "-----BEGIN OPENSSH PRIVATE KEY-----\\n" + + "b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAABlwAAAAdzc2gtcn\\n" + + "NhAAAAAwEAAQAAAYEAuFjfTMS6BrgoxaQe9i83y6CdGH3xJIwc1Wy+11ibWAFcQ6khX/x0\\n" + + "M+JnJaSCs/hxiDE4afHscP3HzVQC699IgKwyAPaG0ZG+bLhxWAm4E79P7Yssj7imhTqr0A\\n" + + "DZDO23CCOagHvfdg1svnBhk1ih14GMGKRFCS27CLgholIOeogOyH7b3Jaqy9LtICiE054e\\n" + + "jwdaZdwWU08kxMO4ItdxNasCPC5uQiaXIzWFysG0mLk7WWc8WyuQHneQFl3Qu6p/rWJz4i\\n" + + "seea5CBL5s1DIyCyo/jgN5/oOWOciPUl49mDLleCzYTDnWqX43NK9A87unNeuA95Fk9akH\\n" + + "8QH4hKBCzpHhsh4U3Ys/l9Q5NmnyBrtFWBY2n13ZftNA/Ms+Hsh6V3eyJW0rIFY2/UM4XA\\n" + + "YyD6MEOlvFAQjxC6EbqfkrC6FQgH3I2wAtIDqEk2j79vfIIDdzp8otWjIQsApX55j+kKio\\n" + + "sY8YTXb9sLWuEdpSd/AN3iQ8HwIceyTulaKn7rTBAAAFkMwDTyPMA08jAAAAB3NzaC1yc2\\n" + + "EAAAGBALhY30zEuga4KMWkHvYvN8ugnRh98SSMHNVsvtdYm1gBXEOpIV/8dDPiZyWkgrP4\\n" + + "cYgxOGnx7HD9x81UAuvfSICsMgD2htGRvmy4cVgJuBO/T+2LLI+4poU6q9AA2Qzttwgjmo\\n" + + "B733YNbL5wYZNYodeBjBikRQktuwi4IaJSDnqIDsh+29yWqsvS7SAohNOeHo8HWmXcFlNP\\n" + + "JMTDuCLXcTWrAjwubkImlyM1hcrBtJi5O1lnPFsrkB53kBZd0Luqf61ic+IrHnmuQgS+bN\\n" + + 
"QyMgsqP44Def6DljnIj1JePZgy5Xgs2Ew51ql+NzSvQPO7pzXrgPeRZPWpB/EB+ISgQs6R\\n" + + "4bIeFN2LP5fUOTZp8ga7RVgWNp9d2X7TQPzLPh7Ield3siVtKyBWNv1DOFwGMg+jBDpbxQ\\n" + + "EI8QuhG6n5KwuhUIB9yNsALSA6hJNo+/b3yCA3c6fKLVoyELAKV+eY/pCoqLGPGE12/bC1\\n" + + "rhHaUnfwDd4kPB8CHHsk7pWip+60wQAAAAMBAAEAAAGAXw+dHpY3o21lwP0v5h1VNVD+kX\\n" + + "moVwNVfw0ToDKV8JzK+i0GA9xIA9VVAUlDCREtYmCXSbKyDVYgqRYQZ5d9aLTjGDIINZtl\\n" + + "SeUWtaJVZQF7cvAYq4g5fmxR2vIE+zC9+Jl7e5PlGJg1okKLXpMO6fVoy/AxlVkaoJVq6q\\n" + + "xLwQ3WKbeZIrgjHPYIx1N9oy5fbbwJ9oq2jIE8YabXlkfonhcwEN6UhtIlj8dy1apruXGT\\n" + + "VDfzHMRrDfrzt0TrdUqmqgo/istP89sggtkJ8uuPtkBFHTjao8MiBsshy1iDVbIno9gDbJ\\n" + + "JgYyunmSgEjEZpp09+mkgwfZO3/RDLRPF1SRAGBNy27CH8/bh9gAVRhAPi0GLclNi292Ya\\n" + + "NrGvjMcRlYAsWL3mZ9aTbv0j7Qi8qdWth+rZ+tBmNToUVVl5iLxifgo0kjiXAehZB1LaQV\\n" + + "yuMXlXOGmt9V2/DPACA9getQJQONxrLAcgHdjMiuwD8r7d+m/kE4+cOTakOlzrfrwBAAAA\\n" + + "wQCVTQTvuyBW3JemMPtRLifQqdwMGRPokm5nTn+JSJQvg+dNpL7hC0k3IesKs63gxuuHoq\\n" + + "4q1xkMmCMvihT8oVlxrezEjsO/QMCxe6Sr9eMfHAjrdPeHsPaf9oOgG9vEEH9dEilHpnlb\\n" + + "97Vyl9EHm1iahONM1gWdXkPjIfnQzYPvSLZPtBBSI0XBjCTifMnCRgd3s2bdm7kh+7XA+C\\n" + + "rX62WfPIJKL+OhMIf+ED4HBJTd/vU34Vk73yvqHzqel0ZQnRoAAADBAOGSm6TNBptV7S5P\\n" + + "wT3BhGYSm35/7nCFTilyfy5/8EUmifUFittRIvgDoXBWeZEwvqIiQ55iX9mYmNmb0KbPCw\\n" + + "cqN/BtXWItAvyTDZ6PeI2m2aUj+rW2R3ZXEsBjgaNRtbPyMKQ69xtKRvHtNZNfgjpRQ4is\\n" + + "lbufhAK1YbUxrlfKaBGOcGyR7DNmUUUN6nptQbpOr1HQc5DOH17HIDnRPs44HIws3/apww\\n" + + "RBIjjy6GQNfJ/Ge8N4pxGoLl1qKO8xoQAAAMEA0Tat/E5mSsgjCgmFja/jOZJcrzZHwrPT\\n" + + "3NEbuAMQ/L3atKEINypmpJfjIAvNljKJwSUDMEWvs8qj8cSGCrtkcAv1YSm697TL2oC9HU\\n" + + "CFoOJAkH1X2CGTgHlR9it3j4aRJ3dXdL2k7aeoGXObfRWqBNPj0LOOZs64RA6scGAzo6MR\\n" + + "5WlcOxfV1wZuaM0fOd+PBmIlFEE7Uf6AY/UahBAxaFV2+twgK9GCDcu1t4Ye9wZ9kZ4Nal\\n" + + "0fkKD4uN4DRO8hAAAAFm10dWhhaUBrYnAxLWxocC1hMTQ1MzMBAgME\\n" + + "-----END OPENSSH PRIVATE KEY-----"; + private static final String HOST_PORT_CONFIG = + "{\"ssl\":true,\"host\":\"fakehost.com\",\"port\":5432,\"schema\":\"public\",\"database\":\"postgres\",\"password\":\"\",\"username\":\"postgres\",\"tunnel_method\":{\"ssh_key\":\"" + + "%s" + + "\",\"tunnel_host\":\"faketunnel.com\",\"tunnel_port\":22,\"tunnel_user\":\"ec2-user\",\"tunnel_method\":\"SSH_KEY_AUTH\"}}"; + + private static final String URL_CONFIG_WITH_PORT = + "{\"ssl\":true,\"endpoint\":\"http://fakehost.com:9090/service\",\"password\":\"\",\"username\":\"restuser\",\"tunnel_method\":{\"ssh_key\":\"" + + "%s" + + "\",\"tunnel_host\":\"faketunnel.com\",\"tunnel_port\":22,\"tunnel_user\":\"ec2-user\",\"tunnel_method\":\"SSH_KEY_AUTH\"}}"; + + private static final String URL_CONFIG_NO_PORT = + "{\"ssl\":true,\"endpoint\":\"http://fakehost.com/service\",\"password\":\"\",\"username\":\"restuser\",\"tunnel_method\":{\"ssh_key\":\"" + + "%s" + + "\",\"tunnel_host\":\"faketunnel.com\",\"tunnel_port\":22,\"tunnel_user\":\"ec2-user\",\"tunnel_method\":\"SSH_KEY_AUTH\"}}"; + + /** + * This test verifies that OpenSsh correctly replaces values in connector configuration in a spec + * with host/port config and in a spec with endpoint URL config + * + * @param configString + * @throws Exception + */ + @ParameterizedTest + @ValueSource(strings = {HOST_PORT_CONFIG, URL_CONFIG_WITH_PORT, URL_CONFIG_NO_PORT}) + public void testConfigInTunnel(final String configString) throws Exception { + final JsonNode config = (new ObjectMapper()).readTree(String.format(configString, SSH_RSA_PRIVATE_KEY)); + String endPointURL = Jsons.getStringOrNull(config, "endpoint"); + final SshTunnel sshTunnel = new SshTunnel( + config, + 
endPointURL == null ? Arrays.asList(new String[] {"host"}) : null, + endPointURL == null ? Arrays.asList(new String[] {"port"}) : null, + endPointURL == null ? null : "endpoint", + endPointURL, + TunnelMethod.SSH_KEY_AUTH, + "faketunnel.com", + 22, + "tunnelUser", + SSH_RSA_PRIVATE_KEY, + "tunnelUserPassword", + endPointURL == null ? "fakeHost.com" : null, + endPointURL == null ? 5432 : 0) { + + @Override + ClientSession openTunnel(final SshClient client) { + tunnelLocalPort = 8080; + return null; // Prevent tunnel from attempting to connect + } + + }; + + final JsonNode configInTunnel = sshTunnel.getConfigInTunnel(); + if (endPointURL == null) { + assertTrue(configInTunnel.has("port")); + assertTrue(configInTunnel.has("host")); + assertFalse(configInTunnel.has("endpoint")); + assertEquals(8080, configInTunnel.get("port").asInt()); + assertEquals("127.0.0.1", configInTunnel.get("host").asText()); + } else { + assertFalse(configInTunnel.has("port")); + assertFalse(configInTunnel.has("host")); + assertTrue(configInTunnel.has("endpoint")); + assertEquals("http://127.0.0.1:8080/service", configInTunnel.get("endpoint").asText()); + } + } + + /** + * This test verifies that SshTunnel correctly extracts private key pairs from keys formatted as + * EdDSA and OpenSSH + * + * @param privateKey + * @throws Exception + */ + @ParameterizedTest + @ValueSource(strings = {SSH_ED25519_PRIVATE_KEY, SSH_RSA_PRIVATE_KEY}) + public void getKeyPair(final String privateKey) throws Exception { + final JsonNode config = (new ObjectMapper()).readTree(String.format(HOST_PORT_CONFIG, privateKey)); + final SshTunnel sshTunnel = new SshTunnel( + config, + Arrays.asList(new String[] {"host"}), + Arrays.asList(new String[] {"port"}), + null, + null, + TunnelMethod.SSH_KEY_AUTH, + "faketunnel.com", + 22, + "tunnelUser", + privateKey, + "tunnelUserPassword", + "fakeHost.com", + 5432) { + + @Override + ClientSession openTunnel(final SshClient client) { + return null; // Prevent tunnel from attempting to connect + } + + }; + + final KeyPair authKeyPair = sshTunnel.getPrivateKeyPair(); + assertNotNull(authKeyPair);// actually, all is good if there is no exception on previous line + } + + /** + * This test verifies that 'net.i2p.crypto:eddsa' is present and EdDSA is supported. 
If + * net.i2p.crypto:eddsa will be removed from project, then will be thrown: generator not correctly + * initialized + * + * @throws Exception + */ + @Test + public void edDsaIsSupported() throws Exception { + final var keygen = SecurityUtils.getKeyPairGenerator("EdDSA"); + final String message = "hello world"; + final KeyPair keyPair = keygen.generateKeyPair(); + + final byte[] signedMessage = sign(keyPair.getPrivate(), message); + + assertTrue(new EdDSASecurityProviderRegistrar().isSupported()); + assertTrue(verify(keyPair.getPublic(), signedMessage, message)); + } + + private byte[] sign(final PrivateKey privateKey, final String message) throws Exception { + final var signature = SecurityUtils.getSignature("NONEwithEdDSA"); + signature.initSign(privateKey); + + signature.update(message.getBytes(StandardCharsets.UTF_8)); + + return signature.sign(); + } + + private boolean verify(final PublicKey publicKey, final byte[] signed, final String message) + throws Exception { + final var signature = SecurityUtils.getSignature("NONEwithEdDSA"); + signature.initVerify(publicKey); + + signature.update(message.getBytes(StandardCharsets.UTF_8)); + + return signature.verify(signed); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/buffered_stream_consumer/BufferedStreamConsumerTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/buffered_stream_consumer/BufferedStreamConsumerTest.java new file mode 100644 index 0000000000000..11ef6402e14ec --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/buffered_stream_consumer/BufferedStreamConsumerTest.java @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.buffered_stream_consumer; + +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoInteractions; +import static org.mockito.Mockito.when; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import io.airbyte.commons.functional.CheckedFunction; +import io.airbyte.commons.json.Jsons; +import io.airbyte.integrations.destination.record_buffer.InMemoryRecordBufferingStrategy; +import io.airbyte.protocol.models.Field; +import io.airbyte.protocol.models.JsonSchemaType; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.AirbyteStateMessage; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import io.airbyte.protocol.models.v0.CatalogHelpers; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import java.time.Duration; +import java.time.Instant; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.commons.lang.RandomStringUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class BufferedStreamConsumerTest { + + private static final String SCHEMA_NAME = "public"; + private static final String STREAM_NAME = "id_and_name"; + private static final String STREAM_NAME2 = STREAM_NAME + 2; + private static final int PERIODIC_BUFFER_FREQUENCY = 5; + private static final ConfiguredAirbyteCatalog CATALOG = new ConfiguredAirbyteCatalog().withStreams(List.of( + CatalogHelpers.createConfiguredAirbyteStream( + STREAM_NAME, + SCHEMA_NAME, + Field.of("id", JsonSchemaType.NUMBER), + Field.of("name", JsonSchemaType.STRING)), + CatalogHelpers.createConfiguredAirbyteStream( + STREAM_NAME2, + SCHEMA_NAME, + Field.of("id", JsonSchemaType.NUMBER), + Field.of("name", JsonSchemaType.STRING)))); + + private static final AirbyteMessage STATE_MESSAGE1 = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage().withData(Jsons.jsonNode(ImmutableMap.of("state_message_id", 1)))); + private static final AirbyteMessage STATE_MESSAGE2 = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage().withData(Jsons.jsonNode(ImmutableMap.of("state_message_id", 2)))); + + private BufferedStreamConsumer consumer; + private OnStartFunction onStart; + private RecordWriter recordWriter; + private OnCloseFunction onClose; + private CheckedFunction isValidRecord; + private Consumer outputRecordCollector; + + @SuppressWarnings("unchecked") + @BeforeEach + void setup() throws Exception { + onStart = mock(OnStartFunction.class); + recordWriter = mock(RecordWriter.class); + onClose = mock(OnCloseFunction.class); + isValidRecord = mock(CheckedFunction.class); + outputRecordCollector = mock(Consumer.class); + consumer = new BufferedStreamConsumer( + outputRecordCollector, + onStart, + new InMemoryRecordBufferingStrategy(recordWriter, 1_000), + onClose, + CATALOG, + isValidRecord); + + 
when(isValidRecord.apply(any())).thenReturn(true);
+ }
+
+ @Test
+ void test1StreamWith1State() throws Exception {
+ final List<AirbyteMessage> expectedRecords = generateRecords(1_000);
+
+ consumer.start();
+ consumeRecords(consumer, expectedRecords);
+ consumer.accept(STATE_MESSAGE1);
+ consumer.close();
+
+ verifyStartAndClose();
+
+ verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecords);
+
+ verify(outputRecordCollector).accept(STATE_MESSAGE1);
+ }
+
+ @Test
+ void test1StreamWith2State() throws Exception {
+ final List<AirbyteMessage> expectedRecords = generateRecords(1_000);
+
+ consumer.start();
+ consumeRecords(consumer, expectedRecords);
+ consumer.accept(STATE_MESSAGE1);
+ consumer.accept(STATE_MESSAGE2);
+ consumer.close();
+
+ verifyStartAndClose();
+
+ verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecords);
+
+ verify(outputRecordCollector, times(1)).accept(STATE_MESSAGE2);
+ }
+
+ @Test
+ void test1StreamWith0State() throws Exception {
+ final List<AirbyteMessage> expectedRecords = generateRecords(1_000);
+
+ consumer.start();
+ consumeRecords(consumer, expectedRecords);
+ consumer.close();
+
+ verifyStartAndClose();
+
+ verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecords);
+ }
+
+ @Test
+ void test1StreamWithStateAndThenMoreRecordsBiggerThanBuffer() throws Exception {
+ final List<AirbyteMessage> expectedRecordsBatch1 = generateRecords(1_000);
+ final List<AirbyteMessage> expectedRecordsBatch2 = generateRecords(1_000);
+
+ consumer.start();
+ consumeRecords(consumer, expectedRecordsBatch1);
+ consumer.accept(STATE_MESSAGE1);
+ consumeRecords(consumer, expectedRecordsBatch2);
+ consumer.close();
+
+ verifyStartAndClose();
+
+ verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecordsBatch1);
+ verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecordsBatch2);
+
+ verify(outputRecordCollector).accept(STATE_MESSAGE1);
+ }
+
+ @Test
+ void test1StreamWithStateAndThenMoreRecordsSmallerThanBuffer() throws Exception {
+ final List<AirbyteMessage> expectedRecordsBatch1 = generateRecords(1_000);
+ final List<AirbyteMessage> expectedRecordsBatch2 = generateRecords(1_000);
+
+ // consumer with a big enough buffer that both batches are flushed in one go.
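+ // (capacity 10_000 here vs. 1_000 in setup(), so both batches stay buffered until close)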
+ final BufferedStreamConsumer consumer = new BufferedStreamConsumer( + outputRecordCollector, + onStart, + new InMemoryRecordBufferingStrategy(recordWriter, 10_000), + onClose, + CATALOG, + isValidRecord); + + consumer.start(); + consumeRecords(consumer, expectedRecordsBatch1); + consumer.accept(STATE_MESSAGE1); + consumeRecords(consumer, expectedRecordsBatch2); + consumer.close(); + + verifyStartAndClose(); + + final List expectedRecords = Lists.newArrayList(expectedRecordsBatch1, expectedRecordsBatch2) + .stream() + .flatMap(Collection::stream) + .collect(Collectors.toList()); + verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecords); + + verify(outputRecordCollector).accept(STATE_MESSAGE1); + } + + @Test + void testExceptionAfterOneStateMessage() throws Exception { + final List expectedRecordsBatch1 = generateRecords(1_000); + final List expectedRecordsBatch2 = generateRecords(1_000); + final List expectedRecordsBatch3 = generateRecords(1_000); + + consumer.start(); + consumeRecords(consumer, expectedRecordsBatch1); + consumer.accept(STATE_MESSAGE1); + consumeRecords(consumer, expectedRecordsBatch2); + when(isValidRecord.apply(any())).thenThrow(new IllegalStateException("induced exception")); + assertThrows(IllegalStateException.class, () -> consumer.accept(expectedRecordsBatch3.get(0))); + consumer.close(); + + verifyStartAndCloseFailure(); + + verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecordsBatch1); + + verify(outputRecordCollector).accept(STATE_MESSAGE1); + } + + @Test + void testExceptionAfterNoStateMessages() throws Exception { + final List expectedRecordsBatch1 = generateRecords(1_000); + final List expectedRecordsBatch2 = generateRecords(1_000); + final List expectedRecordsBatch3 = generateRecords(1_000); + + consumer.start(); + consumeRecords(consumer, expectedRecordsBatch1); + consumeRecords(consumer, expectedRecordsBatch2); + when(isValidRecord.apply(any())).thenThrow(new IllegalStateException("induced exception")); + assertThrows(IllegalStateException.class, () -> consumer.accept(expectedRecordsBatch3.get(0))); + consumer.close(); + + verifyStartAndCloseFailure(); + + verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecordsBatch1); + + verifyNoInteractions(outputRecordCollector); + } + + @Test + void testExceptionDuringOnClose() throws Exception { + doThrow(new IllegalStateException("induced exception")).when(onClose).accept(false); + + final List expectedRecordsBatch1 = generateRecords(1_000); + final List expectedRecordsBatch2 = generateRecords(1_000); + + consumer.start(); + consumeRecords(consumer, expectedRecordsBatch1); + consumer.accept(STATE_MESSAGE1); + consumeRecords(consumer, expectedRecordsBatch2); + assertThrows(IllegalStateException.class, () -> consumer.close(), "Expected an error to be thrown on close"); + + verifyStartAndClose(); + + verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecordsBatch1); + + verify(outputRecordCollector).accept(STATE_MESSAGE1); + } + + @Test + void test2StreamWith1State() throws Exception { + final List expectedRecordsStream1 = generateRecords(1_000); + final List expectedRecordsStream2 = expectedRecordsStream1 + .stream() + .map(Jsons::clone) + .peek(m -> m.getRecord().withStream(STREAM_NAME2)) + .collect(Collectors.toList()); + + consumer.start(); + consumeRecords(consumer, expectedRecordsStream1); + consumer.accept(STATE_MESSAGE1); + consumeRecords(consumer, expectedRecordsStream2); + consumer.close(); + + verifyStartAndClose(); + + verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecordsStream1); + 
verifyRecords(STREAM_NAME2, SCHEMA_NAME, expectedRecordsStream2); + + verify(outputRecordCollector).accept(STATE_MESSAGE1); + } + + @Test + void test2StreamWith2State() throws Exception { + final List expectedRecordsStream1 = generateRecords(1_000); + final List expectedRecordsStream2 = expectedRecordsStream1 + .stream() + .map(Jsons::clone) + .peek(m -> m.getRecord().withStream(STREAM_NAME2)) + .collect(Collectors.toList()); + + consumer.start(); + consumeRecords(consumer, expectedRecordsStream1); + consumer.accept(STATE_MESSAGE1); + consumeRecords(consumer, expectedRecordsStream2); + consumer.accept(STATE_MESSAGE2); + consumer.close(); + + verifyStartAndClose(); + + verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecordsStream1); + verifyRecords(STREAM_NAME2, SCHEMA_NAME, expectedRecordsStream2); + + verify(outputRecordCollector, times(1)).accept(STATE_MESSAGE2); + } + + // Periodic Buffer Flush Tests + @Test + void testSlowStreamReturnsState() throws Exception { + // generate records less than the default maxQueueSizeInBytes to confirm periodic flushing occurs + final List expectedRecordsStream1 = generateRecords(500L); + final List expectedRecordsStream1Batch2 = generateRecords(200L); + + // Overrides flush frequency for testing purposes to 5 seconds + final BufferedStreamConsumer flushConsumer = getConsumerWithFlushFrequency(); + flushConsumer.start(); + consumeRecords(flushConsumer, expectedRecordsStream1); + flushConsumer.accept(STATE_MESSAGE1); + // NOTE: Sleeps process for 5 seconds, if tests are slow this can be updated to reduce slowdowns + TimeUnit.SECONDS.sleep(PERIODIC_BUFFER_FREQUENCY); + consumeRecords(flushConsumer, expectedRecordsStream1Batch2); + flushConsumer.close(); + + verifyStartAndClose(); + // expects the records to be grouped because periodicBufferFlush occurs at the end of acceptTracked + verifyRecords(STREAM_NAME, SCHEMA_NAME, + Stream.concat(expectedRecordsStream1.stream(), expectedRecordsStream1Batch2.stream()).collect(Collectors.toList())); + verify(outputRecordCollector).accept(STATE_MESSAGE1); + } + + @Test + void testSlowStreamReturnsMultipleStates() throws Exception { + // generate records less than the default maxQueueSizeInBytes to confirm periodic flushing occurs + final List expectedRecordsStream1 = generateRecords(500L); + final List expectedRecordsStream1Batch2 = generateRecords(200L); + // creates records equal to size that triggers buffer flush + final List expectedRecordsStream1Batch3 = generateRecords(1_000L); + + // Overrides flush frequency for testing purposes to 5 seconds + final BufferedStreamConsumer flushConsumer = getConsumerWithFlushFrequency(); + flushConsumer.start(); + consumeRecords(flushConsumer, expectedRecordsStream1); + flushConsumer.accept(STATE_MESSAGE1); + // NOTE: Sleeps process for 5 seconds, if tests are slow this can be updated to reduce slowdowns + TimeUnit.SECONDS.sleep(PERIODIC_BUFFER_FREQUENCY); + consumeRecords(flushConsumer, expectedRecordsStream1Batch2); + consumeRecords(flushConsumer, expectedRecordsStream1Batch3); + flushConsumer.accept(STATE_MESSAGE2); + flushConsumer.close(); + + verifyStartAndClose(); + // expects the records to be grouped because periodicBufferFlush occurs at the end of acceptTracked + verifyRecords(STREAM_NAME, SCHEMA_NAME, + Stream.concat(expectedRecordsStream1.stream(), expectedRecordsStream1Batch2.stream()).collect(Collectors.toList())); + verifyRecords(STREAM_NAME, SCHEMA_NAME, expectedRecordsStream1Batch3); + // expects two STATE messages returned since one will be flushed after 
periodic flushing occurs + // and the other after buffer has been filled + verify(outputRecordCollector).accept(STATE_MESSAGE1); + verify(outputRecordCollector).accept(STATE_MESSAGE2); + } + + private BufferedStreamConsumer getConsumerWithFlushFrequency() { + final BufferedStreamConsumer flushFrequencyConsumer = new BufferedStreamConsumer( + outputRecordCollector, + onStart, + new InMemoryRecordBufferingStrategy(recordWriter, 10_000), + onClose, + CATALOG, + isValidRecord, + Duration.ofSeconds(PERIODIC_BUFFER_FREQUENCY)); + return flushFrequencyConsumer; + } + + private void verifyStartAndClose() throws Exception { + verify(onStart).call(); + verify(onClose).accept(false); + } + + /** Indicates that a failure occurred while consuming AirbyteMessages */ + private void verifyStartAndCloseFailure() throws Exception { + verify(onStart).call(); + verify(onClose).accept(true); + } + + private static void consumeRecords(final BufferedStreamConsumer consumer, final Collection records) { + records.forEach(m -> { + try { + consumer.accept(m); + } catch (final Exception e) { + throw new RuntimeException(e); + } + }); + } + + // NOTE: Generates records at chunks of 160 bytes + private static List generateRecords(final long targetSizeInBytes) { + final List output = Lists.newArrayList(); + long bytesCounter = 0; + for (int i = 0;; i++) { + final JsonNode payload = + Jsons.jsonNode(ImmutableMap.of("id", RandomStringUtils.randomAlphabetic(7), "name", "human " + String.format("%8d", i))); + final long sizeInBytes = RecordSizeEstimator.getStringByteSize(payload); + bytesCounter += sizeInBytes; + final AirbyteMessage airbyteMessage = new AirbyteMessage() + .withType(Type.RECORD) + .withRecord(new AirbyteRecordMessage() + .withStream(STREAM_NAME) + .withNamespace(SCHEMA_NAME) + .withEmittedAt(Instant.now().toEpochMilli()) + .withData(payload)); + if (bytesCounter > targetSizeInBytes) { + break; + } else { + output.add(airbyteMessage); + } + } + return output; + } + + private void verifyRecords(final String streamName, final String namespace, final Collection expectedRecords) throws Exception { + verify(recordWriter).accept( + new AirbyteStreamNameNamespacePair(streamName, namespace), + expectedRecords.stream().map(AirbyteMessage::getRecord).collect(Collectors.toList())); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/buffered_stream_consumer/RecordSizeEstimatorTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/buffered_stream_consumer/RecordSizeEstimatorTest.java new file mode 100644 index 0000000000000..478398d12aa13 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/buffered_stream_consumer/RecordSizeEstimatorTest.java @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.buffered_stream_consumer; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.json.Jsons; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import org.junit.jupiter.api.Test; + +class RecordSizeEstimatorTest { + + private static final JsonNode DATA_0 = Jsons.deserialize("{}"); + private static final JsonNode DATA_1 = Jsons.deserialize("{ \"field1\": true }"); + private static final JsonNode DATA_2 = Jsons.deserialize("{ \"field1\": 10000 }"); + private static final long DATA_0_SIZE = RecordSizeEstimator.getStringByteSize(DATA_0); + private static final long DATA_1_SIZE = RecordSizeEstimator.getStringByteSize(DATA_1); + private static final long DATA_2_SIZE = RecordSizeEstimator.getStringByteSize(DATA_2); + + @Test + public void testPeriodicSampling() { + // the estimate performs a size sampling every 3 records + final RecordSizeEstimator sizeEstimator = new RecordSizeEstimator(3); + final String stream = "stream"; + final AirbyteRecordMessage record0 = new AirbyteRecordMessage().withStream(stream).withData(DATA_0); + final AirbyteRecordMessage record1 = new AirbyteRecordMessage().withStream(stream).withData(DATA_1); + final AirbyteRecordMessage record2 = new AirbyteRecordMessage().withStream(stream).withData(DATA_2); + + // sample record message 1 + final long firstEstimation = DATA_1_SIZE; + assertEquals(firstEstimation, sizeEstimator.getEstimatedByteSize(record1)); + // next two calls return the first sampling result + assertEquals(firstEstimation, sizeEstimator.getEstimatedByteSize(record0)); + assertEquals(firstEstimation, sizeEstimator.getEstimatedByteSize(record0)); + + // sample record message 2 + final long secondEstimation = firstEstimation / 2 + DATA_2_SIZE / 2; + assertEquals(secondEstimation, sizeEstimator.getEstimatedByteSize(record2)); + // next two calls return the second sampling result + assertEquals(secondEstimation, sizeEstimator.getEstimatedByteSize(record0)); + assertEquals(secondEstimation, sizeEstimator.getEstimatedByteSize(record0)); + + // sample record message 1 + final long thirdEstimation = secondEstimation / 2 + DATA_1_SIZE / 2; + assertEquals(thirdEstimation, sizeEstimator.getEstimatedByteSize(record1)); + // next two calls return the first sampling result + assertEquals(thirdEstimation, sizeEstimator.getEstimatedByteSize(record0)); + assertEquals(thirdEstimation, sizeEstimator.getEstimatedByteSize(record0)); + } + + @Test + public void testDifferentEstimationPerStream() { + final RecordSizeEstimator sizeEstimator = new RecordSizeEstimator(); + final AirbyteRecordMessage record0 = new AirbyteRecordMessage().withStream("stream1").withData(DATA_0); + final AirbyteRecordMessage record1 = new AirbyteRecordMessage().withStream("stream2").withData(DATA_1); + final AirbyteRecordMessage record2 = new AirbyteRecordMessage().withStream("stream3").withData(DATA_2); + assertEquals(DATA_0_SIZE, sizeEstimator.getEstimatedByteSize(record0)); + assertEquals(DATA_1_SIZE, sizeEstimator.getEstimatedByteSize(record1)); + assertEquals(DATA_2_SIZE, sizeEstimator.getEstimatedByteSize(record2)); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DefaultDestStateLifecycleManagerTest.java 
b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DefaultDestStateLifecycleManagerTest.java new file mode 100644 index 0000000000000..afa85a50ae785 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DefaultDestStateLifecycleManagerTest.java @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.dest_state_lifecycle_manager; + +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; + +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import io.airbyte.protocol.models.v0.AirbyteStateMessage; +import io.airbyte.protocol.models.v0.AirbyteStateMessage.AirbyteStateType; +import io.airbyte.protocol.models.v0.AirbyteStreamState; +import io.airbyte.protocol.models.v0.StreamDescriptor; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class DefaultDestStateLifecycleManagerTest { + + private static final AirbyteMessage UNSET_TYPE_MESSAGE = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage()); + private static final AirbyteMessage LEGACY_MESSAGE = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage().withType(AirbyteStateType.LEGACY)); + private static final AirbyteMessage GLOBAL_MESSAGE = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage().withType(AirbyteStateType.GLOBAL)); + private static final AirbyteMessage STREAM_MESSAGE = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage() + .withType(AirbyteStateType.STREAM) + .withStream(new AirbyteStreamState().withStreamDescriptor(new StreamDescriptor().withName("users")))); + + private DestStateLifecycleManager mgr1; + private DestStateLifecycleManager singleStateMgr; + private DestStateLifecycleManager streamMgr; + + @BeforeEach + void setup() { + singleStateMgr = mock(DestStateLifecycleManager.class); + streamMgr = mock(DestStateLifecycleManager.class); + mgr1 = new DefaultDestStateLifecycleManager(singleStateMgr, streamMgr); + } + + @Test + void testFailsOnIncompatibleStates() { + final DefaultDestStateLifecycleManager manager1 = new DefaultDestStateLifecycleManager(singleStateMgr, streamMgr); + manager1.addState(UNSET_TYPE_MESSAGE); + manager1.addState(UNSET_TYPE_MESSAGE); + manager1.addState(LEGACY_MESSAGE); + assertThrows(IllegalArgumentException.class, () -> manager1.addState(GLOBAL_MESSAGE)); + assertThrows(IllegalArgumentException.class, () -> manager1.addState(STREAM_MESSAGE)); + + final DefaultDestStateLifecycleManager manager2 = new DefaultDestStateLifecycleManager(singleStateMgr, streamMgr); + manager2.addState(LEGACY_MESSAGE); + manager2.addState(LEGACY_MESSAGE); + manager2.addState(UNSET_TYPE_MESSAGE); + assertThrows(IllegalArgumentException.class, () -> manager2.addState(GLOBAL_MESSAGE)); + assertThrows(IllegalArgumentException.class, () -> manager2.addState(STREAM_MESSAGE)); + + final DefaultDestStateLifecycleManager manager3 = new DefaultDestStateLifecycleManager(singleStateMgr, streamMgr); + manager3.addState(GLOBAL_MESSAGE); + manager3.addState(GLOBAL_MESSAGE); + assertThrows(IllegalArgumentException.class, () -> manager3.addState(UNSET_TYPE_MESSAGE)); + 
assertThrows(IllegalArgumentException.class, () -> manager3.addState(LEGACY_MESSAGE)); + assertThrows(IllegalArgumentException.class, () -> manager3.addState(STREAM_MESSAGE)); + + final DefaultDestStateLifecycleManager manager4 = new DefaultDestStateLifecycleManager(singleStateMgr, streamMgr); + manager4.addState(STREAM_MESSAGE); + manager4.addState(STREAM_MESSAGE); + assertThrows(IllegalArgumentException.class, () -> manager4.addState(UNSET_TYPE_MESSAGE)); + assertThrows(IllegalArgumentException.class, () -> manager4.addState(LEGACY_MESSAGE)); + assertThrows(IllegalArgumentException.class, () -> manager4.addState(GLOBAL_MESSAGE)); + } + + @Test + void testDelegatesLegacyMessages() { + mgr1.addState(UNSET_TYPE_MESSAGE); + mgr1.addState(LEGACY_MESSAGE); + mgr1.markPendingAsFlushed(); + mgr1.markFlushedAsCommitted(); + mgr1.listFlushed(); + mgr1.listCommitted(); + verify(singleStateMgr).addState(UNSET_TYPE_MESSAGE); + verify(singleStateMgr).addState(LEGACY_MESSAGE); + verify(singleStateMgr).markPendingAsFlushed(); + verify(singleStateMgr).markFlushedAsCommitted(); + verify(singleStateMgr).listFlushed(); + verify(singleStateMgr).listCommitted(); + } + + @Test + void testDelegatesGlobalMessages() { + mgr1.addState(GLOBAL_MESSAGE); + mgr1.markPendingAsFlushed(); + mgr1.markFlushedAsCommitted(); + mgr1.listFlushed(); + mgr1.listCommitted(); + verify(singleStateMgr).addState(GLOBAL_MESSAGE); + verify(singleStateMgr).markPendingAsFlushed(); + verify(singleStateMgr).markFlushedAsCommitted(); + verify(singleStateMgr).listFlushed(); + verify(singleStateMgr).listCommitted(); + } + + @Test + void testDelegatesStreamMessages() { + mgr1.addState(STREAM_MESSAGE); + mgr1.markPendingAsFlushed(); + mgr1.markFlushedAsCommitted(); + mgr1.listFlushed(); + mgr1.listCommitted(); + + verify(streamMgr).addState(STREAM_MESSAGE); + verify(streamMgr).markPendingAsFlushed(); + verify(streamMgr).markFlushedAsCommitted(); + verify(streamMgr).listFlushed(); + verify(streamMgr).listCommitted(); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestSingleStateLifecycleManagerTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestSingleStateLifecycleManagerTest.java new file mode 100644 index 0000000000000..c70b415cdcc3c --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestSingleStateLifecycleManagerTest.java @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.dest_state_lifecycle_manager; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import io.airbyte.commons.json.Jsons; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import io.airbyte.protocol.models.v0.AirbyteStateMessage; +import io.airbyte.protocol.models.v0.AirbyteStateMessage.AirbyteStateType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class DestSingleStateLifecycleManagerTest { + + private static final AirbyteMessage MESSAGE1 = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage().withType(AirbyteStateType.GLOBAL).withData(Jsons.jsonNode("a"))); + private static final AirbyteMessage MESSAGE2 = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage().withType(AirbyteStateType.GLOBAL).withData(Jsons.jsonNode("b"))); + + private DestSingleStateLifecycleManager mgr; + + @BeforeEach + void setup() { + mgr = new DestSingleStateLifecycleManager(); + } + + /** + * Demonstrates expected lifecycle of a state object for documentation purposes. Subsequent test get + * into the details. + */ + @Test + void testBasicLifeCycle() { + // starts with no state. + assertTrue(mgr.listPending().isEmpty()); + assertTrue(mgr.listFlushed().isEmpty()); + assertTrue(mgr.listCommitted().isEmpty()); + + mgr.addState(MESSAGE1); + // new state supersedes previous ones. we should only see MESSAGE2 from here on out. + mgr.addState(MESSAGE2); + + // after adding a state, it is in pending only. + assertEquals(MESSAGE2, mgr.listPending().poll()); + assertTrue(mgr.listFlushed().isEmpty()); + assertTrue(mgr.listCommitted().isEmpty()); + + mgr.markPendingAsFlushed(); + + // after flushing the state it is in flushed only. + assertTrue(mgr.listPending().isEmpty()); + assertEquals(MESSAGE2, mgr.listFlushed().poll()); + assertTrue(mgr.listCommitted().isEmpty()); + + // after committing the state it is in committed only. + mgr.markFlushedAsCommitted(); + + assertTrue(mgr.listPending().isEmpty()); + assertTrue(mgr.listFlushed().isEmpty()); + assertEquals(MESSAGE2, mgr.listCommitted().poll()); + } + + @Test + void testPending() { + mgr.addState(MESSAGE1); + mgr.addState(MESSAGE2); + + // verify the LAST message is returned. 
+    assertEquals(MESSAGE2, mgr.listPending().poll());
+    assertTrue(mgr.listFlushed().isEmpty());
+    assertTrue(mgr.listCommitted().isEmpty());
+  }
+
+  @Test
+  void testFlushed() {
+    mgr.addState(MESSAGE1);
+    mgr.addState(MESSAGE2);
+    mgr.markPendingAsFlushed();
+
+    assertTrue(mgr.listPending().isEmpty());
+    assertEquals(MESSAGE2, mgr.listFlushed().poll());
+    assertTrue(mgr.listCommitted().isEmpty());
+
+    // verify that multiple calls to markPendingAsFlushed overwrite old states
+    mgr.addState(MESSAGE1);
+    mgr.markPendingAsFlushed();
+    mgr.markPendingAsFlushed();
+
+    assertTrue(mgr.listPending().isEmpty());
+    assertEquals(MESSAGE1, mgr.listFlushed().poll());
+    assertTrue(mgr.listCommitted().isEmpty());
+  }
+
+  @Test
+  void testCommitted() {
+    mgr.addState(MESSAGE1);
+    mgr.addState(MESSAGE2);
+    mgr.markPendingAsFlushed();
+    mgr.markFlushedAsCommitted();
+
+    assertTrue(mgr.listPending().isEmpty());
+    assertTrue(mgr.listFlushed().isEmpty());
+    assertEquals(MESSAGE2, mgr.listCommitted().poll());
+
+    // verify that multiple calls to markFlushedAsCommitted overwrite old states
+    mgr.addState(MESSAGE1);
+    mgr.markPendingAsFlushed();
+    mgr.markFlushedAsCommitted();
+    mgr.markFlushedAsCommitted();
+
+    assertTrue(mgr.listPending().isEmpty());
+    assertTrue(mgr.listFlushed().isEmpty());
+    assertEquals(MESSAGE1, mgr.listCommitted().poll());
+  }
+
+  /*
+   * This change follows the same changes in DestStreamStateLifecycleManager, where the goal is to
+   * confirm that `markPendingAsCommitted` combines what were previously `markPendingAsFlushed` and
+   * `markFlushedAsCommitted`.
+   *
+   * This method exists because destination checkpointing will no longer hold onto a state as
+   * "Flushed" but will immediately commit records to the destination's final table.
+   */
+  @Test
+  void testMarkPendingAsCommitted() {
+    mgr.addState(MESSAGE1);
+    mgr.addState(MESSAGE2);
+    mgr.markPendingAsCommitted();
+
+    assertTrue(mgr.listPending().isEmpty());
+    assertTrue(mgr.listFlushed().isEmpty());
+    assertEquals(MESSAGE2, mgr.listCommitted().poll());
+  }
+
+}
diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestStreamStateLifecycleManagerTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestStreamStateLifecycleManagerTest.java
new file mode 100644
index 0000000000000..b24350b969c56
--- /dev/null
+++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/dest_state_lifecycle_manager/DestStreamStateLifecycleManagerTest.java
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ */ + +package io.airbyte.integrations.destination.dest_state_lifecycle_manager; + +import static org.junit.jupiter.api.Assertions.*; + +import io.airbyte.commons.json.Jsons; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteMessage.Type; +import io.airbyte.protocol.models.v0.AirbyteStateMessage; +import io.airbyte.protocol.models.v0.AirbyteStateMessage.AirbyteStateType; +import io.airbyte.protocol.models.v0.AirbyteStreamState; +import io.airbyte.protocol.models.v0.StreamDescriptor; +import java.util.LinkedList; +import java.util.List; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class DestStreamStateLifecycleManagerTest { + + private static final AirbyteMessage STREAM1_MESSAGE1 = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage() + .withType(AirbyteStateType.STREAM) + .withStream(new AirbyteStreamState().withStreamDescriptor(new StreamDescriptor().withName("apples")).withStreamState(Jsons.jsonNode("a")))); + private static final AirbyteMessage STREAM1_MESSAGE2 = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage() + .withType(AirbyteStateType.STREAM) + .withStream(new AirbyteStreamState().withStreamDescriptor(new StreamDescriptor().withName("apples")).withStreamState(Jsons.jsonNode("b")))); + private static final AirbyteMessage STREAM2_MESSAGE1 = new AirbyteMessage() + .withType(Type.STATE) + .withState(new AirbyteStateMessage() + .withType(AirbyteStateType.STREAM) + .withStream( + new AirbyteStreamState().withStreamDescriptor(new StreamDescriptor().withName("bananas")).withStreamState(Jsons.jsonNode("10")))); + + private DestStreamStateLifecycleManager mgr; + + @BeforeEach + void setup() { + mgr = new DestStreamStateLifecycleManager(); + } + + /** + * Demonstrates expected lifecycle of a state object for documentation purposes. Subsequent test get + * into the details. + */ + @Test + void testBasicLifeCycle() { + // starts with no state. + assertTrue(mgr.listPending().isEmpty()); + assertTrue(mgr.listFlushed().isEmpty()); + assertTrue(mgr.listCommitted().isEmpty()); + + mgr.addState(STREAM1_MESSAGE1); + // new state supersedes previous ones. we should only see MESSAGE2 for STREAM1 from here on out. + mgr.addState(STREAM1_MESSAGE2); + // different stream, thus does not interact with messages from STREAM1. + mgr.addState(STREAM2_MESSAGE1); + + // after adding a state, it is in pending only. + assertEquals(new LinkedList<>(List.of(STREAM1_MESSAGE2, STREAM2_MESSAGE1)), mgr.listPending()); + assertTrue(mgr.listFlushed().isEmpty()); + assertTrue(mgr.listCommitted().isEmpty()); + + mgr.markPendingAsFlushed(); + + // after flushing the state it is in flushed only. + assertTrue(mgr.listPending().isEmpty()); + assertEquals(new LinkedList<>(List.of(STREAM1_MESSAGE2, STREAM2_MESSAGE1)), mgr.listFlushed()); + assertTrue(mgr.listCommitted().isEmpty()); + + // after committing the state it is in committed only. + mgr.markFlushedAsCommitted(); + + assertTrue(mgr.listPending().isEmpty()); + assertTrue(mgr.listFlushed().isEmpty()); + assertEquals(new LinkedList<>(List.of(STREAM1_MESSAGE2, STREAM2_MESSAGE1)), mgr.listCommitted()); + } + + @Test + void testPending() { + mgr.addState(STREAM1_MESSAGE1); + mgr.addState(STREAM1_MESSAGE2); + mgr.addState(STREAM2_MESSAGE1); + + // verify the LAST message is returned. 
+ assertEquals(new LinkedList<>(List.of(STREAM1_MESSAGE2, STREAM2_MESSAGE1)), mgr.listPending()); + assertTrue(mgr.listFlushed().isEmpty()); + assertTrue(mgr.listCommitted().isEmpty()); + } + + /* + * TODO: remove this test after all destination connectors have updated to reflect destination + * checkpointing changes where flush/commit will be bundled into the same operation + */ + @Deprecated + @Test + void testFlushed() { + mgr.addState(STREAM1_MESSAGE1); + mgr.addState(STREAM1_MESSAGE2); + mgr.addState(STREAM2_MESSAGE1); + mgr.markPendingAsFlushed(); + + assertTrue(mgr.listPending().isEmpty()); + assertEquals(new LinkedList<>(List.of(STREAM1_MESSAGE2, STREAM2_MESSAGE1)), mgr.listFlushed()); + assertTrue(mgr.listCommitted().isEmpty()); + + // verify that multiple calls to markPendingAsFlushed overwrite old states + mgr.addState(STREAM1_MESSAGE1); + mgr.markPendingAsFlushed(); + mgr.markPendingAsFlushed(); + + assertTrue(mgr.listPending().isEmpty()); + assertEquals(new LinkedList<>(List.of(STREAM1_MESSAGE1, STREAM2_MESSAGE1)), mgr.listFlushed()); + assertTrue(mgr.listCommitted().isEmpty()); + } + + @Test + void testCommitted() { + mgr.addState(STREAM1_MESSAGE1); + mgr.addState(STREAM1_MESSAGE2); + mgr.addState(STREAM2_MESSAGE1); + mgr.markPendingAsFlushed(); + mgr.markFlushedAsCommitted(); + + assertTrue(mgr.listPending().isEmpty()); + assertTrue(mgr.listFlushed().isEmpty()); + assertEquals(new LinkedList<>(List.of(STREAM1_MESSAGE2, STREAM2_MESSAGE1)), mgr.listCommitted()); + + // verify that multiple calls to markFlushedAsCommitted overwrite old states + mgr.addState(STREAM1_MESSAGE1); + mgr.markPendingAsFlushed(); + mgr.markFlushedAsCommitted(); + mgr.markFlushedAsCommitted(); + + assertTrue(mgr.listPending().isEmpty()); + assertTrue(mgr.listFlushed().isEmpty()); + assertEquals(new LinkedList<>(List.of(STREAM1_MESSAGE1, STREAM2_MESSAGE1)), mgr.listCommitted()); + } + + /* + * This section is to test for logic that is isolated to changes with respect to destination + * checkpointing where it captures flush and commit are bundled into a transaction so + * + * buffer -(flush buffer)-> staging area -(copy into {staging_file})-> destination raw table + */ + @Test + void testPendingAsCommitted() { + mgr.addState(STREAM1_MESSAGE1); + mgr.markPendingAsCommitted(); + + // verifies that we've skipped "Flushed" without needing to call `markPendingAsFlushed()` and + // `markFlushedAsCommitted` + assertTrue(mgr.listPending().isEmpty()); + assertEquals(new LinkedList<>(List.of(STREAM1_MESSAGE1)), mgr.listCommitted()); + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/record_buffer/InMemoryRecordBufferingStrategyTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/record_buffer/InMemoryRecordBufferingStrategyTest.java new file mode 100644 index 0000000000000..69ede03c8b8a5 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/record_buffer/InMemoryRecordBufferingStrategyTest.java @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */
+
+package io.airbyte.integrations.destination.record_buffer;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import io.airbyte.commons.json.Jsons;
+import io.airbyte.integrations.destination.buffered_stream_consumer.RecordWriter;
+import io.airbyte.protocol.models.v0.AirbyteMessage;
+import io.airbyte.protocol.models.v0.AirbyteRecordMessage;
+import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair;
+import java.util.List;
+import java.util.Optional;
+import org.junit.jupiter.api.Test;
+
+public class InMemoryRecordBufferingStrategyTest {
+
+  private static final JsonNode MESSAGE_DATA = Jsons.deserialize("{ \"field1\": 10000 }");
+  // MESSAGE_DATA should be 64 bytes long, size the buffer such that it can contain at least 2 message
+  // instances
+  private static final int MAX_QUEUE_SIZE_IN_BYTES = 130;
+
+  @SuppressWarnings("unchecked")
+  private final RecordWriter<AirbyteRecordMessage> recordWriter = mock(RecordWriter.class);
+
+  @Test
+  public void testBuffering() throws Exception {
+    final InMemoryRecordBufferingStrategy buffering = new InMemoryRecordBufferingStrategy(recordWriter, MAX_QUEUE_SIZE_IN_BYTES);
+    final AirbyteStreamNameNamespacePair stream1 = new AirbyteStreamNameNamespacePair("stream1", "namespace");
+    final AirbyteStreamNameNamespacePair stream2 = new AirbyteStreamNameNamespacePair("stream2", null);
+    final AirbyteMessage message1 = generateMessage(stream1);
+    final AirbyteMessage message2 = generateMessage(stream2);
+    final AirbyteMessage message3 = generateMessage(stream2);
+    final AirbyteMessage message4 = generateMessage(stream2);
+
+    assertFalse(buffering.addRecord(stream1, message1).isPresent());
+    assertFalse(buffering.addRecord(stream2, message2).isPresent());
+    // Buffer still has room
+    final Optional<BufferFlushType> flushType = buffering.addRecord(stream2, message3);
+    // Keeps track of this #addRecord since we're expecting a buffer flush & that the flushType
+    // value will indicate that all buffers were flushed
+    assertTrue(flushType.isPresent());
+    assertEquals(flushType.get(), BufferFlushType.FLUSH_ALL);
+    // Buffer limit reached, flushing all messages so far before adding the new incoming one
+    verify(recordWriter, times(1)).accept(stream1, List.of(message1.getRecord()));
+    verify(recordWriter, times(1)).accept(stream2, List.of(message2.getRecord()));
+
+    buffering.addRecord(stream2, message4);
+
+    // force flush to terminate test
+    buffering.flushAllBuffers();
+    verify(recordWriter, times(1)).accept(stream2, List.of(message3.getRecord(), message4.getRecord()));
+  }
+
+  private static AirbyteMessage generateMessage(final AirbyteStreamNameNamespacePair stream) {
+    return new AirbyteMessage().withRecord(new AirbyteRecordMessage()
+        .withStream(stream.getName())
+        .withNamespace(stream.getNamespace())
+        .withData(MESSAGE_DATA));
+  }
+
+}
diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/record_buffer/SerializedBufferingStrategyTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/record_buffer/SerializedBufferingStrategyTest.java
new file mode 100644
index 0000000000000..b38953c3c25b7
--- /dev/null
+++ 
b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/destination/record_buffer/SerializedBufferingStrategyTest.java @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.record_buffer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.json.Jsons; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import java.util.Optional; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class SerializedBufferingStrategyTest { + + private static final JsonNode MESSAGE_DATA = Jsons.deserialize("{ \"field1\": 10000 }"); + private static final String STREAM_1 = "stream1"; + private static final String STREAM_2 = "stream2"; + private static final String STREAM_3 = "stream3"; + private static final String STREAM_4 = "stream4"; + + // we set the limit to hold at most 4 messages of 10b total + private static final long MAX_TOTAL_BUFFER_SIZE_BYTES = 42L; + // we set the limit to hold at most 2 messages of 10b per stream + private static final long MAX_PER_STREAM_BUFFER_SIZE_BYTES = 21L; + + private final ConfiguredAirbyteCatalog catalog = mock(ConfiguredAirbyteCatalog.class); + @SuppressWarnings("unchecked") + private final FlushBufferFunction perStreamFlushHook = mock(FlushBufferFunction.class); + + private final SerializableBuffer recordWriter1 = mock(SerializableBuffer.class); + private final SerializableBuffer recordWriter2 = mock(SerializableBuffer.class); + private final SerializableBuffer recordWriter3 = mock(SerializableBuffer.class); + private final SerializableBuffer recordWriter4 = mock(SerializableBuffer.class); + + @BeforeEach + public void setup() throws Exception { + setupMock(recordWriter1); + setupMock(recordWriter2); + setupMock(recordWriter3); + setupMock(recordWriter4); + } + + private void setupMock(final SerializableBuffer mockObject) throws Exception { + when(mockObject.accept(any())).thenReturn(10L); + when(mockObject.getByteCount()).thenReturn(10L); + when(mockObject.getMaxTotalBufferSizeInBytes()).thenReturn(MAX_TOTAL_BUFFER_SIZE_BYTES); + when(mockObject.getMaxPerStreamBufferSizeInBytes()).thenReturn(MAX_PER_STREAM_BUFFER_SIZE_BYTES); + when(mockObject.getMaxConcurrentStreamsInBuffer()).thenReturn(4); + } + + @Test + public void testPerStreamThresholdFlush() throws Exception { + final SerializedBufferingStrategy buffering = new SerializedBufferingStrategy(onCreateBufferFunction(), catalog, perStreamFlushHook); + final AirbyteStreamNameNamespacePair stream1 = new AirbyteStreamNameNamespacePair(STREAM_1, "namespace"); + final AirbyteStreamNameNamespacePair stream2 = new AirbyteStreamNameNamespacePair(STREAM_2, null); + // To test per stream threshold, we are sending multiple test messages on a single stream + final AirbyteMessage message1 = generateMessage(stream1); + 
final AirbyteMessage message2 = generateMessage(stream2); + final AirbyteMessage message3 = generateMessage(stream2); + final AirbyteMessage message4 = generateMessage(stream2); + final AirbyteMessage message5 = generateMessage(stream2); + + when(recordWriter1.getByteCount()).thenReturn(10L); // one record in recordWriter1 + assertFalse(buffering.addRecord(stream1, message1).isPresent()); + when(recordWriter2.getByteCount()).thenReturn(10L); // one record in recordWriter2 + assertFalse(buffering.addRecord(stream2, message2).isPresent()); + + // Total and per stream Buffers still have room + verify(perStreamFlushHook, times(0)).accept(stream1, recordWriter1); + verify(perStreamFlushHook, times(0)).accept(stream2, recordWriter2); + + when(recordWriter2.getByteCount()).thenReturn(20L); // second record in recordWriter2 + assertFalse(buffering.addRecord(stream2, message3).isPresent()); + when(recordWriter2.getByteCount()).thenReturn(30L); // third record in recordWriter2 + + // Buffer reaches limit so a buffer flush occurs returning a buffer flush type of single stream + final Optional flushType = buffering.addRecord(stream2, message4); + assertTrue(flushType.isPresent()); + assertEquals(flushType.get(), BufferFlushType.FLUSH_SINGLE_STREAM); + + // The buffer limit is now reached for stream2, flushing that single stream only + verify(perStreamFlushHook, times(0)).accept(stream1, recordWriter1); + verify(perStreamFlushHook, times(1)).accept(stream2, recordWriter2); + + when(recordWriter2.getByteCount()).thenReturn(10L); // back to one record in recordWriter2 + assertFalse(buffering.addRecord(stream2, message5).isPresent()); + + // force flush to terminate test + buffering.flushAllBuffers(); + verify(perStreamFlushHook, times(1)).accept(stream1, recordWriter1); + verify(perStreamFlushHook, times(2)).accept(stream2, recordWriter2); + } + + @Test + public void testTotalStreamThresholdFlush() throws Exception { + final SerializedBufferingStrategy buffering = new SerializedBufferingStrategy(onCreateBufferFunction(), catalog, perStreamFlushHook); + final AirbyteStreamNameNamespacePair stream1 = new AirbyteStreamNameNamespacePair(STREAM_1, "namespace"); + final AirbyteStreamNameNamespacePair stream2 = new AirbyteStreamNameNamespacePair(STREAM_2, "namespace"); + final AirbyteStreamNameNamespacePair stream3 = new AirbyteStreamNameNamespacePair(STREAM_3, "namespace"); + // To test total stream threshold, we are sending test messages to multiple streams without reaching + // per stream limits + final AirbyteMessage message1 = generateMessage(stream1); + final AirbyteMessage message2 = generateMessage(stream2); + final AirbyteMessage message3 = generateMessage(stream3); + final AirbyteMessage message4 = generateMessage(stream1); + final AirbyteMessage message5 = generateMessage(stream2); + final AirbyteMessage message6 = generateMessage(stream3); + + assertFalse(buffering.addRecord(stream1, message1).isPresent()); + assertFalse(buffering.addRecord(stream2, message2).isPresent()); + // Total and per stream Buffers still have room + verify(perStreamFlushHook, times(0)).accept(stream1, recordWriter1); + verify(perStreamFlushHook, times(0)).accept(stream2, recordWriter2); + verify(perStreamFlushHook, times(0)).accept(stream3, recordWriter3); + + assertFalse(buffering.addRecord(stream3, message3).isPresent()); + when(recordWriter1.getByteCount()).thenReturn(20L); // second record in recordWriter1 + assertFalse(buffering.addRecord(stream1, message4).isPresent()); + 
when(recordWriter2.getByteCount()).thenReturn(20L); // second record in recordWriter2 + + // In response to checkpointing, will need to know what type of buffer flush occurred to mark + // AirbyteStateMessage as committed depending on DestDefaultStateLifecycleManager + final Optional flushType = buffering.addRecord(stream2, message5); + assertTrue(flushType.isPresent()); + assertEquals(flushType.get(), BufferFlushType.FLUSH_ALL); + + // Buffer limit reached for total streams, flushing all streams + verify(perStreamFlushHook, times(1)).accept(stream1, recordWriter1); + verify(perStreamFlushHook, times(1)).accept(stream2, recordWriter2); + verify(perStreamFlushHook, times(1)).accept(stream3, recordWriter3); + + assertFalse(buffering.addRecord(stream3, message6).isPresent()); + // force flush to terminate test + buffering.flushAllBuffers(); + verify(perStreamFlushHook, times(1)).accept(stream1, recordWriter1); + verify(perStreamFlushHook, times(1)).accept(stream2, recordWriter2); + verify(perStreamFlushHook, times(2)).accept(stream3, recordWriter3); + } + + @Test + public void testConcurrentStreamThresholdFlush() throws Exception { + final SerializedBufferingStrategy buffering = new SerializedBufferingStrategy(onCreateBufferFunction(), catalog, perStreamFlushHook); + final AirbyteStreamNameNamespacePair stream1 = new AirbyteStreamNameNamespacePair(STREAM_1, "namespace1"); + final AirbyteStreamNameNamespacePair stream2 = new AirbyteStreamNameNamespacePair(STREAM_2, "namespace2"); + final AirbyteStreamNameNamespacePair stream3 = new AirbyteStreamNameNamespacePair(STREAM_3, null); + final AirbyteStreamNameNamespacePair stream4 = new AirbyteStreamNameNamespacePair(STREAM_4, null); + // To test concurrent stream threshold, we are sending test messages to multiple streams + final AirbyteMessage message1 = generateMessage(stream1); + final AirbyteMessage message2 = generateMessage(stream2); + final AirbyteMessage message3 = generateMessage(stream3); + final AirbyteMessage message4 = generateMessage(stream4); + final AirbyteMessage message5 = generateMessage(stream1); + + assertFalse(buffering.addRecord(stream1, message1).isPresent()); + assertFalse(buffering.addRecord(stream2, message2).isPresent()); + assertFalse(buffering.addRecord(stream3, message3).isPresent()); + // Total and per stream Buffers still have room + verify(perStreamFlushHook, times(0)).accept(stream1, recordWriter1); + verify(perStreamFlushHook, times(0)).accept(stream2, recordWriter2); + verify(perStreamFlushHook, times(0)).accept(stream3, recordWriter3); + + // Since the concurrent stream threshold has been exceeded, all buffer streams are flush + final Optional flushType = buffering.addRecord(stream4, message4); + assertTrue(flushType.isPresent()); + assertEquals(flushType.get(), BufferFlushType.FLUSH_ALL); + + // Buffer limit reached for concurrent streams, flushing all streams + verify(perStreamFlushHook, times(1)).accept(stream1, recordWriter1); + verify(perStreamFlushHook, times(1)).accept(stream2, recordWriter2); + verify(perStreamFlushHook, times(1)).accept(stream3, recordWriter3); + verify(perStreamFlushHook, times(1)).accept(stream4, recordWriter4); + + assertFalse(buffering.addRecord(stream1, message5).isPresent()); + // force flush to terminate test + buffering.flushAllBuffers(); + verify(perStreamFlushHook, times(2)).accept(stream1, recordWriter1); + verify(perStreamFlushHook, times(1)).accept(stream2, recordWriter2); + verify(perStreamFlushHook, times(1)).accept(stream3, recordWriter3); + verify(perStreamFlushHook, 
times(1)).accept(stream4, recordWriter4); + } + + @Test + public void testCreateBufferFailure() { + final SerializedBufferingStrategy buffering = new SerializedBufferingStrategy(onCreateBufferFunction(), catalog, perStreamFlushHook); + final AirbyteStreamNameNamespacePair stream = new AirbyteStreamNameNamespacePair("unknown_stream", "namespace1"); + assertThrows(RuntimeException.class, () -> buffering.addRecord(stream, generateMessage(stream))); + } + + private static AirbyteMessage generateMessage(final AirbyteStreamNameNamespacePair stream) { + return new AirbyteMessage().withRecord(new AirbyteRecordMessage() + .withStream(stream.getName()) + .withNamespace(stream.getNamespace()) + .withData(MESSAGE_DATA)); + } + + private BufferCreateFunction onCreateBufferFunction() { + return (stream, catalog) -> switch (stream.getName()) { + case STREAM_1 -> recordWriter1; + case STREAM_2 -> recordWriter2; + case STREAM_3 -> recordWriter3; + case STREAM_4 -> recordWriter4; + default -> null; + }; + } + +} diff --git a/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/util/ConnectorExceptionUtilTest.java b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/util/ConnectorExceptionUtilTest.java new file mode 100644 index 0000000000000..5371299b4cc61 --- /dev/null +++ b/airbyte-integrations/bases/base-java-async/src/test/java/io/airbyte/integrations/util/ConnectorExceptionUtilTest.java @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.util; + +import static io.airbyte.integrations.util.ConnectorExceptionUtil.COMMON_EXCEPTION_MESSAGE_TEMPLATE; +import static io.airbyte.integrations.util.ConnectorExceptionUtil.RECOVERY_CONNECTION_ERROR_MESSAGE; +import static org.junit.jupiter.api.Assertions.*; + +import io.airbyte.commons.exceptions.ConfigErrorException; +import io.airbyte.commons.exceptions.ConnectionErrorException; +import java.sql.SQLException; +import java.sql.SQLSyntaxErrorException; +import org.junit.jupiter.api.Test; + +class ConnectorExceptionUtilTest { + + public static final String CONFIG_EXCEPTION_MESSAGE = "test message"; + public static final String RECOVERY_EXCEPTION_MESSAGE = "FATAL: terminating connection due to conflict with recovery"; + public static final String COMMON_EXCEPTION_MESSAGE = "something happens with connection"; + public static final String CONNECTION_ERROR_MESSAGE_TEMPLATE = "State code: %s; Error code: %s; Message: %s"; + public static final String UNKNOWN_COLUMN_SQL_EXCEPTION_MESSAGE = "Unknown column 'table.column' in 'field list'"; + + @Test() + void isConfigErrorForConfigException() { + ConfigErrorException configErrorException = new ConfigErrorException(CONFIG_EXCEPTION_MESSAGE); + assertTrue(ConnectorExceptionUtil.isConfigError(configErrorException)); + + } + + @Test + void isConfigErrorForConnectionException() { + ConnectionErrorException connectionErrorException = new ConnectionErrorException(CONFIG_EXCEPTION_MESSAGE); + assertTrue(ConnectorExceptionUtil.isConfigError(connectionErrorException)); + } + + @Test + void isConfigErrorForRecoveryPSQLException() { + SQLException recoveryPSQLException = new SQLException(RECOVERY_EXCEPTION_MESSAGE); + assertTrue(ConnectorExceptionUtil.isConfigError(recoveryPSQLException)); + } + + @Test + void isConfigErrorForUnknownColumnSQLSyntaxErrorException() { + SQLSyntaxErrorException unknownColumnSQLSyntaxErrorException = new SQLSyntaxErrorException(UNKNOWN_COLUMN_SQL_EXCEPTION_MESSAGE); + 
assertTrue(ConnectorExceptionUtil.isConfigError(unknownColumnSQLSyntaxErrorException)); + } + + @Test + void isConfigErrorForCommonSQLException() { + SQLException recoveryPSQLException = new SQLException(COMMON_EXCEPTION_MESSAGE); + assertFalse(ConnectorExceptionUtil.isConfigError(recoveryPSQLException)); + } + + @Test + void isConfigErrorForCommonException() { + assertFalse(ConnectorExceptionUtil.isConfigError(new Exception())); + } + + @Test + void getDisplayMessageForConfigException() { + ConfigErrorException configErrorException = new ConfigErrorException(CONFIG_EXCEPTION_MESSAGE); + String actualDisplayMessage = ConnectorExceptionUtil.getDisplayMessage(configErrorException); + assertEquals(CONFIG_EXCEPTION_MESSAGE, actualDisplayMessage); + } + + @Test + void getDisplayMessageForConnectionError() { + String testCode = "test code"; + int errorCode = -1; + ConnectionErrorException connectionErrorException = new ConnectionErrorException(testCode, errorCode, CONFIG_EXCEPTION_MESSAGE, new Exception()); + String actualDisplayMessage = ConnectorExceptionUtil.getDisplayMessage(connectionErrorException); + assertEquals(String.format(CONNECTION_ERROR_MESSAGE_TEMPLATE, testCode, errorCode, CONFIG_EXCEPTION_MESSAGE), actualDisplayMessage); + } + + @Test + void getDisplayMessageForRecoveryException() { + SQLException recoveryException = new SQLException(RECOVERY_EXCEPTION_MESSAGE); + String actualDisplayMessage = ConnectorExceptionUtil.getDisplayMessage(recoveryException); + assertEquals(RECOVERY_CONNECTION_ERROR_MESSAGE, actualDisplayMessage); + } + + @Test + void getDisplayMessageForUnknownSQLErrorException() { + SQLSyntaxErrorException unknownColumnSQLSyntaxErrorException = new SQLSyntaxErrorException(UNKNOWN_COLUMN_SQL_EXCEPTION_MESSAGE); + String actualDisplayMessage = ConnectorExceptionUtil.getDisplayMessage(unknownColumnSQLSyntaxErrorException); + assertEquals(UNKNOWN_COLUMN_SQL_EXCEPTION_MESSAGE, actualDisplayMessage); + } + + @Test + void getDisplayMessageForCommonException() { + Exception exception = new SQLException(COMMON_EXCEPTION_MESSAGE); + String actualDisplayMessage = ConnectorExceptionUtil.getDisplayMessage(exception); + assertEquals(String.format(COMMON_EXCEPTION_MESSAGE_TEMPLATE, COMMON_EXCEPTION_MESSAGE), actualDisplayMessage); + } + + @Test + void getRootConfigErrorFromConfigException() { + ConfigErrorException configErrorException = new ConfigErrorException(CONFIG_EXCEPTION_MESSAGE); + Exception exception = new Exception(COMMON_EXCEPTION_MESSAGE, configErrorException); + + Throwable actualRootConfigError = ConnectorExceptionUtil.getRootConfigError(exception); + assertEquals(configErrorException, actualRootConfigError); + } + + @Test + void getRootConfigErrorFromRecoverySQLException() { + SQLException recoveryException = new SQLException(RECOVERY_EXCEPTION_MESSAGE); + RuntimeException runtimeException = new RuntimeException(COMMON_EXCEPTION_MESSAGE, recoveryException); + Exception exception = new Exception(runtimeException); + + Throwable actualRootConfigError = ConnectorExceptionUtil.getRootConfigError(exception); + assertEquals(recoveryException, actualRootConfigError); + } + + @Test + void getRootConfigErrorFromUnknownSQLErrorException() { + SQLException unknownSQLErrorException = new SQLSyntaxErrorException(UNKNOWN_COLUMN_SQL_EXCEPTION_MESSAGE); + RuntimeException runtimeException = new RuntimeException(COMMON_EXCEPTION_MESSAGE, unknownSQLErrorException); + Exception exception = new Exception(runtimeException); + + Throwable actualRootConfigError = 
ConnectorExceptionUtil.getRootConfigError(exception); + assertEquals(unknownSQLErrorException, actualRootConfigError); + } + + @Test + void getRootConfigErrorFromNonConfigException() { + SQLException configErrorException = new SQLException(CONFIG_EXCEPTION_MESSAGE); + Exception exception = new Exception(COMMON_EXCEPTION_MESSAGE, configErrorException); + + Throwable actualRootConfigError = ConnectorExceptionUtil.getRootConfigError(exception); + assertEquals(exception, actualRootConfigError); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/build.gradle b/airbyte-integrations/bases/bases-destination-jdbc-async/build.gradle new file mode 100644 index 0000000000000..57c98181a3e96 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/build.gradle @@ -0,0 +1,34 @@ +plugins { + id 'application' + id 'airbyte-docker' + id 'airbyte-integration-test-java' +} + +dependencies { + implementation 'com.google.cloud:google-cloud-storage:1.113.16' + implementation 'com.google.auth:google-auth-library-oauth2-http:0.25.5' + + implementation project(':airbyte-db:db-lib') + implementation project(':airbyte-integrations:bases:base-java') + implementation project(':airbyte-integrations:bases:base-java-s3') + implementation libs.airbyte.protocol + + implementation 'org.apache.commons:commons-lang3:3.11' + implementation 'org.apache.commons:commons-csv:1.4' + implementation 'com.github.alexmojaki:s3-stream-upload:2.2.2' + implementation 'com.fasterxml.jackson.core:jackson-databind' + implementation 'com.azure:azure-storage-blob:12.12.0' + +// A small utility library for working with units of digital information +// https://github.com/aesy/datasize + implementation "io.aesy:datasize:1.0.0" + + testImplementation libs.connectors.testcontainers.postgresql + testImplementation "org.mockito:mockito-inline:4.1.0" + + integrationTestJavaImplementation project(':airbyte-integrations:bases:standard-destination-test') + integrationTestJavaImplementation libs.connectors.testcontainers.postgresql + + implementation files(project(':airbyte-integrations:bases:base-java').airbyteDocker.outputs) + integrationTestJavaImplementation files(project(':airbyte-integrations:bases:base-normalization').airbyteDocker.outputs) +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/readme.md b/airbyte-integrations/bases/bases-destination-jdbc-async/readme.md new file mode 100644 index 0000000000000..90924191b4be8 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/readme.md @@ -0,0 +1,7 @@ +# JDBC Destination + +We are not planning to expose this destination in the UI yet. It serves as a base upon which we can build all of our other JDBC-compliant destinations. + +The reasons we are not exposing this destination by itself are: +1. It is not terribly user-friendly (jdbc urls are hard for a human to parse) +1. Each JDBC-compliant db, we need to make sure the appropriate drivers are installed on the image. We don't want to frontload installing all possible drivers, and instead would like to be more methodical. Instead for each JDBC-compliant destination, we will extend this one and then install only the necessary JDBC drivers on that destination's image. 
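The extension model this readme describes is small in practice: a concrete destination only supplies a JDBC driver class, a naming resolver, SQL operations, and a mapping from its user-facing config to a generic JDBC config. The sketch below is a hypothetical illustration built against the AbstractJdbcDestination API added later in this diff; the ExampleNameTransformer and ExamplePostgresSqlOperations helpers, the PostgreSQL driver string, and the "host"/"port"/"database" config field names are assumptions for illustration, not part of this change.

import java.util.Collections;
import java.util.Map;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.ImmutableMap;

import io.airbyte.commons.json.Jsons;
import io.airbyte.db.jdbc.JdbcUtils;
import io.airbyte.integrations.destination.jdbc.AbstractJdbcDestination;

// Hypothetical sketch only: ExampleNameTransformer and ExamplePostgresSqlOperations are assumed helpers.
public class ExamplePostgresDestination extends AbstractJdbcDestination {

  public ExamplePostgresDestination() {
    // the base class is parameterized by the driver class, a naming resolver and the SQL operations to use
    super("org.postgresql.Driver", new ExampleNameTransformer(), new ExamplePostgresSqlOperations());
  }

  @Override
  protected Map<String, String> getDefaultConnectionProperties(final JsonNode config) {
    // this sketch forces no extra JDBC connection properties
    return Collections.emptyMap();
  }

  @Override
  public JsonNode toJdbcConfig(final JsonNode config) {
    // map the connector's config onto the generic JDBC config (username, jdbc_url) read by the base class;
    // "host", "port" and "database" are assumed field names of the connector's own spec
    return Jsons.jsonNode(ImmutableMap.of(
        JdbcUtils.USERNAME_KEY, config.get(JdbcUtils.USERNAME_KEY).asText(),
        JdbcUtils.JDBC_URL_KEY, String.format("jdbc:postgresql://%s:%s/%s",
            config.get("host").asText(),
            config.get("port").asText(),
            config.get("database").asText())));
  }

}

A real connector built this way would also install only the matching JDBC driver in its own image, which is the per-destination packaging this readme argues for.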
diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/AbstractJdbcDestination.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/AbstractJdbcDestination.java new file mode 100644 index 0000000000000..28079f9f50ea5 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/AbstractJdbcDestination.java @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc; + +import static io.airbyte.integrations.base.errors.messages.ErrorMessage.getErrorMessage; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.exceptions.ConnectionErrorException; +import io.airbyte.commons.json.Jsons; +import io.airbyte.commons.map.MoreMaps; +import io.airbyte.db.factory.DataSourceFactory; +import io.airbyte.db.jdbc.DefaultJdbcDatabase; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.db.jdbc.JdbcUtils; +import io.airbyte.integrations.BaseConnector; +import io.airbyte.integrations.base.AirbyteMessageConsumer; +import io.airbyte.integrations.base.AirbyteTraceMessageUtility; +import io.airbyte.integrations.base.Destination; +import io.airbyte.integrations.destination.NamingConventionTransformer; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus.Status; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import java.sql.SQLException; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.UUID; +import java.util.function.Consumer; +import javax.sql.DataSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class AbstractJdbcDestination extends BaseConnector implements Destination { + + private static final Logger LOGGER = LoggerFactory.getLogger(AbstractJdbcDestination.class); + + private final String driverClass; + private final NamingConventionTransformer namingResolver; + private final SqlOperations sqlOperations; + + protected NamingConventionTransformer getNamingResolver() { + return namingResolver; + } + + protected SqlOperations getSqlOperations() { + return sqlOperations; + } + + public AbstractJdbcDestination(final String driverClass, + final NamingConventionTransformer namingResolver, + final SqlOperations sqlOperations) { + this.driverClass = driverClass; + this.namingResolver = namingResolver; + this.sqlOperations = sqlOperations; + } + + @Override + public AirbyteConnectionStatus check(final JsonNode config) { + final DataSource dataSource = getDataSource(config); + + try { + final JdbcDatabase database = getDatabase(dataSource); + final String outputSchema = namingResolver.getIdentifier(config.get(JdbcUtils.SCHEMA_KEY).asText()); + attemptSQLCreateAndDropTableOperations(outputSchema, database, namingResolver, sqlOperations); + return new AirbyteConnectionStatus().withStatus(Status.SUCCEEDED); + } catch (final ConnectionErrorException ex) { + final String message = getErrorMessage(ex.getStateCode(), ex.getErrorCode(), ex.getExceptionMessage(), ex); + AirbyteTraceMessageUtility.emitConfigErrorTrace(ex, message); + return new AirbyteConnectionStatus() + .withStatus(Status.FAILED) + .withMessage(message); + } catch (final 
Exception e) { + LOGGER.error("Exception while checking connection: ", e); + return new AirbyteConnectionStatus() + .withStatus(Status.FAILED) + .withMessage("Could not connect with provided configuration. \n" + e.getMessage()); + } finally { + try { + DataSourceFactory.close(dataSource); + } catch (final Exception e) { + LOGGER.warn("Unable to close data source.", e); + } + } + } + + /** + * This method is deprecated. It verifies table creation, but not insert right to a newly created + * table. Use attemptTableOperations with the attemptInsert argument instead. + */ + @Deprecated + public static void attemptSQLCreateAndDropTableOperations(final String outputSchema, + final JdbcDatabase database, + final NamingConventionTransformer namingResolver, + final SqlOperations sqlOps) + throws Exception { + attemptTableOperations(outputSchema, database, namingResolver, sqlOps, false); + } + + /** + * Verifies if provided creds has enough permissions. Steps are: 1. Create schema if not exists. 2. + * Create test table. 3. Insert dummy record to newly created table if "attemptInsert" set to true. + * 4. Delete table created on step 2. + * + * @param outputSchema - schema to tests against. + * @param database - database to tests against. + * @param namingResolver - naming resolver. + * @param sqlOps - SqlOperations object + * @param attemptInsert - set true if need to make attempt to insert dummy records to newly created + * table. Set false to skip insert step. + * @throws Exception + */ + public static void attemptTableOperations(final String outputSchema, + final JdbcDatabase database, + final NamingConventionTransformer namingResolver, + final SqlOperations sqlOps, + final boolean attemptInsert) + throws Exception { + // verify we have write permissions on the target schema by creating a table with a random name, + // then dropping that table + try { + // Get metadata from the database to see whether connection is possible + database.bufferedResultSetQuery(conn -> conn.getMetaData().getCatalogs(), JdbcUtils.getDefaultSourceOperations()::rowToJson); + + // verify we have write permissions on the target schema by creating a table with a random name, + // then dropping that table + final String outputTableName = namingResolver.getIdentifier("_airbyte_connection_test_" + UUID.randomUUID().toString().replaceAll("-", "")); + sqlOps.createSchemaIfNotExists(database, outputSchema); + sqlOps.createTableIfNotExists(database, outputSchema, outputTableName); + // verify if user has permission to make SQL INSERT queries + try { + if (attemptInsert) { + sqlOps.insertRecords(database, List.of(getDummyRecord()), outputSchema, outputTableName); + } + } finally { + sqlOps.dropTableIfExists(database, outputSchema, outputTableName); + } + } catch (final SQLException e) { + if (Objects.isNull(e.getCause()) || !(e.getCause() instanceof SQLException)) { + throw new ConnectionErrorException(e.getSQLState(), e.getErrorCode(), e.getMessage(), e); + } else { + final SQLException cause = (SQLException) e.getCause(); + throw new ConnectionErrorException(e.getSQLState(), cause.getErrorCode(), cause.getMessage(), e); + } + } catch (final Exception e) { + throw new Exception(e); + } + } + + /** + * Generates a dummy AirbyteRecordMessage with random values. + * + * @return AirbyteRecordMessage object with dummy values that may be used to test insert permission. 
+ */ + private static AirbyteRecordMessage getDummyRecord() { + final JsonNode dummyDataToInsert = Jsons.deserialize("{ \"field1\": true }"); + return new AirbyteRecordMessage() + .withStream("stream1") + .withData(dummyDataToInsert) + .withEmittedAt(1602637589000L); + } + + protected DataSource getDataSource(final JsonNode config) { + final JsonNode jdbcConfig = toJdbcConfig(config); + return DataSourceFactory.create( + jdbcConfig.get(JdbcUtils.USERNAME_KEY).asText(), + jdbcConfig.has(JdbcUtils.PASSWORD_KEY) ? jdbcConfig.get(JdbcUtils.PASSWORD_KEY).asText() : null, + driverClass, + jdbcConfig.get(JdbcUtils.JDBC_URL_KEY).asText(), + getConnectionProperties(config)); + } + + protected JdbcDatabase getDatabase(final DataSource dataSource) { + return new DefaultJdbcDatabase(dataSource); + } + + protected Map getConnectionProperties(final JsonNode config) { + final Map customProperties = JdbcUtils.parseJdbcParameters(config, JdbcUtils.JDBC_URL_PARAMS_KEY); + final Map defaultProperties = getDefaultConnectionProperties(config); + assertCustomParametersDontOverwriteDefaultParameters(customProperties, defaultProperties); + return MoreMaps.merge(customProperties, defaultProperties); + } + + private void assertCustomParametersDontOverwriteDefaultParameters(final Map customParameters, + final Map defaultParameters) { + for (final String key : defaultParameters.keySet()) { + if (customParameters.containsKey(key) && !Objects.equals(customParameters.get(key), defaultParameters.get(key))) { + throw new IllegalArgumentException("Cannot overwrite default JDBC parameter " + key); + } + } + } + + protected abstract Map getDefaultConnectionProperties(final JsonNode config); + + public abstract JsonNode toJdbcConfig(JsonNode config); + + @Override + public AirbyteMessageConsumer getConsumer(final JsonNode config, + final ConfiguredAirbyteCatalog catalog, + final Consumer outputRecordCollector) { + return JdbcBufferedConsumerFactory.create(outputRecordCollector, getDatabase(getDataSource(config)), sqlOperations, namingResolver, config, + catalog); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/DataAdapter.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/DataAdapter.java new file mode 100644 index 0000000000000..c445eeddacdd4 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/DataAdapter.java @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.util.function.Function; +import java.util.function.Predicate; + +public class DataAdapter { + + private final Predicate filterValueNode; + private final Function valueNodeAdapter; + + /** + * Data adapter allows applying destination data rules. For example, Postgres destination can't + * process text value with \u0000 unicode. You can describe filter condition for a value node and + * function which adapts filtered value nodes. 
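+ * <p>
+ * Illustrative sketch only (not part of this class): assuming Jackson's {@code TextNode} and a
+ * rule that strips the NUL character, such an adapter could be constructed as
+ * <pre>
+ * new DataAdapter(
+ *     node -> node.isTextual() && node.asText().contains("\u0000"),
+ *     node -> TextNode.valueOf(node.asText().replace("\u0000", "")));
+ * </pre>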
+ * + * @param filterValueNode - filter condition which decide which value node should be adapted + * @param valueNodeAdapter - transformation function which returns adapted value node + */ + public DataAdapter( + final Predicate filterValueNode, + final Function valueNodeAdapter) { + this.filterValueNode = filterValueNode; + this.valueNodeAdapter = valueNodeAdapter; + } + + public void adapt(final JsonNode messageData) { + if (messageData != null) { + adaptAllValueNodes(messageData); + } + } + + private void adaptAllValueNodes(final JsonNode rootNode) { + adaptValueNodes(null, rootNode, null); + } + + /** + * The method inspects json node. In case, it's a value node we check the node by CheckFunction and + * apply ValueNodeAdapter. Filtered nodes will be updated by adapted version. If element is an array + * or an object, this we run the method recursively for them. + * + * @param fieldName Name of a json node + * @param node Json node + * @param parentNode Parent json node + */ + private void adaptValueNodes(final String fieldName, final JsonNode node, final JsonNode parentNode) { + if (node.isValueNode() && filterValueNode.test(node)) { + if (fieldName != null) { + final var adaptedNode = valueNodeAdapter.apply(node); + ((ObjectNode) parentNode).set(fieldName, adaptedNode); + } else + throw new RuntimeException("Unexpected value node without fieldName. Node: " + node); + } else if (node.isArray()) { + node.elements().forEachRemaining(arrayNode -> adaptValueNodes(null, arrayNode, node)); + } else { + node.fields().forEachRemaining(stringJsonNodeEntry -> adaptValueNodes(stringJsonNodeEntry.getKey(), stringJsonNodeEntry.getValue(), node)); + } + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/JdbcBufferedConsumerFactory.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/JdbcBufferedConsumerFactory.java new file mode 100644 index 0000000000000..70d53fd806a47 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/JdbcBufferedConsumerFactory.java @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.jdbc; + +import static io.airbyte.integrations.destination.jdbc.constants.GlobalDataSizeConstants.DEFAULT_MAX_BATCH_SIZE_BYTES; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.base.Preconditions; +import io.airbyte.commons.json.Jsons; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.db.jdbc.JdbcUtils; +import io.airbyte.integrations.base.AirbyteMessageConsumer; +import io.airbyte.integrations.destination.NamingConventionTransformer; +import io.airbyte.integrations.destination.buffered_stream_consumer.BufferedStreamConsumer; +import io.airbyte.integrations.destination.buffered_stream_consumer.OnCloseFunction; +import io.airbyte.integrations.destination.buffered_stream_consumer.OnStartFunction; +import io.airbyte.integrations.destination.buffered_stream_consumer.RecordWriter; +import io.airbyte.integrations.destination.record_buffer.InMemoryRecordBufferingStrategy; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.AirbyteStream; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.v0.DestinationSyncMode; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Strategy: + *

+ * 1. Create a final table for each stream
+ *
+ * 2. Accumulate records in a buffer. One buffer per stream
+ *
+ * 3. As records accumulate, write them in batches to the database. We set a minimum number of records
+ * before writing to avoid wasteful record-wise writes. In the case of slow syncs this will be
+ * superseded by a periodic record flush from {@link BufferedStreamConsumer#periodicBufferFlush()}
+ *
+ * 4. Once all records have been written to buffer, flush the buffer and write any remaining records + * to the database (regardless of how few are left) + */ +public class JdbcBufferedConsumerFactory { + + private static final Logger LOGGER = LoggerFactory.getLogger(JdbcBufferedConsumerFactory.class); + + public static AirbyteMessageConsumer create(final Consumer outputRecordCollector, + final JdbcDatabase database, + final SqlOperations sqlOperations, + final NamingConventionTransformer namingResolver, + final JsonNode config, + final ConfiguredAirbyteCatalog catalog) { + final List writeConfigs = createWriteConfigs(namingResolver, config, catalog, sqlOperations.isSchemaRequired()); + + return new BufferedStreamConsumer( + outputRecordCollector, + onStartFunction(database, sqlOperations, writeConfigs), + new InMemoryRecordBufferingStrategy(recordWriterFunction(database, sqlOperations, writeConfigs, catalog), DEFAULT_MAX_BATCH_SIZE_BYTES), + onCloseFunction(database, sqlOperations, writeConfigs), + catalog, + sqlOperations::isValidData); + } + + private static List createWriteConfigs(final NamingConventionTransformer namingResolver, + final JsonNode config, + final ConfiguredAirbyteCatalog catalog, + final boolean schemaRequired) { + if (schemaRequired) { + Preconditions.checkState(config.has("schema"), "jdbc destinations must specify a schema."); + } + final Instant now = Instant.now(); + return catalog.getStreams().stream().map(toWriteConfig(namingResolver, config, now, schemaRequired)).collect(Collectors.toList()); + } + + private static Function toWriteConfig( + final NamingConventionTransformer namingResolver, + final JsonNode config, + final Instant now, + final boolean schemaRequired) { + return stream -> { + Preconditions.checkNotNull(stream.getDestinationSyncMode(), "Undefined destination sync mode"); + final AirbyteStream abStream = stream.getStream(); + + final String defaultSchemaName = schemaRequired ? namingResolver.getIdentifier(config.get("schema").asText()) + : namingResolver.getIdentifier(config.get(JdbcUtils.DATABASE_KEY).asText()); + final String outputSchema = getOutputSchema(abStream, defaultSchemaName, namingResolver); + + final String streamName = abStream.getName(); + final String tableName = namingResolver.getRawTableName(streamName); + final String tmpTableName = namingResolver.getTmpTableName(streamName); + final DestinationSyncMode syncMode = stream.getDestinationSyncMode(); + + final WriteConfig writeConfig = new WriteConfig(streamName, abStream.getNamespace(), outputSchema, tmpTableName, tableName, syncMode); + LOGGER.info("Write config: {}", writeConfig); + + return writeConfig; + }; + } + + /** + * Defer to the {@link AirbyteStream}'s namespace. If this is not set, use the destination's default + * schema. This namespace is source-provided, and can be potentially empty. + *

+ * The logic here matches the logic in the catalog_process.py for Normalization. Any modifications + * need to be reflected there and vice versa. + */ + private static String getOutputSchema(final AirbyteStream stream, + final String defaultDestSchema, + final NamingConventionTransformer namingResolver) { + return stream.getNamespace() != null + ? namingResolver.getNamespace(stream.getNamespace()) + : namingResolver.getNamespace(defaultDestSchema); + } + + /** + * Sets up destination storage through: + *

+ * 1. Creates schema (if not exists)
+ *
+ * 2. Creates airbyte_raw table (if not exists)
+ *

+ * 3. Truncates table if sync mode is in OVERWRITE + * + * @param database JDBC database to connect to + * @param sqlOperations interface for execution SQL queries + * @param writeConfigs settings for each stream + * @return + */ + private static OnStartFunction onStartFunction(final JdbcDatabase database, + final SqlOperations sqlOperations, + final List writeConfigs) { + return () -> { + LOGGER.info("Preparing raw tables in destination started for {} streams", writeConfigs.size()); + final List queryList = new ArrayList<>(); + for (final WriteConfig writeConfig : writeConfigs) { + final String schemaName = writeConfig.getOutputSchemaName(); + final String dstTableName = writeConfig.getOutputTableName(); + LOGGER.info("Preparing raw table in destination started for stream {}. schema: {}, table name: {}", + writeConfig.getStreamName(), + schemaName, + dstTableName); + sqlOperations.createSchemaIfNotExists(database, schemaName); + sqlOperations.createTableIfNotExists(database, schemaName, dstTableName); + switch (writeConfig.getSyncMode()) { + case OVERWRITE -> queryList.add(sqlOperations.truncateTableQuery(database, schemaName, dstTableName)); + case APPEND, APPEND_DEDUP -> {} + default -> throw new IllegalStateException("Unrecognized sync mode: " + writeConfig.getSyncMode()); + } + } + sqlOperations.executeTransaction(database, queryList); + LOGGER.info("Preparing raw tables in destination completed."); + }; + } + + /** + * Writes {@link AirbyteRecordMessage} to JDBC database's airbyte_raw table + * + * @param database JDBC database to connect to + * @param sqlOperations interface of SQL queries to execute + * @param writeConfigs settings for each stream + * @param catalog catalog of all streams to sync + * @return + */ + private static RecordWriter recordWriterFunction(final JdbcDatabase database, + final SqlOperations sqlOperations, + final List writeConfigs, + final ConfiguredAirbyteCatalog catalog) { + final Map pairToWriteConfig = writeConfigs.stream() + .collect(Collectors.toUnmodifiableMap(JdbcBufferedConsumerFactory::toNameNamespacePair, Function.identity())); + + return (pair, records) -> { + if (!pairToWriteConfig.containsKey(pair)) { + throw new IllegalArgumentException( + String.format("Message contained record from a stream that was not in the catalog. 
\ncatalog: %s", Jsons.serialize(catalog))); + } + + final WriteConfig writeConfig = pairToWriteConfig.get(pair); + sqlOperations.insertRecords(database, records, writeConfig.getOutputSchemaName(), writeConfig.getOutputTableName()); + }; + } + + /** + * Closes connection to JDBC database and other tear down functionality + * + * @param database JDBC database to connect to + * @param sqlOperations interface used to execute SQL queries + * @param writeConfigs settings for each stream + * @return + */ + private static OnCloseFunction onCloseFunction(final JdbcDatabase database, + final SqlOperations sqlOperations, + final List writeConfigs) { + return (hasFailed) -> { + if (!hasFailed) { + sqlOperations.onDestinationCloseOperations(database, writeConfigs); + } + }; + } + + private static AirbyteStreamNameNamespacePair toNameNamespacePair(final WriteConfig config) { + return new AirbyteStreamNameNamespacePair(config.getStreamName(), config.getNamespace()); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/JdbcSqlOperations.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/JdbcSqlOperations.java new file mode 100644 index 0000000000000..522c5fda8c47b --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/JdbcSqlOperations.java @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.exceptions.ConfigErrorException; +import io.airbyte.commons.json.Jsons; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.base.JavaBaseConstants; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import java.io.File; +import java.io.PrintWriter; +import java.nio.charset.StandardCharsets; +import java.sql.SQLException; +import java.sql.Timestamp; +import java.time.Instant; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVPrinter; + +@SuppressWarnings("OptionalUsedAsFieldOrParameterType") +public abstract class JdbcSqlOperations implements SqlOperations { + + protected static final String SHOW_SCHEMAS = "show schemas;"; + protected static final String NAME = "name"; + + // this adapter modifies record message before inserting them to the destination + protected final Optional dataAdapter; + private final Set schemaSet = new HashSet<>(); + + protected JdbcSqlOperations() { + this.dataAdapter = Optional.empty(); + } + + protected JdbcSqlOperations(final DataAdapter dataAdapter) { + this.dataAdapter = Optional.of(dataAdapter); + } + + @Override + public void createSchemaIfNotExists(final JdbcDatabase database, final String schemaName) throws Exception { + try { + if (!schemaSet.contains(schemaName) && !isSchemaExists(database, schemaName)) { + database.execute(String.format("CREATE SCHEMA IF NOT EXISTS %s;", schemaName)); + schemaSet.add(schemaName); + } + } catch (Exception e) { + throw checkForKnownConfigExceptions(e).orElseThrow(() -> e); + } + } + + /** + * When an exception occurs, we may recognize it as an issue with the users permissions or other + * configuration options. 
In these cases, we can wrap the exception in a + * {@link ConfigErrorException} which will exclude the error from our on-call paging/reporting + * + * @param e the exception to check. + * @return A ConfigErrorException with a message with actionable feedback to the user. + */ + protected Optional checkForKnownConfigExceptions(Exception e) { + return Optional.empty(); + } + + @Override + public void createTableIfNotExists(final JdbcDatabase database, final String schemaName, final String tableName) throws SQLException { + try { + database.execute(createTableQuery(database, schemaName, tableName)); + } catch (SQLException e) { + throw checkForKnownConfigExceptions(e).orElseThrow(() -> e); + } + } + + @Override + public String createTableQuery(final JdbcDatabase database, final String schemaName, final String tableName) { + return String.format( + "CREATE TABLE IF NOT EXISTS %s.%s ( \n" + + "%s VARCHAR PRIMARY KEY,\n" + + "%s JSONB,\n" + + "%s TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP\n" + + ");\n", + schemaName, tableName, JavaBaseConstants.COLUMN_NAME_AB_ID, JavaBaseConstants.COLUMN_NAME_DATA, JavaBaseConstants.COLUMN_NAME_EMITTED_AT); + } + + protected void writeBatchToFile(final File tmpFile, final List records) throws Exception { + try (final PrintWriter writer = new PrintWriter(tmpFile, StandardCharsets.UTF_8); + final CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT)) { + for (final AirbyteRecordMessage record : records) { + final var uuid = UUID.randomUUID().toString(); + final var jsonData = Jsons.serialize(formatData(record.getData())); + final var emittedAt = Timestamp.from(Instant.ofEpochMilli(record.getEmittedAt())); + csvPrinter.printRecord(uuid, jsonData, emittedAt); + } + } + } + + protected JsonNode formatData(final JsonNode data) { + return data; + } + + @Override + public String truncateTableQuery(final JdbcDatabase database, final String schemaName, final String tableName) { + return String.format("TRUNCATE TABLE %s.%s;\n", schemaName, tableName); + } + + @Override + public String insertTableQuery(final JdbcDatabase database, final String schemaName, final String srcTableName, final String dstTableName) { + return String.format("INSERT INTO %s.%s SELECT * FROM %s.%s;\n", schemaName, dstTableName, schemaName, srcTableName); + } + + @Override + public void executeTransaction(final JdbcDatabase database, final List queries) throws Exception { + final StringBuilder appendedQueries = new StringBuilder(); + appendedQueries.append("BEGIN;\n"); + for (final String query : queries) { + appendedQueries.append(query); + } + appendedQueries.append("COMMIT;"); + database.execute(appendedQueries.toString()); + } + + @Override + public void dropTableIfExists(final JdbcDatabase database, final String schemaName, final String tableName) throws SQLException { + try { + database.execute(dropTableIfExistsQuery(schemaName, tableName)); + } catch (SQLException e) { + throw checkForKnownConfigExceptions(e).orElseThrow(() -> e); + } + } + + private String dropTableIfExistsQuery(final String schemaName, final String tableName) { + return String.format("DROP TABLE IF EXISTS %s.%s;\n", schemaName, tableName); + } + + @Override + public boolean isSchemaRequired() { + return true; + } + + @Override + public boolean isValidData(final JsonNode data) { + return true; + } + + @Override + public final void insertRecords(final JdbcDatabase database, + final List records, + final String schemaName, + final String tableName) + throws Exception { + dataAdapter.ifPresent(adapter -> 
records.forEach(airbyteRecordMessage -> adapter.adapt(airbyteRecordMessage.getData()))); + insertRecordsInternal(database, records, schemaName, tableName); + } + + protected abstract void insertRecordsInternal(JdbcDatabase database, + List records, + String schemaName, + String tableName) + throws Exception; + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/SqlOperations.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/SqlOperations.java new file mode 100644 index 0000000000000..74d8734a71adb --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/SqlOperations.java @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import java.util.List; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * todo (cgardens) - is it necessary to expose so much configurability in this interface. review if + * we can narrow the surface area. + * + * SQL queries required for successfully syncing to a destination connector. These operations + * include the ability to: + *

    + *
+ * <ul>
+ * <li>Write - insert records from source connector</li>
+ * <li>Create - overloaded function but primarily to create tables if they don't exist (e.g. tmp
+ * tables to "stage" records before finalizing to final table)</li>
+ * <li>Drop - removes a table from the schema</li>
+ * <li>Insert - move data from one table to another table - usually used for inserting data from tmp
+ * to final table (aka airbyte_raw)</li>
+ * </ul>
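+ * <p>
+ * For orientation, a typical (illustrative) sequence issued against an implementation of this
+ * interface during a sync, as done by {@link JdbcBufferedConsumerFactory}, looks like:
+ * <pre>
+ * sqlOperations.createSchemaIfNotExists(database, schemaName);
+ * sqlOperations.createTableIfNotExists(database, schemaName, tableName);
+ * sqlOperations.insertRecords(database, records, schemaName, tableName);
+ * </pre>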
+ */ +public interface SqlOperations { + + Logger LOGGER = LoggerFactory.getLogger(JdbcBufferedConsumerFactory.class); + + /** + * Create a schema with provided name if it does not already exist. + * + * @param database Database that the connector is syncing + * @param schemaName Name of schema. + * @throws Exception exception + */ + void createSchemaIfNotExists(JdbcDatabase database, String schemaName) throws Exception; + + /** + * Denotes whether the schema exists in destination database + * + * @param database Database that the connector is syncing + * @param schemaName Name of schema. + * @return true if the schema exists in destination database, false if it doesn't + */ + default boolean isSchemaExists(final JdbcDatabase database, final String schemaName) throws Exception { + return false; + } + + /** + * Create a table with provided name in provided schema if it does not already exist. + * + * @param database Database that the connector is syncing + * @param schemaName Name of schema + * @param tableName Name of table + * @throws Exception exception + */ + void createTableIfNotExists(JdbcDatabase database, String schemaName, String tableName) throws Exception; + + /** + * Query to create a table with provided name in provided schema if it does not already exist. + * + * @param database Database that the connector is syncing + * @param schemaName Name of schema + * @param tableName Name of table + * @return query + */ + String createTableQuery(JdbcDatabase database, String schemaName, String tableName); + + /** + * Drop the table if it exists. + * + * @param schemaName Name of schema + * @param tableName Name of table + * @throws Exception exception + */ + void dropTableIfExists(JdbcDatabase database, String schemaName, String tableName) throws Exception; + + /** + * Query to remove all records from a table. Assumes the table exists. + * + * @param database Database that the connector is syncing + * @param schemaName Name of schema + * @param tableName Name of table + * @return Query + */ + String truncateTableQuery(JdbcDatabase database, String schemaName, String tableName); + + /** + * Insert records into table. Assumes the table exists. + * + * @param database Database that the connector is syncing + * @param records Records to insert. + * @param schemaName Name of schema + * @param tableName Name of table + * @throws Exception exception + */ + void insertRecords(JdbcDatabase database, List records, String schemaName, String tableName) throws Exception; + + /** + * Query to insert all records from source table to destination table. Both tables must be in the + * specified schema. Assumes both table exist. + * + *

+ * NOTE: this is an append-only operation, meaning that data can be duplicated
+ *

+ * + * @param database Database that the connector is syncing + * @param schemaName Name of schema + * @param sourceTableName Name of source table + * @param destinationTableName Name of destination table + * @return SQL Query string + */ + String insertTableQuery(JdbcDatabase database, String schemaName, String sourceTableName, String destinationTableName); + + /** + * Given an arbitrary number of queries, execute a transaction. + * + * @param database Database that the connector is syncing + * @param queries Queries to execute + * @throws Exception exception + */ + void executeTransaction(JdbcDatabase database, List queries) throws Exception; + + /** + * Check if the data record is valid and ok to be written to destination + */ + boolean isValidData(final JsonNode data); + + /** + * Denotes whether the destination has the concept of schema or not + * + * @return true if the destination supports schema (ex: Postgres), false if it doesn't(MySQL) + */ + boolean isSchemaRequired(); + + /** + * The method is responsible for executing some specific DB Engine logic in onClose method. We can + * override this method to execute specific logic e.g. to handle any necessary migrations in the + * destination, etc. + *

+ * In next example you can see how migration from VARCHAR to SUPER column is handled for the + * Redshift destination: + * + * @param database - Database that the connector is interacting with + * @param writeConfigs - schemas and tables (streams) will be discovered + * @see io.airbyte.integrations.destination.redshift.RedshiftSqlOperations#onDestinationCloseOperations + */ + default void onDestinationCloseOperations(final JdbcDatabase database, final List writeConfigs) { + // do nothing + LOGGER.info("No onDestinationCloseOperations required for this destination."); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/SqlOperationsUtils.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/SqlOperationsUtils.java new file mode 100644 index 0000000000000..bb6625e21979a --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/SqlOperationsUtils.java @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Iterables; +import io.airbyte.commons.json.Jsons; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.sql.Timestamp; +import java.time.Instant; +import java.util.List; +import java.util.UUID; +import java.util.function.Supplier; + +public class SqlOperationsUtils { + + /** + * Inserts "raw" records in a single query. The purpose of helper to abstract away database-specific + * SQL syntax from this query. + * + * @param insertQueryComponent the first line of the query e.g. INSERT INTO public.users (ab_id, + * data, emitted_at) + * @param recordQueryComponent query template for a full record e.g. (?, ?::jsonb ?) + * @param jdbcDatabase jdbc database + * @param records records to write + * @throws SQLException exception + */ + public static void insertRawRecordsInSingleQuery(final String insertQueryComponent, + final String recordQueryComponent, + final JdbcDatabase jdbcDatabase, + final List records) + throws SQLException { + insertRawRecordsInSingleQuery(insertQueryComponent, recordQueryComponent, jdbcDatabase, records, UUID::randomUUID, true); + } + + /** + * Inserts "raw" records in a single query. The purpose of helper to abstract away database-specific + * SQL syntax from this query. + * + * This version does not add a semicolon at the end of the INSERT statement. + * + * @param insertQueryComponent the first line of the query e.g. INSERT INTO public.users (ab_id, + * data, emitted_at) + * @param recordQueryComponent query template for a full record e.g. (?, ?::jsonb ?) 
+ * @param jdbcDatabase jdbc database + * @param records records to write + * @throws SQLException exception + */ + public static void insertRawRecordsInSingleQueryNoSem(final String insertQueryComponent, + final String recordQueryComponent, + final JdbcDatabase jdbcDatabase, + final List records) + throws SQLException { + insertRawRecordsInSingleQuery(insertQueryComponent, recordQueryComponent, jdbcDatabase, records, UUID::randomUUID, false); + } + + @VisibleForTesting + static void insertRawRecordsInSingleQuery(final String insertQueryComponent, + final String recordQueryComponent, + final JdbcDatabase jdbcDatabase, + final List records, + final Supplier uuidSupplier, + final boolean sem) + throws SQLException { + if (records.isEmpty()) { + return; + } + + jdbcDatabase.execute(connection -> { + + // Strategy: We want to use PreparedStatement because it handles binding values to the SQL query + // (e.g. handling formatting timestamps). A PreparedStatement statement is created by supplying the + // full SQL string at creation time. Then subsequently specifying which values are bound to the + // string. Thus there will be two loops below. + // 1) Loop over records to build the full string. + // 2) Loop over the records and bind the appropriate values to the string. + // We also partition the query to run on 10k records at a time, since some DBs set a max limit on + // how many records can be inserted at once + // TODO(sherif) this should use a smarter, destination-aware partitioning scheme instead of 10k by + // default + for (List partition : Iterables.partition(records, 10_000)) { + final StringBuilder sql = new StringBuilder(insertQueryComponent); + partition.forEach(r -> sql.append(recordQueryComponent)); + final String s = sql.toString(); + final String s1 = s.substring(0, s.length() - 2) + (sem ? ";" : ""); + + try (final PreparedStatement statement = connection.prepareStatement(s1)) { + // second loop: bind values to the SQL string. + int i = 1; + for (final AirbyteRecordMessage message : partition) { + // 1-indexed + statement.setString(i, uuidSupplier.get().toString()); + statement.setString(i + 1, Jsons.serialize(message.getData())); + statement.setTimestamp(i + 2, Timestamp.from(Instant.ofEpochMilli(message.getEmittedAt()))); + i += 3; + } + + statement.execute(); + } + } + }); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/StagingFilenameGenerator.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/StagingFilenameGenerator.java new file mode 100644 index 0000000000000..e046d71064809 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/StagingFilenameGenerator.java @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc; + +import static io.airbyte.integrations.destination.jdbc.constants.GlobalDataSizeConstants.MAX_FILE_SIZE; + +/** + * The staging file is uploaded to cloud storage in multiple parts. This class keeps track of the + * filename, and returns a new one when the old file has had enough parts. + */ +public class StagingFilenameGenerator { + + private final String streamName; + + // the file suffix will change after the max number of file + // parts have been generated for the current suffix; + // its value starts from 0. 
+ private int currentFileSuffix = 0; + // the number of parts that have been generated for the current + // file suffix; its value range will be [1, maxPartsPerFile] + private int currentFileSuffixPartCount = 0; + + // This variable is responsible to set the size of chunks size (In MB). After chunks created in + // S3 or GCS they will be uploaded to Snowflake or Redshift. These service have some limitations for + // the uploading file. + // So we make the calculation to determine how many parts we can put to the single chunk file. + private final long iterations; + + /** + * @param streamName - the name of table will be processed + * @param chunkSize - the number of optimal chunk size for the service. + */ + public StagingFilenameGenerator(final String streamName, final long chunkSize) { + this.streamName = streamName; + this.iterations = MAX_FILE_SIZE / chunkSize; + } + + /** + * This method is assumed to be called whenever one part of a file is going to be created. The + * currentFileSuffix increments from 0. The currentFileSuffixPartCount cycles from 1 to + * maxPartsPerFile. + */ + public String getStagingFilename() { + if (currentFileSuffixPartCount < iterations) { + // when the number of parts for the file has not reached the max, + // keep using the same file (i.e. keep the suffix) + currentFileSuffixPartCount += 1; + } else { + // otherwise, reset the part counter, and use a different file + // (i.e. update the suffix) + currentFileSuffix += 1; + currentFileSuffixPartCount = 1; + } + return String.format("%s_%05d", streamName, currentFileSuffix); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/WriteConfig.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/WriteConfig.java new file mode 100644 index 0000000000000..5c3876eb601c8 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/WriteConfig.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc; + +import io.airbyte.protocol.models.v0.DestinationSyncMode; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; + +/** + * Write configuration POJO (plain old java object) for all destinations extending + * {@link AbstractJdbcDestination}. 
+ */ +public class WriteConfig { + + private final String streamName; + private final String namespace; + private final String outputSchemaName; + private final String tmpTableName; + private final String outputTableName; + private final DestinationSyncMode syncMode; + private final DateTime writeDatetime; + + public WriteConfig(final String streamName, + final String namespace, + final String outputSchemaName, + final String tmpTableName, + final String outputTableName, + final DestinationSyncMode syncMode) { + this(streamName, namespace, outputSchemaName, tmpTableName, outputTableName, syncMode, DateTime.now(DateTimeZone.UTC)); + } + + public WriteConfig(final String streamName, + final String namespace, + final String outputSchemaName, + final String tmpTableName, + final String outputTableName, + final DestinationSyncMode syncMode, + final DateTime writeDatetime) { + this.streamName = streamName; + this.namespace = namespace; + this.outputSchemaName = outputSchemaName; + this.tmpTableName = tmpTableName; + this.outputTableName = outputTableName; + this.syncMode = syncMode; + this.writeDatetime = writeDatetime; + } + + public String getStreamName() { + return streamName; + } + + public String getNamespace() { + return namespace; + } + + public String getTmpTableName() { + return tmpTableName; + } + + public String getOutputSchemaName() { + return outputSchemaName; + } + + public String getOutputTableName() { + return outputTableName; + } + + public DestinationSyncMode getSyncMode() { + return syncMode; + } + + public DateTime getWriteDatetime() { + return writeDatetime; + } + + @Override + public String toString() { + return "WriteConfig{" + + "streamName=" + streamName + + ", namespace=" + namespace + + ", outputSchemaName=" + outputSchemaName + + ", tmpTableName=" + tmpTableName + + ", outputTableName=" + outputTableName + + ", syncMode=" + syncMode + + '}'; + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/constants/GlobalDataSizeConstants.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/constants/GlobalDataSizeConstants.java new file mode 100644 index 0000000000000..eb20778ab000b --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/constants/GlobalDataSizeConstants.java @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc.constants; + +import io.aesy.datasize.ByteUnit.IEC; +import io.aesy.datasize.DataSize; + +public interface GlobalDataSizeConstants { + + /** 25 MB to BYTES as comparison will be done in BYTES */ + int DEFAULT_MAX_BATCH_SIZE_BYTES = DataSize.of(25L, IEC.MEBIBYTE).toUnit(IEC.BYTE).getValue().intValue(); + /** + * This constant determines the max possible size of file(e.g. 
100 MB / 25 megabytes ≈ 4 chunks of + * file) see StagingFilenameGenerator.java:28 + */ + long MAX_FILE_SIZE = DataSize.of(100L, IEC.MEBIBYTE).toUnit(IEC.BYTE).getValue().longValue(); + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/CopyConsumerFactory.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/CopyConsumerFactory.java new file mode 100644 index 0000000000000..f9dfa14e58f17 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/CopyConsumerFactory.java @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc.copy; + +import static io.airbyte.integrations.destination.jdbc.constants.GlobalDataSizeConstants.DEFAULT_MAX_BATCH_SIZE_BYTES; + +import io.airbyte.db.factory.DataSourceFactory; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.base.AirbyteMessageConsumer; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.buffered_stream_consumer.BufferedStreamConsumer; +import io.airbyte.integrations.destination.buffered_stream_consumer.CheckAndRemoveRecordWriter; +import io.airbyte.integrations.destination.buffered_stream_consumer.OnCloseFunction; +import io.airbyte.integrations.destination.buffered_stream_consumer.OnStartFunction; +import io.airbyte.integrations.destination.buffered_stream_consumer.RecordWriter; +import io.airbyte.integrations.destination.jdbc.SqlOperations; +import io.airbyte.integrations.destination.record_buffer.InMemoryRecordBufferingStrategy; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.function.Consumer; +import javax.sql.DataSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class CopyConsumerFactory { + + private static final Logger LOGGER = LoggerFactory.getLogger(CopyConsumerFactory.class); + + public static AirbyteMessageConsumer create(final Consumer outputRecordCollector, + final DataSource dataSource, + final JdbcDatabase database, + final SqlOperations sqlOperations, + final StandardNameTransformer namingResolver, + final T config, + final ConfiguredAirbyteCatalog catalog, + final StreamCopierFactory streamCopierFactory, + final String defaultSchema) { + final Map pairToCopier = createWriteConfigs( + namingResolver, + config, + catalog, + streamCopierFactory, + defaultSchema, + database, + sqlOperations); + + final Map pairToIgnoredRecordCount = new HashMap<>(); + return new BufferedStreamConsumer( + outputRecordCollector, + onStartFunction(pairToIgnoredRecordCount), + new InMemoryRecordBufferingStrategy( + recordWriterFunction(pairToCopier, sqlOperations, pairToIgnoredRecordCount), + removeStagingFilePrinter(pairToCopier), + DEFAULT_MAX_BATCH_SIZE_BYTES), + onCloseFunction(pairToCopier, database, sqlOperations, pairToIgnoredRecordCount, dataSource), + catalog, + sqlOperations::isValidData); + } + + private static Map createWriteConfigs(final StandardNameTransformer namingResolver, + final T 
config, + final ConfiguredAirbyteCatalog catalog, + final StreamCopierFactory streamCopierFactory, + final String defaultSchema, + final JdbcDatabase database, + final SqlOperations sqlOperations) { + final Map pairToCopier = new HashMap<>(); + final String stagingFolder = UUID.randomUUID().toString(); + for (final var configuredStream : catalog.getStreams()) { + final var stream = configuredStream.getStream(); + final var pair = AirbyteStreamNameNamespacePair.fromAirbyteStream(stream); + final var copier = streamCopierFactory.create(defaultSchema, config, stagingFolder, configuredStream, namingResolver, database, sqlOperations); + + pairToCopier.put(pair, copier); + } + + return pairToCopier; + } + + private static OnStartFunction onStartFunction(final Map pairToIgnoredRecordCount) { + return pairToIgnoredRecordCount::clear; + } + + private static RecordWriter recordWriterFunction(final Map pairToCopier, + final SqlOperations sqlOperations, + final Map pairToIgnoredRecordCount) { + return (AirbyteStreamNameNamespacePair pair, List records) -> { + final var fileName = pairToCopier.get(pair).prepareStagingFile(); + for (final AirbyteRecordMessage recordMessage : records) { + final var id = UUID.randomUUID(); + if (sqlOperations.isValidData(recordMessage.getData())) { + // TODO Truncate json data instead of throwing whole record away? + // or should we upload it into a special rejected record folder in s3 instead? + pairToCopier.get(pair).write(id, recordMessage, fileName); + } else { + pairToIgnoredRecordCount.put(pair, pairToIgnoredRecordCount.getOrDefault(pair, 0L) + 1L); + } + } + }; + } + + private static CheckAndRemoveRecordWriter removeStagingFilePrinter(final Map pairToCopier) { + return (AirbyteStreamNameNamespacePair pair, String stagingFileName) -> { + final String currentFileName = pairToCopier.get(pair).getCurrentFile(); + if (stagingFileName != null && currentFileName != null && !stagingFileName.equals(currentFileName)) { + pairToCopier.get(pair).closeNonCurrentStagingFileWriters(); + } + return currentFileName; + }; + } + + private static OnCloseFunction onCloseFunction(final Map pairToCopier, + final JdbcDatabase database, + final SqlOperations sqlOperations, + final Map pairToIgnoredRecordCount, + final DataSource dataSource) { + return (hasFailed) -> { + pairToIgnoredRecordCount + .forEach((pair, count) -> LOGGER.warn("A total of {} record(s) of data from stream {} were invalid and were ignored.", count, pair)); + closeAsOneTransaction(pairToCopier, hasFailed, database, sqlOperations, dataSource); + }; + } + + private static void closeAsOneTransaction(final Map pairToCopier, + boolean hasFailed, + final JdbcDatabase db, + final SqlOperations sqlOperations, + final DataSource dataSource) + throws Exception { + Exception firstException = null; + final List streamCopiers = new ArrayList<>(pairToCopier.values()); + try { + final List queries = new ArrayList<>(); + for (final var copier : streamCopiers) { + try { + copier.closeStagingUploader(hasFailed); + if (!hasFailed) { + copier.createDestinationSchema(); + copier.createTemporaryTable(); + copier.copyStagingFileToTemporaryTable(); + final var destTableName = copier.createDestinationTable(); + final var mergeQuery = copier.generateMergeStatement(destTableName); + queries.add(mergeQuery); + } + } catch (final Exception e) { + final String message = String.format("Failed to finalize copy to temp table due to: %s", e); + LOGGER.error(message); + hasFailed = true; + if (firstException == null) { + firstException = e; + } + } + } 
+ if (!hasFailed) { + sqlOperations.executeTransaction(db, queries); + } + } finally { + for (final var copier : streamCopiers) { + copier.removeFileAndDropTmpTable(); + } + + DataSourceFactory.close(dataSource); + } + if (firstException != null) { + throw firstException; + } + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/CopyDestination.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/CopyDestination.java new file mode 100644 index 0000000000000..18df4fa5e36be --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/CopyDestination.java @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc.copy; + +import static io.airbyte.integrations.base.errors.messages.ErrorMessage.getErrorMessage; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.exceptions.ConnectionErrorException; +import io.airbyte.db.factory.DataSourceFactory; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.BaseConnector; +import io.airbyte.integrations.base.AirbyteTraceMessageUtility; +import io.airbyte.integrations.base.Destination; +import io.airbyte.integrations.destination.NamingConventionTransformer; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.jdbc.AbstractJdbcDestination; +import io.airbyte.integrations.destination.jdbc.SqlOperations; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus; +import javax.sql.DataSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class CopyDestination extends BaseConnector implements Destination { + + private static final Logger LOGGER = LoggerFactory.getLogger(CopyDestination.class); + + /** + * The default database schema field in the destination config is "schema". To change it, pass the + * field name to the constructor. + */ + private String schemaFieldName = "schema"; + + public CopyDestination() {} + + public CopyDestination(final String schemaFieldName) { + this.schemaFieldName = schemaFieldName; + } + + /** + * A self contained method for writing a file to the persistence for testing. This method should try + * to clean up after itself by deleting the file it creates. + */ + public abstract void checkPersistence(JsonNode config) throws Exception; + + public abstract StandardNameTransformer getNameTransformer(); + + public abstract DataSource getDataSource(JsonNode config); + + public abstract JdbcDatabase getDatabase(DataSource dataSource); + + public abstract SqlOperations getSqlOperations(); + + @Override + public AirbyteConnectionStatus check(final JsonNode config) { + try { + checkPersistence(config); + } catch (final Exception e) { + LOGGER.error("Exception attempting to access the staging persistence: ", e); + return new AirbyteConnectionStatus() + .withStatus(AirbyteConnectionStatus.Status.FAILED) + .withMessage("Could not connect to the staging persistence with the provided configuration. 
\n" + e.getMessage()); + } + + final DataSource dataSource = getDataSource(config); + + try { + final JdbcDatabase database = getDatabase(dataSource); + final var nameTransformer = getNameTransformer(); + final var outputSchema = nameTransformer.convertStreamName(config.get(schemaFieldName).asText()); + performCreateInsertTestOnDestination(outputSchema, database, nameTransformer); + + return new AirbyteConnectionStatus().withStatus(AirbyteConnectionStatus.Status.SUCCEEDED); + } catch (final ConnectionErrorException ex) { + LOGGER.info("Exception while checking connection: ", ex); + final String message = getErrorMessage(ex.getStateCode(), ex.getErrorCode(), ex.getExceptionMessage(), ex); + AirbyteTraceMessageUtility.emitConfigErrorTrace(ex, message); + return new AirbyteConnectionStatus() + .withStatus(AirbyteConnectionStatus.Status.FAILED) + .withMessage(message); + } catch (final Exception e) { + LOGGER.error("Exception attempting to connect to the warehouse: ", e); + return new AirbyteConnectionStatus() + .withStatus(AirbyteConnectionStatus.Status.FAILED) + .withMessage("Could not connect to the warehouse with the provided configuration. \n" + e.getMessage()); + } finally { + try { + DataSourceFactory.close(dataSource); + } catch (final Exception e) { + LOGGER.warn("Unable to close data source.", e); + } + } + } + + protected void performCreateInsertTestOnDestination(final String outputSchema, + final JdbcDatabase database, + final NamingConventionTransformer nameTransformer) + throws Exception { + AbstractJdbcDestination.attemptTableOperations(outputSchema, database, nameTransformer, getSqlOperations(), true); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/StreamCopier.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/StreamCopier.java new file mode 100644 index 0000000000000..a885a42f39b98 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/StreamCopier.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc.copy; + +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import java.util.UUID; + +/** + * StreamCopier is responsible for writing to a staging persistence and providing methods to remove + * the staged data. + */ +public interface StreamCopier { + + /** + * Writes a value to a staging file for the stream. + */ + void write(UUID id, AirbyteRecordMessage recordMessage, String fileName) throws Exception; + + /** + * Closes the writer for the stream to the current staging file. The staging file must be of a + * certain size specified in GlobalDataSizeConstants + one more buffer. The writer for the stream + * will close with a note that no errors were found. + */ + void closeNonCurrentStagingFileWriters() throws Exception; + + /** + * Closes the writer for the stream to the staging persistence. This method should block until all + * buffered data has been written to the persistence. + */ + void closeStagingUploader(boolean hasFailed) throws Exception; + + /** + * Creates a temporary table in the target database. + */ + void createTemporaryTable() throws Exception; + + /** + * Copies the staging file to the temporary table. This method should block until the copy/upload + * has completed. 
+ */ + void copyStagingFileToTemporaryTable() throws Exception; + + /** + * Creates the destination schema if it does not already exist. + */ + void createDestinationSchema() throws Exception; + + /** + * Creates the destination table if it does not already exist. + * + * @return the name of the destination table + */ + String createDestinationTable() throws Exception; + + /** + * Generates a merge SQL statement from the temporary table to the final table. + */ + String generateMergeStatement(String destTableName) throws Exception; + + /** + * Cleans up the copier by removing the staging file and dropping the temporary table after + * completion or failure. + */ + void removeFileAndDropTmpTable() throws Exception; + + /** + * Creates the staging file and all the necessary items to write data to this file. + * + * @return A string that unqiuely identifies the file. E.g. the filename, or a unique suffix that is + * appended to a shared filename prefix + */ + String prepareStagingFile(); + + /** + * @return current staging file name + */ + String getCurrentFile(); + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/StreamCopierFactory.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/StreamCopierFactory.java new file mode 100644 index 0000000000000..6b2247ea8e342 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/StreamCopierFactory.java @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc.copy; + +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.jdbc.SqlOperations; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; + +public interface StreamCopierFactory { + + StreamCopier create(String configuredSchema, + T config, + String stagingFolder, + ConfiguredAirbyteStream configuredStream, + StandardNameTransformer nameTransformer, + JdbcDatabase db, + SqlOperations sqlOperations); + + static String getSchema(final String namespace, final String configuredSchema, final StandardNameTransformer nameTransformer) { + if (namespace != null) { + return nameTransformer.convertStreamName(namespace); + } else { + return nameTransformer.convertStreamName(configuredSchema); + } + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/SwitchingDestination.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/SwitchingDestination.java new file mode 100644 index 0000000000000..8398c5adeafaa --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/SwitchingDestination.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.jdbc.copy; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.base.Preconditions; +import io.airbyte.integrations.BaseConnector; +import io.airbyte.integrations.base.AirbyteMessageConsumer; +import io.airbyte.integrations.base.Destination; +import io.airbyte.protocol.models.v0.AirbyteConnectionStatus; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.function.Consumer; +import java.util.function.Function; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Multiple configs may allow you to sync data to the destination in multiple ways. + * + * One primary example is that the default behavior for some DB-based destinations may use + * INSERT-based destinations while (given additional credentials) it may be able to sync data using + * a file copied to a staging location. + * + * This class exists to make it easy to define a destination in terms of multiple other destination + * implementations, switching between them based on the config provided. + */ +public class SwitchingDestination> extends BaseConnector implements Destination { + + private static final Logger LOGGER = LoggerFactory.getLogger(SwitchingDestination.class); + + private final Function configToType; + private final Map typeToDestination; + + public SwitchingDestination(final Class enumClass, final Function configToType, final Map typeToDestination) { + final Set allEnumConstants = new HashSet<>(Arrays.asList(enumClass.getEnumConstants())); + final Set supportedEnumConstants = typeToDestination.keySet(); + + // check that it isn't possible for configToType to produce something we can't handle + Preconditions.checkArgument(allEnumConstants.equals(supportedEnumConstants)); + + this.configToType = configToType; + this.typeToDestination = typeToDestination; + } + + @Override + public AirbyteConnectionStatus check(final JsonNode config) throws Exception { + final T destinationType = configToType.apply(config); + LOGGER.info("Using destination type: " + destinationType.name()); + return typeToDestination.get(destinationType).check(config); + } + + @Override + public AirbyteMessageConsumer getConsumer(final JsonNode config, + final ConfiguredAirbyteCatalog catalog, + final Consumer outputRecordCollector) + throws Exception { + final T destinationType = configToType.apply(config); + LOGGER.info("Using destination type: " + destinationType.name()); + return typeToDestination.get(destinationType).getConsumer(config, catalog, outputRecordCollector); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/azure/AzureBlobStorageConfig.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/azure/AzureBlobStorageConfig.java new file mode 100644 index 0000000000000..bc72328cd23bb --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/azure/AzureBlobStorageConfig.java @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
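SwitchingDestination is generic over an enum of copy methods and delegates check/getConsumer to whichever implementation the config selects. A minimal, hypothetical wiring is sketched below; the enum values, the `loading_method` field check, and the two delegate destinations are assumptions for illustration, not part of this change.

```java
import com.fasterxml.jackson.databind.JsonNode;
import io.airbyte.integrations.base.Destination;
import io.airbyte.integrations.destination.jdbc.copy.SwitchingDestination;
import java.util.Map;
import java.util.function.Function;

class SwitchingDestinationSketch {

  enum CopyMethod { INSERT, COPY }

  static Destination build(final Destination insertDestination, final Destination copyDestination) {
    // Every enum constant must be covered, or the constructor's precondition check fails.
    final Map<CopyMethod, Destination> delegates = Map.of(
        CopyMethod.INSERT, insertDestination,
        CopyMethod.COPY, copyDestination);
    // Hypothetical rule: use the COPY path when staging credentials are present in the config.
    final Function<JsonNode, CopyMethod> configToType =
        config -> config.has("loading_method") ? CopyMethod.COPY : CopyMethod.INSERT;
    return new SwitchingDestination<>(CopyMethod.class, configToType, delegates);
  }
}
```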
+ */ + +package io.airbyte.integrations.destination.jdbc.copy.azure; + +import com.fasterxml.jackson.databind.JsonNode; +import java.util.Locale; + +public class AzureBlobStorageConfig { + + private static final String DEFAULT_STORAGE_ENDPOINT_DOMAIN_NAME = "blob.core.windows.net"; + + private final String endpointDomainName; + private final String accountName; + private final String containerName; + private final String sasToken; + + public AzureBlobStorageConfig( + String endpointDomainName, + String accountName, + String containerName, + String sasToken) { + this.endpointDomainName = endpointDomainName; + this.accountName = accountName; + this.containerName = containerName; + this.sasToken = sasToken; + } + + public String getEndpointDomainName() { + return endpointDomainName == null ? DEFAULT_STORAGE_ENDPOINT_DOMAIN_NAME : endpointDomainName; + } + + public String getAccountName() { + return accountName; + } + + public String getContainerName() { + return containerName; + } + + public String getSasToken() { + return sasToken; + } + + public String getEndpointUrl() { + return String.format(Locale.ROOT, "https://%s.%s", getAccountName(), getEndpointDomainName()); + } + + public static AzureBlobStorageConfig getAzureBlobConfig(JsonNode config) { + + return new AzureBlobStorageConfig( + config.get("azure_blob_storage_endpoint_domain_name") == null ? DEFAULT_STORAGE_ENDPOINT_DOMAIN_NAME + : config.get("azure_blob_storage_endpoint_domain_name").asText(), + config.get("azure_blob_storage_account_name").asText(), + config.get("azure_blob_storage_container_name").asText(), + config.get("azure_blob_storage_sas_token").asText()); + + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/azure/AzureBlobStorageStreamCopier.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/azure/AzureBlobStorageStreamCopier.java new file mode 100644 index 0000000000000..b16245920981d --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/azure/AzureBlobStorageStreamCopier.java @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
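For reference, getAzureBlobConfig above reads the connector's azure_blob_storage_* fields, and the endpoint domain falls back to blob.core.windows.net when omitted. The values in this sketch are placeholders.

```java
import com.fasterxml.jackson.databind.JsonNode;
import io.airbyte.commons.json.Jsons;
import io.airbyte.integrations.destination.jdbc.copy.azure.AzureBlobStorageConfig;

class AzureBlobStorageConfigSketch {

  static void example() {
    final JsonNode config = Jsons.deserialize("""
        {
          "azure_blob_storage_account_name": "mystorageaccount",
          "azure_blob_storage_container_name": "airbyte-staging",
          "azure_blob_storage_sas_token": "<sas token>"
        }
        """);
    final AzureBlobStorageConfig azureConfig = AzureBlobStorageConfig.getAzureBlobConfig(config);
    // endpoint domain defaults to "blob.core.windows.net" when not provided:
    assert "https://mystorageaccount.blob.core.windows.net".equals(azureConfig.getEndpointUrl());
  }
}
```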
+ */ + +package io.airbyte.integrations.destination.jdbc.copy.azure; + +import com.azure.storage.blob.BlobContainerClient; +import com.azure.storage.blob.specialized.AppendBlobClient; +import com.azure.storage.blob.specialized.SpecializedBlobClientBuilder; +import com.google.common.annotations.VisibleForTesting; +import io.airbyte.commons.json.Jsons; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.jdbc.SqlOperations; +import io.airbyte.integrations.destination.jdbc.StagingFilenameGenerator; +import io.airbyte.integrations.destination.jdbc.constants.GlobalDataSizeConstants; +import io.airbyte.integrations.destination.jdbc.copy.StreamCopier; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.DestinationSyncMode; +import java.io.BufferedOutputStream; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintWriter; +import java.nio.charset.StandardCharsets; +import java.sql.SQLException; +import java.sql.Timestamp; +import java.time.Instant; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; +import java.util.UUID; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVPrinter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class AzureBlobStorageStreamCopier implements StreamCopier { + + private static final Logger LOGGER = LoggerFactory.getLogger(AzureBlobStorageStreamCopier.class); + protected StagingFilenameGenerator filenameGenerator; + protected final String stagingFolder; + protected final Set azureStagingFiles = new HashSet<>(); + protected final AzureBlobStorageConfig azureBlobConfig; + protected final String tmpTableName; + protected final String schemaName; + protected final String streamName; + protected final JdbcDatabase db; + protected final Set activeStagingWriterFileNames = new HashSet<>(); + private final StandardNameTransformer nameTransformer; + private final SqlOperations sqlOperations; + private final DestinationSyncMode destSyncMode; + private final SpecializedBlobClientBuilder specializedBlobClientBuilder; + private final HashMap csvPrinters = new HashMap<>(); + private final HashMap blobClients = new HashMap<>(); + private String currentFile; + + public AzureBlobStorageStreamCopier(final String stagingFolder, + final DestinationSyncMode destSyncMode, + final String schema, + final String streamName, + final SpecializedBlobClientBuilder specializedBlobClientBuilder, + final JdbcDatabase db, + final AzureBlobStorageConfig azureBlobConfig, + final StandardNameTransformer nameTransformer, + final SqlOperations sqlOperations) { + this.stagingFolder = stagingFolder; + this.destSyncMode = destSyncMode; + this.schemaName = schema; + this.streamName = streamName; + this.db = db; + this.nameTransformer = nameTransformer; + this.sqlOperations = sqlOperations; + this.tmpTableName = nameTransformer.getTmpTableName(streamName); + this.specializedBlobClientBuilder = specializedBlobClientBuilder; + this.azureBlobConfig = azureBlobConfig; + this.filenameGenerator = new StagingFilenameGenerator(streamName, GlobalDataSizeConstants.DEFAULT_MAX_BATCH_SIZE_BYTES); + } + + public static void attemptAzureBlobWriteAndDelete(final AzureBlobStorageConfig config) { + AppendBlobClient appendBlobClient = null; + try { + appendBlobClient = new SpecializedBlobClientBuilder() + .endpoint(config.getEndpointUrl()) + 
.sasToken(config.getSasToken()) + .containerName(config.getContainerName()) + .blobName("testAzureBlob" + UUID.randomUUID()) + .buildAppendBlobClient(); + + final BlobContainerClient containerClient = getBlobContainerClient(appendBlobClient); + writeTestDataIntoBlob(appendBlobClient); + listCreatedBlob(containerClient); + } finally { + if (appendBlobClient != null && appendBlobClient.exists()) { + LOGGER.info("Deleting blob: " + appendBlobClient.getBlobName()); + appendBlobClient.delete(); + } + } + + } + + private static void listCreatedBlob(final BlobContainerClient containerClient) { + containerClient.listBlobs().forEach(blobItem -> LOGGER.info("Blob name: " + blobItem.getName() + "Snapshot: " + blobItem.getSnapshot())); + } + + private static void writeTestDataIntoBlob(final AppendBlobClient appendBlobClient) { + final String test = "test_data"; + LOGGER.info("Writing test data to Azure Blob storage: " + test); + final InputStream dataStream = new ByteArrayInputStream(test.getBytes(StandardCharsets.UTF_8)); + + final Integer blobCommittedBlockCount = appendBlobClient.appendBlock(dataStream, test.length()) + .getBlobCommittedBlockCount(); + + LOGGER.info("blobCommittedBlockCount: " + blobCommittedBlockCount); + } + + private static BlobContainerClient getBlobContainerClient(final AppendBlobClient appendBlobClient) { + final BlobContainerClient containerClient = appendBlobClient.getContainerClient(); + if (!containerClient.exists()) { + containerClient.create(); + } + + if (!appendBlobClient.exists()) { + appendBlobClient.create(); + LOGGER.info("blobContainerClient created"); + } else { + LOGGER.info("blobContainerClient already exists"); + } + return containerClient; + } + + public Set getAzureStagingFiles() { + return azureStagingFiles; + } + + @Override + public void write(final UUID id, final AirbyteRecordMessage recordMessage, final String azureFileName) throws Exception { + if (csvPrinters.containsKey(azureFileName)) { + csvPrinters.get(azureFileName).printRecord(id, + Jsons.serialize(recordMessage.getData()), + Timestamp.from(Instant.ofEpochMilli(recordMessage.getEmittedAt()))); + } + } + + @Override + public String prepareStagingFile() { + currentFile = prepareAzureStagingFile(); + if (!azureStagingFiles.contains(currentFile)) { + + azureStagingFiles.add(currentFile); + activeStagingWriterFileNames.add(currentFile); + + final AppendBlobClient appendBlobClient = specializedBlobClientBuilder + .blobName(currentFile) + .buildAppendBlobClient(); + blobClients.put(currentFile, appendBlobClient); + appendBlobClient.create(true); + + final BufferedOutputStream bufferedOutputStream = + new BufferedOutputStream(appendBlobClient.getBlobOutputStream(), Math.toIntExact(GlobalDataSizeConstants.MAX_FILE_SIZE)); + final var writer = new PrintWriter(bufferedOutputStream, true, StandardCharsets.UTF_8); + try { + csvPrinters.put(currentFile, new CSVPrinter(writer, CSVFormat.DEFAULT)); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + return currentFile; + } + + private String prepareAzureStagingFile() { + return String.join("/", stagingFolder, schemaName, filenameGenerator.getStagingFilename()); + } + + @Override + public void closeStagingUploader(final boolean hasFailed) throws Exception { + LOGGER.info("Uploading remaining data for {} stream.", streamName); + for (final var csvPrinter : csvPrinters.values()) { + csvPrinter.close(); + } + LOGGER.info("All data for {} stream uploaded.", streamName); + } + + @Override + public void createDestinationSchema() throws 
Exception { + LOGGER.info("Creating schema in destination if it doesn't exist: {}", schemaName); + sqlOperations.createSchemaIfNotExists(db, schemaName); + } + + @Override + public void createTemporaryTable() throws Exception { + LOGGER.info("Preparing tmp table in destination for stream: {}, schema: {}, tmp table name: {}.", streamName, schemaName, tmpTableName); + sqlOperations.createTableIfNotExists(db, schemaName, tmpTableName); + } + + @Override + public void copyStagingFileToTemporaryTable() throws Exception { + LOGGER.info("Starting copy to tmp table: {} in destination for stream: {}, schema: {}.", tmpTableName, streamName, schemaName); + for (final var azureStagingFile : azureStagingFiles) { + copyAzureBlobCsvFileIntoTable(db, getFullAzurePath(azureStagingFile), schemaName, tmpTableName, azureBlobConfig); + } + LOGGER.info("Copy to tmp table {} in destination for stream {} complete.", tmpTableName, streamName); + } + + private String getFullAzurePath(final String azureStagingFile) { + return "azure://" + azureBlobConfig.getAccountName() + "." + azureBlobConfig.getEndpointDomainName() + + "/" + azureBlobConfig.getContainerName() + "/" + azureStagingFile; + } + + @Override + public String createDestinationTable() throws Exception { + final var destTableName = nameTransformer.getRawTableName(streamName); + LOGGER.info("Preparing table {} in destination.", destTableName); + sqlOperations.createTableIfNotExists(db, schemaName, destTableName); + LOGGER.info("Table {} in destination prepared.", tmpTableName); + + return destTableName; + } + + @Override + public String generateMergeStatement(final String destTableName) throws Exception { + LOGGER.info("Preparing to merge tmp table {} to dest table: {}, schema: {}, in destination.", tmpTableName, destTableName, schemaName); + final var queries = new StringBuilder(); + if (destSyncMode.equals(DestinationSyncMode.OVERWRITE)) { + queries.append(sqlOperations.truncateTableQuery(db, schemaName, destTableName)); + LOGGER.info("Destination OVERWRITE mode detected. 
Dest table: {}, schema: {}, truncated.", destTableName, schemaName); + } + queries.append(sqlOperations.insertTableQuery(db, schemaName, tmpTableName, destTableName)); + return queries.toString(); + } + + @Override + public void removeFileAndDropTmpTable() throws Exception { + LOGGER.info("Begin cleaning azure blob staging files."); + for (final AppendBlobClient appendBlobClient : blobClients.values()) { + appendBlobClient.delete(); + } + LOGGER.info("Azure Blob staging files cleaned."); + + LOGGER.info("Begin cleaning {} tmp table in destination.", tmpTableName); + sqlOperations.dropTableIfExists(db, schemaName, tmpTableName); + LOGGER.info("{} tmp table in destination cleaned.", tmpTableName); + } + + @Override + public void closeNonCurrentStagingFileWriters() throws Exception { + LOGGER.info("Begin closing non current file writers"); + final Set removedKeys = new HashSet<>(); + for (final String key : activeStagingWriterFileNames) { + if (!key.equals(currentFile)) { + csvPrinters.get(key).close(); + csvPrinters.remove(key); + removedKeys.add(key); + } + } + activeStagingWriterFileNames.removeAll(removedKeys); + } + + @Override + public String getCurrentFile() { + return currentFile; + } + + @VisibleForTesting + public String getTmpTableName() { + return tmpTableName; + } + + public abstract void copyAzureBlobCsvFileIntoTable(JdbcDatabase database, + String snowflakeAzureExternalStageName, + String schema, + String tableName, + AzureBlobStorageConfig config) + throws SQLException; + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/azure/AzureBlobStorageStreamCopierFactory.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/azure/AzureBlobStorageStreamCopierFactory.java new file mode 100644 index 0000000000000..dbed77a0ea12a --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/azure/AzureBlobStorageStreamCopierFactory.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
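To make the Azure path layout above concrete: prepareAzureStagingFile() joins the staging folder, schema, and generated file name, and getFullAzurePath() prefixes the account, endpoint domain, and container for the COPY into the tmp table. The names below are placeholders.

```java
// Illustrative values only.
String blobName = String.join("/", "staging-4242", "public", "public_users_00000");
// blobName -> "staging-4242/public/public_users_00000"

String fullAzurePath = "azure://" + "mystorageaccount" + "." + "blob.core.windows.net"
    + "/" + "airbyte-staging" + "/" + blobName;
// fullAzurePath -> "azure://mystorageaccount.blob.core.windows.net/airbyte-staging/staging-4242/public/public_users_00000"
```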
+ */ + +package io.airbyte.integrations.destination.jdbc.copy.azure; + +import com.azure.storage.blob.specialized.SpecializedBlobClientBuilder; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.jdbc.SqlOperations; +import io.airbyte.integrations.destination.jdbc.copy.StreamCopier; +import io.airbyte.integrations.destination.jdbc.copy.StreamCopierFactory; +import io.airbyte.protocol.models.v0.AirbyteStream; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.v0.DestinationSyncMode; + +public abstract class AzureBlobStorageStreamCopierFactory implements StreamCopierFactory { + + @Override + public StreamCopier create(String configuredSchema, + AzureBlobStorageConfig azureBlobConfig, + String stagingFolder, + ConfiguredAirbyteStream configuredStream, + StandardNameTransformer nameTransformer, + JdbcDatabase db, + SqlOperations sqlOperations) { + try { + AirbyteStream stream = configuredStream.getStream(); + DestinationSyncMode syncMode = configuredStream.getDestinationSyncMode(); + String schema = StreamCopierFactory.getSchema(stream.getNamespace(), configuredSchema, nameTransformer); + String streamName = stream.getName(); + + final SpecializedBlobClientBuilder specializedBlobClientBuilder = new SpecializedBlobClientBuilder() + .endpoint(azureBlobConfig.getEndpointUrl()) + .sasToken(azureBlobConfig.getSasToken()) + .containerName(azureBlobConfig.getContainerName()); + + return create(stagingFolder, syncMode, schema, streamName, specializedBlobClientBuilder, db, azureBlobConfig, nameTransformer, sqlOperations); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public abstract StreamCopier create(String stagingFolder, + DestinationSyncMode syncMode, + String schema, + String streamName, + SpecializedBlobClientBuilder specializedBlobClientBuilder, + JdbcDatabase db, + AzureBlobStorageConfig azureBlobConfig, + StandardNameTransformer nameTransformer, + SqlOperations sqlOperations) + throws Exception; + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/gcs/GcsConfig.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/gcs/GcsConfig.java new file mode 100644 index 0000000000000..899458b991370 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/gcs/GcsConfig.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
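The factory above resolves the output schema through StreamCopierFactory.getSchema, which prefers the stream's namespace and otherwise falls back to the configured default schema. A small illustration with placeholder names:

```java
StandardNameTransformer nameTransformer = new StandardNameTransformer();
StreamCopierFactory.getSchema("sales", "public", nameTransformer);  // -> "sales"
StreamCopierFactory.getSchema(null, "public", nameTransformer);     // -> "public"
```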
+ */ + +package io.airbyte.integrations.destination.jdbc.copy.gcs; + +import com.fasterxml.jackson.databind.JsonNode; + +public class GcsConfig { + + private final String projectId; + private final String bucketName; + private final String credentialsJson; + + public GcsConfig(final String projectId, final String bucketName, final String credentialsJson) { + this.projectId = projectId; + this.bucketName = bucketName; + this.credentialsJson = credentialsJson; + } + + public String getProjectId() { + return projectId; + } + + public String getBucketName() { + return bucketName; + } + + public String getCredentialsJson() { + return credentialsJson; + } + + public static GcsConfig getGcsConfig(final JsonNode config) { + return new GcsConfig( + config.get("loading_method").get("project_id").asText(), + config.get("loading_method").get("bucket_name").asText(), + config.get("loading_method").get("credentials_json").asText()); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/gcs/GcsStreamCopier.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/gcs/GcsStreamCopier.java new file mode 100644 index 0000000000000..11574abd07b19 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/gcs/GcsStreamCopier.java @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc.copy.gcs; + +import com.google.auth.oauth2.GoogleCredentials; +import com.google.cloud.WriteChannel; +import com.google.cloud.storage.BlobId; +import com.google.cloud.storage.BlobInfo; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageOptions; +import com.google.common.annotations.VisibleForTesting; +import io.airbyte.commons.json.Jsons; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.jdbc.SqlOperations; +import io.airbyte.integrations.destination.jdbc.StagingFilenameGenerator; +import io.airbyte.integrations.destination.jdbc.constants.GlobalDataSizeConstants; +import io.airbyte.integrations.destination.jdbc.copy.StreamCopier; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.DestinationSyncMode; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.nio.channels.Channels; +import java.nio.charset.StandardCharsets; +import java.sql.SQLException; +import java.sql.Timestamp; +import java.time.Instant; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; +import java.util.UUID; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVPrinter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class GcsStreamCopier implements StreamCopier { + + private static final Logger LOGGER = LoggerFactory.getLogger(GcsStreamCopier.class); + // It is optimal to write every 10,000,000 records (BATCH_SIZE * MAX_PER_FILE_PART_COUNT) to a new + // file. + // The BATCH_SIZE is defined in CopyConsumerFactory. + // The average size of such a file will be about 1 GB. + // This will make it easier to work with files and speed up the recording of large amounts of data. 
+ // In addition, for a large number of records, we will not get a drop in the copy request to + // QUERY_TIMEOUT when + // the records from the file are copied to the staging table. + public static final int MAX_PARTS_PER_FILE = 1000; + protected final GcsConfig gcsConfig; + protected final String tmpTableName; + protected final String schemaName; + protected final String streamName; + protected final JdbcDatabase db; + protected final Set gcsStagingFiles = new HashSet<>(); + protected final String stagingFolder; + protected StagingFilenameGenerator filenameGenerator; + private final Storage storageClient; + private final DestinationSyncMode destSyncMode; + private final StandardNameTransformer nameTransformer; + private final SqlOperations sqlOperations; + private final HashMap channels = new HashMap<>(); + private final HashMap csvPrinters = new HashMap<>(); + + public GcsStreamCopier(final String stagingFolder, + final DestinationSyncMode destSyncMode, + final String schema, + final String streamName, + final Storage storageClient, + final JdbcDatabase db, + final GcsConfig gcsConfig, + final StandardNameTransformer nameTransformer, + final SqlOperations sqlOperations) { + this.destSyncMode = destSyncMode; + this.schemaName = schema; + this.streamName = streamName; + this.stagingFolder = stagingFolder; + this.db = db; + this.nameTransformer = nameTransformer; + this.sqlOperations = sqlOperations; + this.tmpTableName = nameTransformer.getTmpTableName(streamName); + this.storageClient = storageClient; + this.gcsConfig = gcsConfig; + this.filenameGenerator = new StagingFilenameGenerator(streamName, GlobalDataSizeConstants.DEFAULT_MAX_BATCH_SIZE_BYTES); + } + + private String prepareGcsStagingFile() { + return String.join("/", stagingFolder, schemaName, filenameGenerator.getStagingFilename()); + } + + @Override + public String prepareStagingFile() { + final var name = prepareGcsStagingFile(); + if (!gcsStagingFiles.contains(name)) { + gcsStagingFiles.add(name); + final var blobId = BlobId.of(gcsConfig.getBucketName(), name); + final var blobInfo = BlobInfo.newBuilder(blobId).build(); + final var blob = storageClient.create(blobInfo); + final var channel = blob.writer(); + channels.put(name, channel); + final OutputStream outputStream = Channels.newOutputStream(channel); + + final var writer = new PrintWriter(outputStream, true, StandardCharsets.UTF_8); + try { + csvPrinters.put(name, new CSVPrinter(writer, CSVFormat.DEFAULT)); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + return name; + } + + @Override + public void write(final UUID id, final AirbyteRecordMessage recordMessage, final String gcsFileName) throws Exception { + if (csvPrinters.containsKey(gcsFileName)) { + csvPrinters.get(gcsFileName).printRecord(id, + Jsons.serialize(recordMessage.getData()), + Timestamp.from(Instant.ofEpochMilli(recordMessage.getEmittedAt()))); + } + } + + @Override + public void closeNonCurrentStagingFileWriters() throws Exception { + // TODO need to update this method when updating whole class for using GcsWriter + } + + @Override + public void closeStagingUploader(final boolean hasFailed) throws Exception { + LOGGER.info("Uploading remaining data for {} stream.", streamName); + for (final var csvPrinter : csvPrinters.values()) { + csvPrinter.close(); + } + for (final var channel : channels.values()) { + channel.close(); + } + LOGGER.info("All data for {} stream uploaded.", streamName); + } + + @Override + public void copyStagingFileToTemporaryTable() throws Exception { + 
LOGGER.info("Starting copy to tmp table: {} in destination for stream: {}, schema: {}.", tmpTableName, streamName, schemaName); + for (final var gcsStagingFile : gcsStagingFiles) { + copyGcsCsvFileIntoTable(db, getFullGcsPath(gcsConfig.getBucketName(), gcsStagingFile), schemaName, tmpTableName, gcsConfig); + } + LOGGER.info("Copy to tmp table {} in destination for stream {} complete.", tmpTableName, streamName); + } + + @Override + public void removeFileAndDropTmpTable() throws Exception { + for (final var gcsStagingFile : gcsStagingFiles) { + LOGGER.info("Begin cleaning gcs staging file {}.", gcsStagingFile); + final var blobId = BlobId.of(gcsConfig.getBucketName(), gcsStagingFile); + if (storageClient.get(blobId).exists()) { + storageClient.delete(blobId); + } + LOGGER.info("GCS staging file {} cleaned.", gcsStagingFile); + } + + LOGGER.info("Begin cleaning {} tmp table in destination.", tmpTableName); + sqlOperations.dropTableIfExists(db, schemaName, tmpTableName); + LOGGER.info("{} tmp table in destination cleaned.", tmpTableName); + } + + @Override + public void createDestinationSchema() throws Exception { + LOGGER.info("Creating schema in destination if it doesn't exist: {}", schemaName); + sqlOperations.createSchemaIfNotExists(db, schemaName); + } + + @Override + public void createTemporaryTable() throws Exception { + LOGGER.info("Preparing tmp table in destination for stream: {}, schema: {}, tmp table name: {}.", streamName, schemaName, tmpTableName); + sqlOperations.createTableIfNotExists(db, schemaName, tmpTableName); + } + + @Override + public String createDestinationTable() throws Exception { + final var destTableName = nameTransformer.getRawTableName(streamName); + LOGGER.info("Preparing table {} in destination.", destTableName); + sqlOperations.createTableIfNotExists(db, schemaName, destTableName); + LOGGER.info("Table {} in destination prepared.", tmpTableName); + + return destTableName; + } + + @Override + public String generateMergeStatement(final String destTableName) throws Exception { + LOGGER.info("Preparing to merge tmp table {} to dest table: {}, schema: {}, in destination.", tmpTableName, destTableName, schemaName); + final var queries = new StringBuilder(); + if (destSyncMode.equals(DestinationSyncMode.OVERWRITE)) { + queries.append(sqlOperations.truncateTableQuery(db, schemaName, destTableName)); + LOGGER.info("Destination OVERWRITE mode detected. 
Dest table: {}, schema: {}, will be truncated.", destTableName, schemaName); + } + queries.append(sqlOperations.insertTableQuery(db, schemaName, tmpTableName, destTableName)); + return queries.toString(); + } + + @Override + public String getCurrentFile() { + // TODO need to update this method when updating whole class for using GcsWriter + return null; + } + + private static String getFullGcsPath(final String bucketName, final String stagingFile) { + // this is intentionally gcs:/ not gcs:// since the join adds the additional slash + return String.join("/", "gcs:/", bucketName, stagingFile); + } + + public static void attemptWriteToPersistence(final GcsConfig gcsConfig) throws IOException { + final String outputTableName = "_airbyte_connection_test_" + UUID.randomUUID().toString().replaceAll("-", ""); + attemptWriteAndDeleteGcsObject(gcsConfig, outputTableName); + } + + private static void attemptWriteAndDeleteGcsObject(final GcsConfig gcsConfig, final String outputTableName) throws IOException { + final var storage = getStorageClient(gcsConfig); + final var blobId = BlobId.of(gcsConfig.getBucketName(), "check-content/" + outputTableName); + final var blobInfo = BlobInfo.newBuilder(blobId).build(); + + storage.create(blobInfo, "".getBytes(StandardCharsets.UTF_8)); + storage.delete(blobId); + } + + public static Storage getStorageClient(final GcsConfig gcsConfig) throws IOException { + final InputStream credentialsInputStream = new ByteArrayInputStream(gcsConfig.getCredentialsJson().getBytes(StandardCharsets.UTF_8)); + final GoogleCredentials credentials = GoogleCredentials.fromStream(credentialsInputStream); + return StorageOptions.newBuilder() + .setCredentials(credentials) + .setProjectId(gcsConfig.getProjectId()) + .build() + .getService(); + } + + @VisibleForTesting + public String getTmpTableName() { + return tmpTableName; + } + + @VisibleForTesting + public Set getGcsStagingFiles() { + return gcsStagingFiles; + } + + public abstract void copyGcsCsvFileIntoTable(JdbcDatabase database, + String gcsFileLocation, + String schema, + String tableName, + GcsConfig gcsConfig) + throws SQLException; + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/gcs/GcsStreamCopierFactory.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/gcs/GcsStreamCopierFactory.java new file mode 100644 index 0000000000000..df1cbad060f4b --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/gcs/GcsStreamCopierFactory.java @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
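Two small notes on the GCS pieces above, with placeholder values: getFullGcsPath deliberately joins "gcs:/" (single slash) so the join supplies the second slash before the bucket, and GcsConfig reads its settings from the connector's loading_method block.

```java
String fullGcsPath = String.join("/", "gcs:/", "airbyte-staging-bucket", "staging-4242/public/file_00000");
// fullGcsPath -> "gcs://airbyte-staging-bucket/staging-4242/public/file_00000"

JsonNode config = Jsons.deserialize("""
    {
      "loading_method": {
        "project_id": "my-gcp-project",
        "bucket_name": "airbyte-staging-bucket",
        "credentials_json": "<service account key JSON>"
      }
    }
    """);
GcsConfig gcsConfig = GcsConfig.getGcsConfig(config);  // bucket: "airbyte-staging-bucket"
```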
+ */ + +package io.airbyte.integrations.destination.jdbc.copy.gcs; + +import com.google.auth.oauth2.GoogleCredentials; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageOptions; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.jdbc.SqlOperations; +import io.airbyte.integrations.destination.jdbc.copy.StreamCopier; +import io.airbyte.integrations.destination.jdbc.copy.StreamCopierFactory; +import io.airbyte.protocol.models.v0.AirbyteStream; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.v0.DestinationSyncMode; +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; + +public abstract class GcsStreamCopierFactory implements StreamCopierFactory { + + /** + * Used by the copy consumer. + */ + @Override + public StreamCopier create(final String configuredSchema, + final GcsConfig gcsConfig, + final String stagingFolder, + final ConfiguredAirbyteStream configuredStream, + final StandardNameTransformer nameTransformer, + final JdbcDatabase db, + final SqlOperations sqlOperations) { + try { + final AirbyteStream stream = configuredStream.getStream(); + final DestinationSyncMode syncMode = configuredStream.getDestinationSyncMode(); + final String schema = StreamCopierFactory.getSchema(stream.getNamespace(), configuredSchema, nameTransformer); + + final InputStream credentialsInputStream = new ByteArrayInputStream(gcsConfig.getCredentialsJson().getBytes(StandardCharsets.UTF_8)); + final GoogleCredentials credentials = GoogleCredentials.fromStream(credentialsInputStream); + final Storage storageClient = StorageOptions.newBuilder() + .setCredentials(credentials) + .setProjectId(gcsConfig.getProjectId()) + .build() + .getService(); + + return create(stagingFolder, syncMode, schema, stream.getName(), storageClient, db, gcsConfig, nameTransformer, sqlOperations); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + /** + * For specific copier suppliers to implement. + */ + public abstract StreamCopier create(String stagingFolder, + DestinationSyncMode syncMode, + String schema, + String streamName, + Storage storageClient, + JdbcDatabase db, + GcsConfig gcsConfig, + StandardNameTransformer nameTransformer, + SqlOperations sqlOperations) + throws Exception; + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3CopyConfig.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3CopyConfig.java new file mode 100644 index 0000000000000..9f782e0c176c1 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3CopyConfig.java @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc.copy.s3; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.integrations.destination.s3.S3DestinationConfig; + +/** + * S3 copy destinations need an S3DestinationConfig to configure the basic upload behavior. We also + * want additional flags to configure behavior that only applies to the copy-to-S3 + + * load-into-warehouse portion. Currently this is just purgeStagingData, but this may expand. 
+ */ +public record S3CopyConfig(boolean purgeStagingData, S3DestinationConfig s3Config) { + + public static boolean shouldPurgeStagingData(final JsonNode config) { + if (config.get("purge_staging_data") == null) { + return true; + } else { + return config.get("purge_staging_data").asBoolean(); + } + } + + public static S3CopyConfig getS3CopyConfig(final JsonNode config) { + return new S3CopyConfig(S3CopyConfig.shouldPurgeStagingData(config), + S3DestinationConfig.getS3DestinationConfig(config)); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3StreamCopier.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3StreamCopier.java new file mode 100644 index 0000000000000..10d0da5e880ff --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3StreamCopier.java @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc.copy.s3; + +import com.amazonaws.services.s3.AmazonS3; +import com.google.common.annotations.VisibleForTesting; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.jdbc.SqlOperations; +import io.airbyte.integrations.destination.jdbc.copy.StreamCopier; +import io.airbyte.integrations.destination.s3.S3DestinationConfig; +import io.airbyte.integrations.destination.s3.S3FormatConfig; +import io.airbyte.integrations.destination.s3.csv.S3CsvFormatConfig; +import io.airbyte.integrations.destination.s3.csv.S3CsvWriter; +import io.airbyte.integrations.destination.s3.csv.StagingDatabaseCsvSheetGenerator; +import io.airbyte.integrations.destination.s3.util.CompressionType; +import io.airbyte.integrations.destination.s3.writer.DestinationFileWriter; +import io.airbyte.protocol.models.v0.AirbyteRecordMessage; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.v0.DestinationSyncMode; +import java.io.IOException; +import java.sql.SQLException; +import java.sql.Timestamp; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import org.apache.commons.csv.CSVFormat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class S3StreamCopier implements StreamCopier { + + private static final Logger LOGGER = LoggerFactory.getLogger(S3StreamCopier.class); + + private static final int DEFAULT_UPLOAD_THREADS = 10; // The S3 cli uses 10 threads by default. 
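As used by the S3-based copiers, the S3CopyConfig record earlier in this file simply pairs the purge flag with the parsed S3 settings, and purge_staging_data defaults to true when the field is absent. Sketch with placeholder values; `s3Config` stands for an S3DestinationConfig built elsewhere from the same connector config.

```java
JsonNode config = Jsons.deserialize("{ \"purge_staging_data\": false }");
boolean purge = S3CopyConfig.shouldPurgeStagingData(config);     // false here; true if the field were absent
S3CopyConfig copyConfig = new S3CopyConfig(purge, s3Config);     // record components: purgeStagingData(), s3Config()
```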
+ private static final int DEFAULT_QUEUE_CAPACITY = DEFAULT_UPLOAD_THREADS; + + protected final AmazonS3 s3Client; + protected final S3DestinationConfig s3Config; + protected final String tmpTableName; + protected final String schemaName; + protected final String streamName; + protected final JdbcDatabase db; + protected final ConfiguredAirbyteStream configuredAirbyteStream; + protected final String stagingFolder; + protected final Map stagingWritersByFile = new HashMap<>(); + private final DestinationSyncMode destSyncMode; + private final StandardNameTransformer nameTransformer; + private final SqlOperations sqlOperations; + private final Timestamp uploadTime; + protected final Set activeStagingWriterFileNames = new HashSet<>(); + protected final Set stagingFileNames = new LinkedHashSet<>(); + private final boolean purgeStagingData; + + // The number of batches of records that will be inserted into each file. + private final int maxPartsPerFile; + // The number of batches inserted into the current file. + private int partsAddedToCurrentFile; + private String currentFile; + + public S3StreamCopier(final String stagingFolder, + final String schema, + final AmazonS3 client, + final JdbcDatabase db, + final S3CopyConfig config, + final StandardNameTransformer nameTransformer, + final SqlOperations sqlOperations, + final ConfiguredAirbyteStream configuredAirbyteStream, + final Timestamp uploadTime, + final int maxPartsPerFile) { + this.destSyncMode = configuredAirbyteStream.getDestinationSyncMode(); + this.schemaName = schema; + this.streamName = configuredAirbyteStream.getStream().getName(); + this.stagingFolder = stagingFolder; + this.db = db; + this.nameTransformer = nameTransformer; + this.sqlOperations = sqlOperations; + this.configuredAirbyteStream = configuredAirbyteStream; + this.uploadTime = uploadTime; + this.tmpTableName = nameTransformer.getTmpTableName(this.streamName); + this.s3Client = client; + this.s3Config = config.s3Config(); + this.purgeStagingData = config.purgeStagingData(); + + this.maxPartsPerFile = maxPartsPerFile; + this.partsAddedToCurrentFile = 0; + } + + @Override + public String prepareStagingFile() { + if (partsAddedToCurrentFile == 0) { + + try { + // The Flattening value is actually ignored, because we pass an explicit CsvSheetGenerator. So just + // pass in null. 
+ final S3FormatConfig csvFormatConfig = new S3CsvFormatConfig(null, CompressionType.NO_COMPRESSION); + final S3DestinationConfig writerS3Config = S3DestinationConfig.create(s3Config).withFormatConfig(csvFormatConfig).get(); + final S3CsvWriter writer = new S3CsvWriter.Builder( + writerS3Config, + s3Client, + configuredAirbyteStream, + uploadTime) + .uploadThreads(DEFAULT_UPLOAD_THREADS) + .queueCapacity(DEFAULT_QUEUE_CAPACITY) + .csvSettings(CSVFormat.DEFAULT) + .withHeader(false) + .csvSheetGenerator(new StagingDatabaseCsvSheetGenerator()) + .build(); + currentFile = writer.getOutputPath(); + stagingWritersByFile.put(currentFile, writer); + activeStagingWriterFileNames.add(currentFile); + stagingFileNames.add(currentFile); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + partsAddedToCurrentFile = (partsAddedToCurrentFile + 1) % maxPartsPerFile; + return currentFile; + } + + @Override + public void write(final UUID id, final AirbyteRecordMessage recordMessage, final String filename) throws Exception { + if (stagingWritersByFile.containsKey(filename)) { + stagingWritersByFile.get(filename).write(id, recordMessage); + } + } + + @Override + public void closeNonCurrentStagingFileWriters() throws Exception { + final Set removedKeys = new HashSet<>(); + for (final String key : activeStagingWriterFileNames) { + if (!key.equals(currentFile)) { + stagingWritersByFile.get(key).close(false); + stagingWritersByFile.remove(key); + removedKeys.add(key); + } + } + activeStagingWriterFileNames.removeAll(removedKeys); + } + + @Override + public void closeStagingUploader(final boolean hasFailed) throws Exception { + for (final DestinationFileWriter writer : stagingWritersByFile.values()) { + writer.close(hasFailed); + } + } + + @Override + public void createDestinationSchema() throws Exception { + LOGGER.info("Creating schema in destination if it doesn't exist: {}", schemaName); + sqlOperations.createSchemaIfNotExists(db, schemaName); + } + + @Override + public void createTemporaryTable() throws Exception { + LOGGER.info("Preparing tmp table in destination for stream: {}, schema: {}, tmp table name: {}.", streamName, schemaName, tmpTableName); + sqlOperations.createTableIfNotExists(db, schemaName, tmpTableName); + } + + @Override + public void copyStagingFileToTemporaryTable() throws Exception { + LOGGER.info("Starting copy to tmp table: {} in destination for stream: {}, schema: {}, .", tmpTableName, streamName, schemaName); + for (final String fileName : stagingFileNames) { + copyS3CsvFileIntoTable(db, getFullS3Path(s3Config.getBucketName(), fileName), schemaName, tmpTableName, s3Config); + } + LOGGER.info("Copy to tmp table {} in destination for stream {} complete.", tmpTableName, streamName); + } + + @Override + public String createDestinationTable() throws Exception { + final var destTableName = nameTransformer.getRawTableName(streamName); + LOGGER.info("Preparing table {} in destination.", destTableName); + sqlOperations.createTableIfNotExists(db, schemaName, destTableName); + LOGGER.info("Table {} in destination prepared.", tmpTableName); + + return destTableName; + } + + @Override + public String generateMergeStatement(final String destTableName) { + LOGGER.info("Preparing to merge tmp table {} to dest table: {}, schema: {}, in destination.", tmpTableName, destTableName, schemaName); + final var queries = new StringBuilder(); + if (destSyncMode.equals(DestinationSyncMode.OVERWRITE)) { + queries.append(sqlOperations.truncateTableQuery(db, schemaName, destTableName)); + 
LOGGER.info("Destination OVERWRITE mode detected. Dest table: {}, schema: {}, truncated.", destTableName, schemaName); + } + queries.append(sqlOperations.insertTableQuery(db, schemaName, tmpTableName, destTableName)); + return queries.toString(); + } + + @Override + public void removeFileAndDropTmpTable() throws Exception { + if (purgeStagingData) { + for (final String fileName : stagingFileNames) { + s3Client.deleteObject(s3Config.getBucketName(), fileName); + LOGGER.info("S3 staging file {} cleaned.", fileName); + } + } + + LOGGER.info("Begin cleaning {} tmp table in destination.", tmpTableName); + sqlOperations.dropTableIfExists(db, schemaName, tmpTableName); + LOGGER.info("{} tmp table in destination cleaned.", tmpTableName); + } + + @Override + public String getCurrentFile() { + return currentFile; + } + + protected static String getFullS3Path(final String s3BucketName, final String s3StagingFile) { + return String.join("/", "s3:/", s3BucketName, s3StagingFile); + } + + @VisibleForTesting + public String getTmpTableName() { + return tmpTableName; + } + + @VisibleForTesting + public Map getStagingWritersByFile() { + return stagingWritersByFile; + } + + @VisibleForTesting + public Set getStagingFiles() { + return stagingFileNames; + } + + public abstract void copyS3CsvFileIntoTable(JdbcDatabase database, + String s3FileLocation, + String schema, + String tableName, + S3DestinationConfig s3Config) + throws SQLException; + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3StreamCopierFactory.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3StreamCopierFactory.java new file mode 100644 index 0000000000000..0a7c4a90f0a35 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3StreamCopierFactory.java @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc.copy.s3; + +import com.amazonaws.services.s3.AmazonS3; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.jdbc.SqlOperations; +import io.airbyte.integrations.destination.jdbc.copy.StreamCopier; +import io.airbyte.integrations.destination.jdbc.copy.StreamCopierFactory; +import io.airbyte.protocol.models.v0.AirbyteStream; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; + +public abstract class S3StreamCopierFactory implements StreamCopierFactory { + + /** + * Used by the copy consumer. + */ + @Override + public StreamCopier create(final String configuredSchema, + final S3CopyConfig config, + final String stagingFolder, + final ConfiguredAirbyteStream configuredStream, + final StandardNameTransformer nameTransformer, + final JdbcDatabase db, + final SqlOperations sqlOperations) { + try { + final AirbyteStream stream = configuredStream.getStream(); + final String schema = StreamCopierFactory.getSchema(stream.getNamespace(), configuredSchema, nameTransformer); + final AmazonS3 s3Client = config.s3Config().getS3Client(); + + return create(stagingFolder, schema, s3Client, db, config, nameTransformer, sqlOperations, configuredStream); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + /** + * For specific copier suppliers to implement. 
+ */ + protected abstract StreamCopier create(String stagingFolder, + String schema, + AmazonS3 s3Client, + JdbcDatabase db, + S3CopyConfig config, + StandardNameTransformer nameTransformer, + SqlOperations sqlOperations, + ConfiguredAirbyteStream configuredStream) + throws Exception; + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/staging/StagingConsumerFactory.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/staging/StagingConsumerFactory.java new file mode 100644 index 0000000000000..ab5e29850f181 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/staging/StagingConsumerFactory.java @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.staging; + +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.toList; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import io.airbyte.commons.exceptions.ConfigErrorException; +import io.airbyte.commons.json.Jsons; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.base.AirbyteMessageConsumer; +import io.airbyte.integrations.destination.NamingConventionTransformer; +import io.airbyte.integrations.destination.buffered_stream_consumer.BufferedStreamConsumer; +import io.airbyte.integrations.destination.buffered_stream_consumer.OnCloseFunction; +import io.airbyte.integrations.destination.buffered_stream_consumer.OnStartFunction; +import io.airbyte.integrations.destination.jdbc.WriteConfig; +import io.airbyte.integrations.destination.record_buffer.BufferCreateFunction; +import io.airbyte.integrations.destination.record_buffer.FlushBufferFunction; +import io.airbyte.integrations.destination.record_buffer.SerializedBufferingStrategy; +import io.airbyte.protocol.models.v0.AirbyteMessage; +import io.airbyte.protocol.models.v0.AirbyteStream; +import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.v0.DestinationSyncMode; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.function.Consumer; +import java.util.function.Function; +import org.apache.commons.io.FileUtils; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Uses both Factory and Consumer design pattern to create a single point of creation for consuming + * {@link AirbyteMessage} for processing + */ +public class StagingConsumerFactory { + + private static final Logger LOGGER = LoggerFactory.getLogger(StagingConsumerFactory.class); + + // using a random string here as a placeholder for the moment. 
+ // This would avoid mixing data in the staging area between different syncs (especially if they + // manipulate streams with similar names) + // if we replaced the random connection id by the actual connection_id, we'd gain the opportunity to + // leverage data that was uploaded to stage + // in a previous attempt but failed to load to the warehouse for some reason (interrupted?) instead. + // This would also allow other programs/scripts + // to load (or reload backups?) in the connection's staging area to be loaded at the next sync. + private static final DateTime SYNC_DATETIME = DateTime.now(DateTimeZone.UTC); + private final UUID RANDOM_CONNECTION_ID = UUID.randomUUID(); + + public AirbyteMessageConsumer create(final Consumer outputRecordCollector, + final JdbcDatabase database, + final StagingOperations stagingOperations, + final NamingConventionTransformer namingResolver, + final BufferCreateFunction onCreateBuffer, + final JsonNode config, + final ConfiguredAirbyteCatalog catalog, + final boolean purgeStagingData) { + final List writeConfigs = createWriteConfigs(namingResolver, config, catalog); + return new BufferedStreamConsumer( + outputRecordCollector, + onStartFunction(database, stagingOperations, writeConfigs), + new SerializedBufferingStrategy( + onCreateBuffer, + catalog, + flushBufferFunction(database, stagingOperations, writeConfigs, catalog)), + onCloseFunction(database, stagingOperations, writeConfigs, purgeStagingData), + catalog, + stagingOperations::isValidData); + } + + /** + * Creates a list of all {@link WriteConfig} for each stream within a + * {@link ConfiguredAirbyteCatalog}. Each write config represents the configuration settings for + * writing to a destination connector + * + * @param namingResolver {@link NamingConventionTransformer} used to transform names that are + * acceptable by each destination connector + * @param config destination connector configuration parameters + * @param catalog {@link ConfiguredAirbyteCatalog} collection of configured + * {@link ConfiguredAirbyteStream} + * @return list of all write configs for each stream in a {@link ConfiguredAirbyteCatalog} + */ + private static List createWriteConfigs(final NamingConventionTransformer namingResolver, + final JsonNode config, + final ConfiguredAirbyteCatalog catalog) { + + return catalog.getStreams().stream().map(toWriteConfig(namingResolver, config)).collect(toList()); + } + + private static Function toWriteConfig(final NamingConventionTransformer namingResolver, + final JsonNode config) { + return stream -> { + Preconditions.checkNotNull(stream.getDestinationSyncMode(), "Undefined destination sync mode"); + final AirbyteStream abStream = stream.getStream(); + + final String outputSchema = getOutputSchema(abStream, config.get("schema").asText(), namingResolver); + + final String streamName = abStream.getName(); + final String tableName = namingResolver.getRawTableName(streamName); + final String tmpTableName = namingResolver.getTmpTableName(streamName); + final DestinationSyncMode syncMode = stream.getDestinationSyncMode(); + + final WriteConfig writeConfig = + new WriteConfig(streamName, abStream.getNamespace(), outputSchema, tmpTableName, tableName, syncMode, SYNC_DATETIME); + LOGGER.info("Write config: {}", writeConfig); + + return writeConfig; + }; + } + + private static String getOutputSchema(final AirbyteStream stream, + final String defaultDestSchema, + final NamingConventionTransformer namingResolver) { + return stream.getNamespace() != null + ? 
namingResolver.getNamespace(stream.getNamespace()) + : namingResolver.getNamespace(defaultDestSchema); + } + + private OnStartFunction onStartFunction(final JdbcDatabase database, + final StagingOperations stagingOperations, + final List writeConfigs) { + return () -> { + LOGGER.info("Preparing raw tables in destination started for {} streams", writeConfigs.size()); + final List queryList = new ArrayList<>(); + for (final WriteConfig writeConfig : writeConfigs) { + final String schema = writeConfig.getOutputSchemaName(); + final String stream = writeConfig.getStreamName(); + final String dstTableName = writeConfig.getOutputTableName(); + final String stageName = stagingOperations.getStageName(schema, stream); + final String stagingPath = stagingOperations.getStagingPath(RANDOM_CONNECTION_ID, schema, stream, writeConfig.getWriteDatetime()); + + LOGGER.info("Preparing staging area in destination started for schema {} stream {}: target table: {}, stage: {}", + schema, stream, dstTableName, stagingPath); + + stagingOperations.createSchemaIfNotExists(database, schema); + stagingOperations.createTableIfNotExists(database, schema, dstTableName); + stagingOperations.createStageIfNotExists(database, stageName); + + /* + * When we're in OVERWRITE, clear out the table at the start of a sync, this is an expected side + * effect of checkpoint and the removal of temporary tables + */ + switch (writeConfig.getSyncMode()) { + case OVERWRITE -> queryList.add(stagingOperations.truncateTableQuery(database, schema, dstTableName)); + case APPEND, APPEND_DEDUP -> {} + default -> throw new IllegalStateException("Unrecognized sync mode: " + writeConfig.getSyncMode()); + } + + LOGGER.info("Preparing staging area in destination completed for schema {} stream {}", schema, stream); + } + LOGGER.info("Executing finalization of tables."); + stagingOperations.executeTransaction(database, queryList); + }; + } + + private static AirbyteStreamNameNamespacePair toNameNamespacePair(final WriteConfig config) { + return new AirbyteStreamNameNamespacePair(config.getStreamName(), config.getNamespace()); + } + + /** + * Logic handling how destinations with staging areas (aka bucket storages) will flush their buffer + * + * @param database database used for syncing + * @param stagingOperations collection of SQL queries necessary for writing data into a staging area + * @param writeConfigs configuration settings for all destination connectors needed to write + * @param catalog collection of configured streams (e.g. API endpoints or database tables) + * @return + */ + @VisibleForTesting + FlushBufferFunction flushBufferFunction( + final JdbcDatabase database, + final StagingOperations stagingOperations, + final List writeConfigs, + final ConfiguredAirbyteCatalog catalog) { + // TODO: (ryankfu) move this block of code that executes before the lambda to #onStartFunction + final Set conflictingStreams = new HashSet<>(); + final Map pairToWriteConfig = new HashMap<>(); + for (final WriteConfig config : writeConfigs) { + final AirbyteStreamNameNamespacePair streamIdentifier = toNameNamespacePair(config); + if (pairToWriteConfig.containsKey(streamIdentifier)) { + conflictingStreams.add(config); + final WriteConfig existingConfig = pairToWriteConfig.get(streamIdentifier); + // The first conflicting stream won't have any problems, so we need to explicitly add it here. 
+ conflictingStreams.add(existingConfig);
+ } else {
+ pairToWriteConfig.put(streamIdentifier, config);
+ }
+ }
+ if (!conflictingStreams.isEmpty()) {
+ final String message = String.format(
+ "You are trying to write multiple streams to the same table. Consider switching to a custom namespace format using ${SOURCE_NAMESPACE}, or moving one of them into a separate connection with a different stream prefix. Affected streams: %s",
+ conflictingStreams.stream().map(config -> config.getNamespace() + "." + config.getStreamName()).collect(joining(", ")));
+ throw new ConfigErrorException(message);
+ }
+ return (pair, writer) -> {
+ LOGGER.info("Flushing buffer for stream {} ({}) to staging", pair.getName(), FileUtils.byteCountToDisplaySize(writer.getByteCount()));
+ if (!pairToWriteConfig.containsKey(pair)) {
+ throw new IllegalArgumentException(
+ String.format("Message contained record from a stream that was not in the catalog. \ncatalog: %s", Jsons.serialize(catalog)));
+ }
+
+ final WriteConfig writeConfig = pairToWriteConfig.get(pair);
+ final String schemaName = writeConfig.getOutputSchemaName();
+ final String stageName = stagingOperations.getStageName(schemaName, writeConfig.getStreamName());
+ final String stagingPath =
+ stagingOperations.getStagingPath(RANDOM_CONNECTION_ID, schemaName, writeConfig.getStreamName(), writeConfig.getWriteDatetime());
+ try (writer) {
+ writer.flush();
+ final String stagedFile = stagingOperations.uploadRecordsToStage(database, writer, schemaName, stageName, stagingPath);
+ copyIntoTableFromStage(database, stageName, stagingPath, List.of(stagedFile), writeConfig.getOutputTableName(), schemaName,
+ stagingOperations);
+ } catch (final Exception e) {
+ LOGGER.error("Failed to flush and commit buffer data into destination's raw table", e);
+ throw new RuntimeException("Failed to upload buffer to stage and commit to destination", e);
+ }
+ };
+ }
+
+ /**
+ * Handles copying data from the staging area to the destination table, and cleaning up staged
+ * files if the upload was unsuccessful
+ */
+ private void copyIntoTableFromStage(final JdbcDatabase database,
+ final String stageName,
+ final String stagingPath,
+ final List stagedFiles,
+ final String tableName,
+ final String schemaName,
+ final StagingOperations stagingOperations)
+ throws Exception {
+ try {
+ stagingOperations.copyIntoTableFromStage(database, stageName, stagingPath, stagedFiles,
+ tableName, schemaName);
+ } catch (final Exception e) {
+ stagingOperations.cleanUpStage(database, stageName, stagedFiles);
+ LOGGER.info("Cleaning stage path {}", stagingPath);
+ throw new RuntimeException("Failed to upload data from stage " + stagingPath, e);
+ }
+ }
+
+ /**
+ * Tear down process; attempts to clean out any staging area
+ *
+ * @param database database used for syncing
+ * @param stagingOperations collection of SQL queries necessary for writing data into a staging area
+ * @param writeConfigs configuration settings for all destination connectors needed to write
+ * @param purgeStagingData drop staging area if true, keep otherwise
+ * @return
+ */
+ private OnCloseFunction onCloseFunction(final JdbcDatabase database,
+ final StagingOperations stagingOperations,
+ final List writeConfigs,
+ final boolean purgeStagingData) {
+ return (hasFailed) -> {
+ if (!hasFailed) {
+ stagingOperations.onDestinationCloseOperations(database, writeConfigs);
+ LOGGER.info("Finalizing tables in destination completed.");
+ }
+ // After moving data from the staging area to the target table (airbyte_raw), clean up the
staging
+ area (if configured by the user)
+ LOGGER.info("Cleaning up destination started for {} streams", writeConfigs.size());
+ for (final WriteConfig writeConfig : writeConfigs) {
+ final String schemaName = writeConfig.getOutputSchemaName();
+ if (purgeStagingData) {
+ final String stageName = stagingOperations.getStageName(schemaName, writeConfig.getStreamName());
+ LOGGER.info("Cleaning stage in destination started for stream {}. schema {}, stage: {}", writeConfig.getStreamName(), schemaName,
+ stageName);
+ stagingOperations.dropStageIfExists(database, stageName);
+ }
+ }
+ LOGGER.info("Cleaning up destination completed.");
+ };
+ }
+
+} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/staging/StagingOperations.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/staging/StagingOperations.java new file mode 100644 index 0000000000000..4eae42d04e230 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/java/io/airbyte/integrations/destination/staging/StagingOperations.java @@ -0,0 +1,85 @@ +/*
+ * Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.integrations.destination.staging;
+
+import io.airbyte.db.jdbc.JdbcDatabase;
+import io.airbyte.integrations.destination.jdbc.SqlOperations;
+import io.airbyte.integrations.destination.record_buffer.SerializableBuffer;
+import java.util.List;
+import java.util.UUID;
+import org.joda.time.DateTime;
+
+/**
+ * Staging operations focus on the SQL queries needed to successfully move data into a staging
+ * environment such as GCS or S3. In general, "staging" refers to the use of object storage for
+ * efficiently uploading bulk data to destinations.
+ */
+public interface StagingOperations extends SqlOperations {
+
+ /**
+ * Returns the staging environment's name
+ *
+ * @param namespace Name of schema
+ * @param streamName Name of the stream
+ * @return Fully qualified name of the staging environment
+ */
+ String getStageName(String namespace, String streamName);
+
+ String getStagingPath(UUID connectionId, String namespace, String streamName, DateTime writeDatetime);
+
+ /**
+ * Create a staging folder where temporary files are uploaded before loading into the final destination
+ */
+ void createStageIfNotExists(JdbcDatabase database, String stageName) throws Exception;
+
+ /**
+ * Upload the data file into the staging area.
+ *
+ * @param database database used for syncing
+ * @param recordsData records stored in the in-memory buffer
+ * @param schemaName name of schema
+ * @param stageName name of the staging area folder
+ * @param stagingPath path within the staging folder where data files are uploaded
+ * @return the name of the file that was uploaded.
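+ *
+ * The returned file name is what the consumer later hands to {@link #copyIntoTableFromStage}
+ * (see StagingConsumerFactory#flushBufferFunction in this change), so it should be resolvable
+ * within the stage when the copy is executed.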
+ */
+ String uploadRecordsToStage(JdbcDatabase database, SerializableBuffer recordsData, String schemaName, String stageName, String stagingPath)
+ throws Exception;
+
+ /**
+ * Load the data stored in the staging area into a temporary table in the destination
+ *
+ * @param database database interface
+ * @param stageName name of staging area folder
+ * @param stagingPath path to staging files
+ * @param stagedFiles collection of staged files
+ * @param tableName name of table to write staging files to
+ * @param schemaName name of schema
+ */
+ void copyIntoTableFromStage(JdbcDatabase database,
+ String stageName,
+ String stagingPath,
+ List stagedFiles,
+ String tableName,
+ String schemaName)
+ throws Exception;
+
+ /**
+ * Remove files that were just staged
+ *
+ * @param database database used for syncing
+ * @param stageName name of staging area folder
+ * @param stagedFiles collection of the staging files to remove
+ */
+ void cleanUpStage(JdbcDatabase database, String stageName, List stagedFiles) throws Exception;
+
+ /**
+ * Delete the staging area and all staged files that were in it
+ *
+ * @param database database used for syncing
+ * @param stageName Name of the staging area used to store files
+ */
+ void dropStageIfExists(JdbcDatabase database, String stageName) throws Exception;
+
+} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/resources/spec.json b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/resources/spec.json new file mode 100644 index 0000000000000..e3a079c28e119 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/main/resources/spec.json @@ -0,0 +1,39 @@ +{
+ "documentationUrl": "https://docs.airbyte.com/integrations/destinations/postgres",
+ "supportsIncremental": true,
+ "supportsNormalization": false,
+ "supportsDBT": false,
+ "supported_destination_sync_modes": ["overwrite", "append"],
+ "connectionSpecification": {
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "title": "JDBC Destination Spec",
+ "type": "object",
+ "required": ["username", "jdbc_url"],
+ "additionalProperties": true,
+ "properties": {
+ "username": {
+ "description": "The username which is used to access the database.",
+ "title": "Username",
+ "type": "string"
+ },
+ "password": {
+ "description": "The password associated with this username.",
+ "title": "Password",
+ "type": "string",
+ "airbyte_secret": true
+ },
+ "jdbc_url": {
+ "description": "JDBC formatted url. See the standard here.",
+ "title": "JDBC URL",
+ "type": "string"
+ },
+ "schema": {
+ "description": "If you leave the schema unspecified, JDBC defaults to a schema named \"public\".",
+ "type": "string",
+ "examples": ["public"],
+ "default": "public",
+ "title": "Default Schema"
+ }
+ }
+ }
+} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/AbstractJdbcDestinationTest.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/AbstractJdbcDestinationTest.java new file mode 100644 index 0000000000000..413515a26deeb --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/AbstractJdbcDestinationTest.java @@ -0,0 +1,136 @@ +/*
+ * Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ */ + +package io.airbyte.integrations.destination.jdbc; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.ImmutableMap; +import io.airbyte.commons.json.Jsons; +import io.airbyte.db.jdbc.JdbcUtils; +import io.airbyte.integrations.destination.StandardNameTransformer; +import java.util.HashMap; +import java.util.Map; +import org.junit.jupiter.api.Test; + +public class AbstractJdbcDestinationTest { + + private JsonNode buildConfigNoJdbcParameters() { + return Jsons.jsonNode(ImmutableMap.of( + JdbcUtils.HOST_KEY, "localhost", + JdbcUtils.PORT_KEY, 1337, + JdbcUtils.USERNAME_KEY, "user", + JdbcUtils.DATABASE_KEY, "db")); + } + + private JsonNode buildConfigWithExtraJdbcParameters(final String extraParam) { + return Jsons.jsonNode(ImmutableMap.of( + JdbcUtils.HOST_KEY, "localhost", + JdbcUtils.PORT_KEY, 1337, + JdbcUtils.USERNAME_KEY, "user", + JdbcUtils.DATABASE_KEY, "db", + JdbcUtils.JDBC_URL_PARAMS_KEY, extraParam)); + } + + @Test + void testNoExtraParamsNoDefault() { + final Map connectionProperties = new TestJdbcDestination().getConnectionProperties(buildConfigNoJdbcParameters()); + + final Map expectedProperties = ImmutableMap.of(); + assertEquals(expectedProperties, connectionProperties); + } + + @Test + void testNoExtraParamsWithDefault() { + final Map defaultProperties = ImmutableMap.of("A_PARAMETER", "A_VALUE"); + + final Map connectionProperties = new TestJdbcDestination(defaultProperties).getConnectionProperties( + buildConfigNoJdbcParameters()); + + assertEquals(defaultProperties, connectionProperties); + } + + @Test + void testExtraParamNoDefault() { + final String extraParam = "key1=value1&key2=value2&key3=value3"; + final Map connectionProperties = new TestJdbcDestination().getConnectionProperties( + buildConfigWithExtraJdbcParameters(extraParam)); + final Map expectedProperties = ImmutableMap.of( + "key1", "value1", + "key2", "value2", + "key3", "value3"); + assertEquals(expectedProperties, connectionProperties); + } + + @Test + void testExtraParamWithDefault() { + final Map defaultProperties = ImmutableMap.of("A_PARAMETER", "A_VALUE"); + final String extraParam = "key1=value1&key2=value2&key3=value3"; + final Map connectionProperties = new TestJdbcDestination(defaultProperties).getConnectionProperties( + buildConfigWithExtraJdbcParameters(extraParam)); + final Map expectedProperties = ImmutableMap.of( + "A_PARAMETER", "A_VALUE", + "key1", "value1", + "key2", "value2", + "key3", "value3"); + assertEquals(expectedProperties, connectionProperties); + } + + @Test + void testExtraParameterEqualToDefault() { + final Map defaultProperties = ImmutableMap.of("key1", "value1"); + final String extraParam = "key1=value1&key2=value2&key3=value3"; + final Map connectionProperties = new TestJdbcDestination(defaultProperties).getConnectionProperties( + buildConfigWithExtraJdbcParameters(extraParam)); + final Map expectedProperties = ImmutableMap.of( + "key1", "value1", + "key2", "value2", + "key3", "value3"); + assertEquals(expectedProperties, connectionProperties); + } + + @Test + void testExtraParameterDiffersFromDefault() { + final Map defaultProperties = ImmutableMap.of("key1", "value0"); + final String extraParam = "key1=value1&key2=value2&key3=value3"; + + assertThrows(IllegalArgumentException.class, () -> new TestJdbcDestination(defaultProperties).getConnectionProperties( + buildConfigWithExtraJdbcParameters(extraParam))); + } 
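+
+ // Taken together, these cases document the expected behavior of getConnectionProperties:
+ // connector-supplied defaults and user-supplied jdbc_url_params are merged, a duplicate key with
+ // an equal value is accepted, and a key whose value conflicts with its default is rejected with
+ // an IllegalArgumentException.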
+ + @Test + void testInvalidExtraParam() { + final String extraParam = "key1=value1&sdf&"; + assertThrows(IllegalArgumentException.class, + () -> new TestJdbcDestination().getConnectionProperties(buildConfigWithExtraJdbcParameters(extraParam))); + } + + static class TestJdbcDestination extends AbstractJdbcDestination { + + private final Map defaultProperties; + + public TestJdbcDestination() { + this(new HashMap<>()); + } + + public TestJdbcDestination(final Map defaultProperties) { + super("", new StandardNameTransformer(), new TestJdbcSqlOperations()); + this.defaultProperties = defaultProperties; + } + + @Override + protected Map getDefaultConnectionProperties(final JsonNode config) { + return defaultProperties; + } + + @Override + public JsonNode toJdbcConfig(final JsonNode config) { + return config; + } + + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/DataAdapterTest.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/DataAdapterTest.java new file mode 100644 index 0000000000000..808ce78e267ae --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/DataAdapterTest.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc; + +import static org.junit.jupiter.api.Assertions.*; + +import com.fasterxml.jackson.databind.JsonNode; +import io.airbyte.commons.json.Jsons; +import java.util.function.Function; +import org.junit.jupiter.api.Test; + +class DataAdapterTest { + + private final JsonNode testData = Jsons.deserialize("{\"attr1\" : \"CCC\", \"obj1\" : [{\"sub1\" : \"BBB\"}, {\"sub1\" : \"CCC\"}]}"); + private final Function replaceCCCFunction = jsonNode -> { + if (jsonNode.isTextual()) { + String textValue = jsonNode.textValue().replaceAll("CCC", "FFF"); + return Jsons.jsonNode(textValue); + } else + return jsonNode; + }; + + @Test + public void checkSkipAll() { + final JsonNode data = testData.deepCopy(); + final DataAdapter adapter = new DataAdapter(jsonNode -> false, replaceCCCFunction); + adapter.adapt(data); + + assertEquals(testData, data); + } + + @Test + public void checkSkip() { + final JsonNode data = testData.deepCopy(); + final DataAdapter adapter = new DataAdapter(jsonNode -> jsonNode.isTextual() && jsonNode.textValue().contains("BBB"), replaceCCCFunction); + adapter.adapt(data); + + assertEquals(testData, data); + } + + @Test + public void checkAdapt() { + final JsonNode data = testData.deepCopy(); + final DataAdapter adapter = new DataAdapter(jsonNode -> jsonNode.isTextual() && jsonNode.textValue().contains("CCC"), replaceCCCFunction); + adapter.adapt(data); + System.out.println(data); + + assertNotEquals(testData, data); + assert (data.findValues("sub1").stream().anyMatch(jsonNode -> jsonNode.isTextual() && jsonNode.textValue().equals("FFF"))); + assert (data.findValues("attr1").stream().anyMatch(jsonNode -> jsonNode.isTextual() && jsonNode.textValue().equals("FFF"))); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/TestJdbcSqlOperations.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/TestJdbcSqlOperations.java new file mode 100644 index 0000000000000..9a27038cd698a --- /dev/null +++ 
b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/TestJdbcSqlOperations.java @@ -0,0 +1,40 @@ +/*
+ * Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.integrations.destination.jdbc;
+
+import io.airbyte.db.jdbc.JdbcDatabase;
+import io.airbyte.protocol.models.v0.AirbyteRecordMessage;
+import java.sql.SQLException;
+import java.util.List;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+public class TestJdbcSqlOperations extends JdbcSqlOperations {
+
+ @Override
+ public void insertRecordsInternal(final JdbcDatabase database,
+ final List records,
+ final String schemaName,
+ final String tableName)
+ throws Exception {
+ // Not required for testing
+ }
+
+ @Test
+ public void testCreateSchemaIfNotExists() {
+ final JdbcDatabase db = Mockito.mock(JdbcDatabase.class);
+ final var schemaName = "foo";
+ try {
+ Mockito.doThrow(new SQLException("TEST")).when(db).execute(Mockito.anyString());
+ } catch (Exception e) {
+ // Not expected to happen: the mocked `execute` call declares a checked exception, so the compiler requires this catch block
+ assert false;
+ }
+ SQLException exception = Assertions.assertThrows(SQLException.class, () -> createSchemaIfNotExists(db, schemaName));
+ Assertions.assertEquals("TEST", exception.getMessage());
+ }
+
+} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/copy/SwitchingDestinationTest.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/copy/SwitchingDestinationTest.java new file mode 100644 index 0000000000000..cd0b03a451a5a --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/copy/SwitchingDestinationTest.java @@ -0,0 +1,72 @@ +/*
+ * Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+ */ + +package io.airbyte.integrations.destination.jdbc.copy; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.ImmutableMap; +import io.airbyte.integrations.base.Destination; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; +import java.util.Map; +import java.util.function.Consumer; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class SwitchingDestinationTest { + + enum SwitchingEnum { + INSERT, + COPY + } + + private Destination insertDestination; + private Destination copyDestination; + private Map destinationMap; + + @BeforeEach + public void setUp() { + insertDestination = mock(Destination.class); + copyDestination = mock(Destination.class); + destinationMap = ImmutableMap.of( + SwitchingEnum.INSERT, insertDestination, + SwitchingEnum.COPY, copyDestination); + } + + @Test + public void testInsert() throws Exception { + final var switchingDestination = new SwitchingDestination<>(SwitchingEnum.class, c -> SwitchingEnum.INSERT, destinationMap); + + switchingDestination.getConsumer(mock(JsonNode.class), mock(ConfiguredAirbyteCatalog.class), mock(Consumer.class)); + + verify(insertDestination, times(1)).getConsumer(any(), any(), any()); + verify(copyDestination, times(0)).getConsumer(any(), any(), any()); + + switchingDestination.check(mock(JsonNode.class)); + + verify(insertDestination, times(1)).check(any()); + verify(copyDestination, times(0)).check(any()); + } + + @Test + public void testCopy() throws Exception { + final var switchingDestination = new SwitchingDestination<>(SwitchingEnum.class, c -> SwitchingEnum.COPY, destinationMap); + + switchingDestination.getConsumer(mock(JsonNode.class), mock(ConfiguredAirbyteCatalog.class), mock(Consumer.class)); + + verify(insertDestination, times(0)).getConsumer(any(), any(), any()); + verify(copyDestination, times(1)).getConsumer(any(), any(), any()); + + switchingDestination.check(mock(JsonNode.class)); + + verify(insertDestination, times(0)).check(any()); + verify(copyDestination, times(1)).check(any()); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3CopyConfigTest.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3CopyConfigTest.java new file mode 100644 index 0000000000000..6ffbcb9fc050d --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3CopyConfigTest.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.jdbc.copy.s3; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import org.junit.jupiter.api.Test; + +public class S3CopyConfigTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @Test + public void setsDefaultValues() throws IOException { + final boolean purgeStagingData = S3CopyConfig.shouldPurgeStagingData(OBJECT_MAPPER.readTree("{}")); + + assertTrue(purgeStagingData); + } + + @Test + public void parsesPurgeStagingDataCorrectly() throws IOException { + final boolean purgeStagingData = S3CopyConfig.shouldPurgeStagingData(OBJECT_MAPPER.readTree( + """ + { + "purge_staging_data": false + } + """)); + + assertFalse(purgeStagingData); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3StreamCopierTest.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3StreamCopierTest.java new file mode 100644 index 0000000000000..8933da9b0f48f --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/jdbc/copy/s3/S3StreamCopierTest.java @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.jdbc.copy.s3; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockConstruction; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; + +import com.amazonaws.services.s3.AmazonS3Client; +import com.google.common.collect.Lists; +import io.airbyte.db.jdbc.JdbcDatabase; +import io.airbyte.integrations.destination.StandardNameTransformer; +import io.airbyte.integrations.destination.jdbc.SqlOperations; +import io.airbyte.integrations.destination.s3.S3DestinationConfig; +import io.airbyte.integrations.destination.s3.csv.CsvSheetGenerator; +import io.airbyte.integrations.destination.s3.csv.S3CsvFormatConfig; +import io.airbyte.integrations.destination.s3.csv.S3CsvWriter; +import io.airbyte.integrations.destination.s3.csv.StagingDatabaseCsvSheetGenerator; +import io.airbyte.integrations.destination.s3.util.CompressionType; +import io.airbyte.protocol.models.v0.AirbyteStream; +import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; +import io.airbyte.protocol.models.v0.DestinationSyncMode; +import io.airbyte.protocol.models.v0.SyncMode; +import java.sql.Timestamp; +import java.time.Instant; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import org.apache.commons.csv.CSVFormat; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.MockedConstruction; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class S3StreamCopierTest { + + private static final Logger LOGGER = LoggerFactory.getLogger(S3StreamCopierTest.class); + + private static final S3DestinationConfig S3_CONFIG = S3DestinationConfig.create( + "fake-bucket", + "fake-bucketPath", + "fake-region") + 
.withEndpoint("fake-endpoint") + .withAccessKeyCredential("fake-access-key-id", "fake-secret-access-key") + .get(); + private static final ConfiguredAirbyteStream CONFIGURED_STREAM = new ConfiguredAirbyteStream() + .withDestinationSyncMode(DestinationSyncMode.APPEND) + .withStream(new AirbyteStream() + .withName("fake-stream") + .withNamespace("fake-namespace") + .withSupportedSyncModes(Lists.newArrayList(SyncMode.FULL_REFRESH))); + private static final int UPLOAD_THREADS = 10; + private static final int QUEUE_CAPACITY = 10; + // equivalent to Thu, 09 Dec 2021 19:17:54 GMT + private static final Timestamp UPLOAD_TIME = Timestamp.from(Instant.ofEpochMilli(1639077474000L)); + private static final int MAX_PARTS_PER_FILE = 42; + + private AmazonS3Client s3Client; + private JdbcDatabase db; + private SqlOperations sqlOperations; + private S3StreamCopier copier; + + private MockedConstruction csvWriterMockedConstruction; + private List csvWriterConstructorArguments; + + private List copyArguments; + + private record S3CsvWriterArguments(S3DestinationConfig config, + ConfiguredAirbyteStream stream, + Timestamp uploadTime, + int uploadThreads, + int queueCapacity, + boolean writeHeader, + CSVFormat csvSettings, + CsvSheetGenerator csvSheetGenerator) { + + } + + private record CopyArguments(JdbcDatabase database, + String s3FileLocation, + String schema, + String tableName, + S3DestinationConfig s3Config) { + + } + + @BeforeEach + public void setup() { + s3Client = mock(AmazonS3Client.class); + db = mock(JdbcDatabase.class); + sqlOperations = mock(SqlOperations.class); + + csvWriterConstructorArguments = new ArrayList<>(); + copyArguments = new ArrayList<>(); + + // This is basically RETURNS_SELF, except with getMultiPartOutputStreams configured correctly. + // Other non-void methods (e.g. toString()) will return null. + csvWriterMockedConstruction = mockConstruction( + S3CsvWriter.class, + (mock, context) -> { + // Normally, the S3CsvWriter would return a path that ends in a UUID, but this mock will generate an + // int ID to make our asserts easier. + doReturn(String.format("fakeOutputPath-%05d", csvWriterConstructorArguments.size())).when(mock).getOutputPath(); + + // Mockito doesn't seem to provide an easy way to actually retrieve these arguments later on, so + // manually store them on construction. + // _PowerMockito_ does, but I didn't want to set up that additional dependency. 
+ final List arguments = context.arguments(); + csvWriterConstructorArguments.add(new S3CsvWriterArguments( + (S3DestinationConfig) arguments.get(0), + (ConfiguredAirbyteStream) arguments.get(2), + (Timestamp) arguments.get(3), + (int) arguments.get(4), + (int) arguments.get(5), + (boolean) arguments.get(6), + (CSVFormat) arguments.get(7), + (CsvSheetGenerator) arguments.get(8))); + }); + + copier = new S3StreamCopier( + // In reality, this is normally a UUID - see CopyConsumerFactory#createWriteConfigs + "fake-staging-folder", + "fake-schema", + s3Client, + db, + new S3CopyConfig(true, S3_CONFIG), + new StandardNameTransformer(), + sqlOperations, + CONFIGURED_STREAM, + UPLOAD_TIME, + MAX_PARTS_PER_FILE) { + + @Override + public void copyS3CsvFileIntoTable( + final JdbcDatabase database, + final String s3FileLocation, + final String schema, + final String tableName, + final S3DestinationConfig s3Config) { + copyArguments.add(new CopyArguments(database, s3FileLocation, schema, tableName, s3Config)); + } + + }; + } + + @AfterEach + public void teardown() { + csvWriterMockedConstruction.close(); + } + + @Test + public void createSequentialStagingFiles_when_multipleFilesRequested() { + // When we call prepareStagingFile() the first time, it should create exactly one S3CsvWriter. The + // next (MAX_PARTS_PER_FILE - 1) invocations + // should reuse that same writer. + for (var i = 0; i < MAX_PARTS_PER_FILE; i++) { + final String file = copier.prepareStagingFile(); + assertEquals("fakeOutputPath-00000", file, "preparing file number " + i); + assertEquals(1, csvWriterMockedConstruction.constructed().size()); + checkCsvWriterArgs(csvWriterConstructorArguments.get(0)); + } + + // Now that we've hit the MAX_PARTS_PER_FILE, we should start a new writer + final String secondFile = copier.prepareStagingFile(); + assertEquals("fakeOutputPath-00001", secondFile); + final List secondManagers = csvWriterMockedConstruction.constructed(); + assertEquals(2, secondManagers.size()); + checkCsvWriterArgs(csvWriterConstructorArguments.get(1)); + } + + private void checkCsvWriterArgs(final S3CsvWriterArguments args) { + final S3DestinationConfig s3Config = S3DestinationConfig.create(S3_CONFIG) + .withFormatConfig(new S3CsvFormatConfig(null, CompressionType.NO_COMPRESSION)) + .get(); + assertEquals(s3Config, args.config); + assertEquals(CONFIGURED_STREAM, args.stream); + assertEquals(UPLOAD_TIME, args.uploadTime); + assertEquals(UPLOAD_THREADS, args.uploadThreads); + assertEquals(QUEUE_CAPACITY, args.queueCapacity); + assertFalse(args.writeHeader); + assertEquals(CSVFormat.DEFAULT, args.csvSettings); + assertTrue( + args.csvSheetGenerator instanceof StagingDatabaseCsvSheetGenerator, + "Sheet generator was actually a " + args.csvSheetGenerator.getClass()); + } + + @Test + public void closesS3Upload_when_stagingUploaderClosedSuccessfully() throws Exception { + copier.prepareStagingFile(); + + copier.closeStagingUploader(false); + + final List managers = csvWriterMockedConstruction.constructed(); + final S3CsvWriter manager = managers.get(0); + verify(manager).close(false); + } + + @Test + public void closesS3Upload_when_stagingUploaderClosedFailingly() throws Exception { + copier.prepareStagingFile(); + + copier.closeStagingUploader(true); + + final List managers = csvWriterMockedConstruction.constructed(); + final S3CsvWriter manager = managers.get(0); + verify(manager).close(true); + } + + @Test + public void deletesStagingFiles() throws Exception { + copier.prepareStagingFile(); + 
doReturn(true).when(s3Client).doesObjectExist("fake-bucket", "fakeOutputPath-00000"); + + copier.removeFileAndDropTmpTable(); + + verify(s3Client).deleteObject("fake-bucket", "fakeOutputPath-00000"); + } + + @Test + public void doesNotDeleteStagingFiles_if_purgeStagingDataDisabled() throws Exception { + copier = new S3StreamCopier( + "fake-staging-folder", + "fake-schema", + s3Client, + db, + // Explicitly disable purgeStagingData + new S3CopyConfig(false, S3_CONFIG), + new StandardNameTransformer(), + sqlOperations, + CONFIGURED_STREAM, + UPLOAD_TIME, + MAX_PARTS_PER_FILE) { + + @Override + public void copyS3CsvFileIntoTable( + final JdbcDatabase database, + final String s3FileLocation, + final String schema, + final String tableName, + final S3DestinationConfig s3Config) { + copyArguments.add(new CopyArguments(database, s3FileLocation, schema, tableName, s3Config)); + } + + }; + + copier.prepareStagingFile(); + doReturn(true).when(s3Client).doesObjectExist("fake-bucket", "fakeOutputPath-00000"); + + copier.removeFileAndDropTmpTable(); + + verify(s3Client, never()).deleteObject("fake-bucket", "fakeOutputPath-00000"); + } + + @Test + public void copiesCorrectFilesToTable() throws Exception { + // Generate two files + for (int i = 0; i < MAX_PARTS_PER_FILE + 1; i++) { + copier.prepareStagingFile(); + } + + copier.copyStagingFileToTemporaryTable(); + + assertEquals(2, copyArguments.size(), "Number of invocations was actually " + copyArguments.size() + ". Arguments were " + copyArguments); + + // S3StreamCopier operates on these from a HashMap, so need to sort them in order to assert in a + // sane way. + final List sortedArgs = copyArguments.stream().sorted(Comparator.comparing(arg -> arg.s3FileLocation)).toList(); + for (int i = 0; i < sortedArgs.size(); i++) { + LOGGER.info("Checking arguments for index {}", i); + final CopyArguments args = sortedArgs.get(i); + assertEquals(String.format("s3://fake-bucket/fakeOutputPath-%05d", i), args.s3FileLocation); + assertEquals("fake-schema", args.schema); + assertTrue(args.tableName.endsWith("fake_stream"), "Table name was actually " + args.tableName); + } + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/staging/StagingConsumerFactoryTest.java b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/staging/StagingConsumerFactoryTest.java new file mode 100644 index 0000000000000..3bbe304b81703 --- /dev/null +++ b/airbyte-integrations/bases/bases-destination-jdbc-async/src/test/java/io/airbyte/integrations/destination/staging/StagingConsumerFactoryTest.java @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+ */ + +package io.airbyte.integrations.destination.staging; + +import static org.junit.jupiter.api.Assertions.*; + +import io.airbyte.commons.exceptions.ConfigErrorException; +import io.airbyte.integrations.destination.jdbc.WriteConfig; +import java.util.List; +import org.junit.jupiter.api.Test; + +class StagingConsumerFactoryTest { + + @Test() + void detectConflictingStreams() { + final StagingConsumerFactory f = new StagingConsumerFactory(); + + final ConfigErrorException configErrorException = assertThrows( + ConfigErrorException.class, + () -> f.flushBufferFunction( + null, + null, + List.of( + new WriteConfig("example_stream", "source_schema", "destination_default_schema", null, null, null), + new WriteConfig("example_stream", "source_schema", "destination_default_schema", null, null, null)), + null)); + + assertEquals( + "You are trying to write multiple streams to the same table. Consider switching to a custom namespace format using ${SOURCE_NAMESPACE}, or moving one of them into a separate connection with a different stream prefix. Affected streams: source_schema.example_stream, source_schema.example_stream", + configErrorException.getMessage()); + } + +} diff --git a/airbyte-integrations/bases/bases-destination-jdbc/build.gradle b/airbyte-integrations/bases/bases-destination-jdbc/build.gradle index 57c98181a3e96..844a4e86f98ea 100644 --- a/airbyte-integrations/bases/bases-destination-jdbc/build.gradle +++ b/airbyte-integrations/bases/bases-destination-jdbc/build.gradle @@ -9,7 +9,7 @@ dependencies { implementation 'com.google.auth:google-auth-library-oauth2-http:0.25.5' implementation project(':airbyte-db:db-lib') - implementation project(':airbyte-integrations:bases:base-java') + implementation project(':airbyte-integrations:bases:base-java-async') implementation project(':airbyte-integrations:bases:base-java-s3') implementation libs.airbyte.protocol @@ -29,6 +29,6 @@ dependencies { integrationTestJavaImplementation project(':airbyte-integrations:bases:standard-destination-test') integrationTestJavaImplementation libs.connectors.testcontainers.postgresql - implementation files(project(':airbyte-integrations:bases:base-java').airbyteDocker.outputs) + implementation files(project(':airbyte-integrations:bases:base-java-async').airbyteDocker.outputs) integrationTestJavaImplementation files(project(':airbyte-integrations:bases:base-normalization').airbyteDocker.outputs) } diff --git a/airbyte-integrations/connectors/destination-snowflake/Dockerfile b/airbyte-integrations/connectors/destination-snowflake/Dockerfile index b930102bfc6d8..48d82a670d41a 100644 --- a/airbyte-integrations/connectors/destination-snowflake/Dockerfile +++ b/airbyte-integrations/connectors/destination-snowflake/Dockerfile @@ -20,5 +20,5 @@ RUN tar xf ${APPLICATION}.tar --strip-components=1 ENV ENABLE_SENTRY true -LABEL io.airbyte.version=1.0.1 +LABEL io.airbyte.version=1.0.2 LABEL io.airbyte.name=airbyte/destination-snowflake diff --git a/airbyte-integrations/connectors/destination-snowflake/build.gradle b/airbyte-integrations/connectors/destination-snowflake/build.gradle index e442c661adb85..a2830f04e9aa8 100644 --- a/airbyte-integrations/connectors/destination-snowflake/build.gradle +++ b/airbyte-integrations/connectors/destination-snowflake/build.gradle @@ -37,8 +37,8 @@ dependencies { implementation project(':airbyte-config-oss:config-models-oss') implementation project(':airbyte-db:db-lib') - implementation project(':airbyte-integrations:bases:base-java') - implementation 
project(':airbyte-integrations:bases:bases-destination-jdbc') + implementation project(':airbyte-integrations:bases:base-java-async') + implementation project(':airbyte-integrations:bases:bases-destination-jdbc-async') implementation project(':airbyte-integrations:connectors:destination-gcs') implementation project(':airbyte-integrations:bases:base-java-s3') implementation libs.airbyte.protocol diff --git a/airbyte-integrations/connectors/destination-snowflake/metadata.yaml b/airbyte-integrations/connectors/destination-snowflake/metadata.yaml index 3d53abcaaeed1..b009737de18ae 100644 --- a/airbyte-integrations/connectors/destination-snowflake/metadata.yaml +++ b/airbyte-integrations/connectors/destination-snowflake/metadata.yaml @@ -2,7 +2,7 @@ data: connectorSubtype: database connectorType: destination definitionId: 424892c4-daac-4491-b35d-c6688ba547ba - dockerImageTag: 1.0.1 + dockerImageTag: 1.0.2 dockerRepository: airbyte/destination-snowflake githubIssueLabel: destination-snowflake icon: snowflake.svg diff --git a/docs/integrations/destinations/bigquery-denormalized.md b/docs/integrations/destinations/bigquery-denormalized.md index df1f18b7e5522..5651f33ed61af 100644 --- a/docs/integrations/destinations/bigquery-denormalized.md +++ b/docs/integrations/destinations/bigquery-denormalized.md @@ -7,7 +7,7 @@ See [destinations/bigquery](https://docs.airbyte.com/integrations/destinations/b ### bigquery-denormalized | Version | Date | Pull Request | Subject | -| :------ | :--------- | :--------------------------------------------------------- | :----------------------------------------------------------------------------------------------------------------------- | +|:--------|:-----------|:-----------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------| | 1.4.0 | 2023-04-28 | [\#25570](https://github.com/airbytehq/airbyte/pull/25570) | Fix: all integer schemas should be converted to Avro longs | | 1.3.3 | 2023-04-27 | [\#25346](https://github.com/airbytehq/airbyte/pull/25346) | Internal code cleanup | | 1.3.0 | 2023-04-19 | [\#25287](https://github.com/airbytehq/airbyte/pull/25287) | Add parameter to configure the number of file buffers when GCS is used as the loading method | diff --git a/docs/integrations/destinations/bigquery.md b/docs/integrations/destinations/bigquery.md index d66df8c872ba1..c631110d0f11a 100644 --- a/docs/integrations/destinations/bigquery.md +++ b/docs/integrations/destinations/bigquery.md @@ -95,7 +95,7 @@ Airbyte converts any invalid characters into `_` characters when writing data. 
H ## Data type map | Airbyte type | BigQuery type | BigQuery denormalized type | -| :---------------------------------- | :------------ | :------------------------- | +|:------------------------------------|:--------------|:---------------------------| | DATE | DATE | DATE | | STRING (BASE64) | STRING | STRING | | NUMBER | FLOAT | NUMBER | @@ -134,7 +134,7 @@ Now that you have set up the BigQuery destination connector, check out the follo ### bigquery | Version | Date | Pull Request | Subject | -| :------ | :--------- | :--------------------------------------------------------- | :----------------------------------------------------------------------------------------------------------------------- | +|:--------|:-----------|:-----------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------| | 1.4.0 | 2023-04-29 | [\#25570](https://github.com/airbytehq/airbyte/pull/25570) | Internal library update. Bumping version to stay in sync with BigQuery-denormalized. | | 1.3.4 | 2023-04-28 | [\#25588](https://github.com/airbytehq/airbyte/pull/25588) | Internal scaffolding change for future development | | 1.3.3 | 2023-04-27 | [\#25346](https://github.com/airbytehq/airbyte/pull/25346) | Internal code cleanup | diff --git a/docs/integrations/destinations/gcs.md b/docs/integrations/destinations/gcs.md index c7589aae95cc1..686babe3c08e3 100644 --- a/docs/integrations/destinations/gcs.md +++ b/docs/integrations/destinations/gcs.md @@ -10,24 +10,24 @@ The Airbyte GCS destination allows you to sync data to cloud storage buckets. Ea #### Features -| Feature | Support | Notes | -| :--- | :---: | :--- | -| Full Refresh Sync | ✅ | Warning: this mode deletes all previously synced data in the configured bucket path. | -| Incremental - Append Sync | ✅ | | -| Incremental - Deduped History | ❌ | As this connector does not support dbt, we don't support this sync mode on this destination. | -| Namespaces | ❌ | Setting a specific bucket path is equivalent to having separate namespaces. | +| Feature | Support | Notes | +|:------------------------------|:-------:|:---------------------------------------------------------------------------------------------| +| Full Refresh Sync | ✅ | Warning: this mode deletes all previously synced data in the configured bucket path. | +| Incremental - Append Sync | ✅ | | +| Incremental - Deduped History | ❌ | As this connector does not support dbt, we don't support this sync mode on this destination. | +| Namespaces | ❌ | Setting a specific bucket path is equivalent to having separate namespaces. | ## Configuration -| Parameter | Type | Notes | -| :--- | :---: | :--- | -| GCS Bucket Name | string | Name of the bucket to sync data into. | -| GCS Bucket Path | string | Subdirectory under the above bucket to sync the data into. | -| GCS Region | string | See [here](https://cloud.google.com/storage/docs/locations) for all region codes. | -| HMAC Key Access ID | string | HMAC key access ID . The access ID for the GCS bucket. When linked to a service account, this ID is 61 characters long; when linked to a user account, it is 24 characters long. See [HMAC key](https://cloud.google.com/storage/docs/authentication/hmackeys) for details. | -| HMAC Key Secret | string | The corresponding secret for the access ID. It is a 40-character base-64 encoded string. | -| Format | object | Format specific configuration. 
See below [for details](https://docs.airbyte.com/integrations/destinations/gcs#output-schema). | -| Part Size | integer | Arg to configure a block size. Max allowed blocks by GCS = 10,000, i.e. max stream size = blockSize \* 10,000 blocks. | +| Parameter | Type | Notes | +|:-------------------|:-------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| GCS Bucket Name | string | Name of the bucket to sync data into. | +| GCS Bucket Path | string | Subdirectory under the above bucket to sync the data into. | +| GCS Region | string | See [here](https://cloud.google.com/storage/docs/locations) for all region codes. | +| HMAC Key Access ID | string | HMAC key access ID . The access ID for the GCS bucket. When linked to a service account, this ID is 61 characters long; when linked to a user account, it is 24 characters long. See [HMAC key](https://cloud.google.com/storage/docs/authentication/hmackeys) for details. | +| HMAC Key Secret | string | The corresponding secret for the access ID. It is a 40-character base-64 encoded string. | +| Format | object | Format specific configuration. See below [for details](https://docs.airbyte.com/integrations/destinations/gcs#output-schema). | +| Part Size | integer | Arg to configure a block size. Max allowed blocks by GCS = 10,000, i.e. max stream size = blockSize \* 10,000 blocks. | Currently, only the [HMAC key](https://cloud.google.com/storage/docs/authentication/hmackeys) is supported. More credential types will be added in the future, please [submit an issue](https://github.com/airbytehq/airbyte/issues/new?assignees=&labels=type%2Fenhancement%2C+needs-triage&template=feature-request.md&title=) with your request. @@ -108,12 +108,12 @@ Under the hood, an Airbyte data stream in Json schema is first converted to an A Like most of the other Airbyte destination connectors, usually the output has three columns: a UUID, an emission timestamp, and the data blob. With the CSV output, it is possible to normalize \(flatten\) the data blob to multiple columns. -| Column | Condition | Description | -| :--- | :--- | :--- | -| `_airbyte_ab_id` | Always exists | A uuid assigned by Airbyte to each processed record. | -| `_airbyte_emitted_at` | Always exists. | A timestamp representing when the event was pulled from the data source. | -| `_airbyte_data` | When no normalization \(flattening\) is needed, all data reside under this column as a json blob. | | -| root level fields | When root level normalization \(flattening\) is selected, the root level fields are expanded. | | +| Column | Condition | Description | +|:----------------------|:--------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------| +| `_airbyte_ab_id` | Always exists | A uuid assigned by Airbyte to each processed record. | +| `_airbyte_emitted_at` | Always exists. | A timestamp representing when the event was pulled from the data source. | +| `_airbyte_data` | When no normalization \(flattening\) is needed, all data reside under this column as a json blob. | | +| root level fields | When root level normalization \(flattening\) is selected, the root level fields are expanded. 
| | For example, given the following json object from a source: @@ -129,15 +129,15 @@ For example, given the following json object from a source: With no normalization, the output CSV is: -| `_airbyte_ab_id` | `_airbyte_emitted_at` | `_airbyte_data` | -| :--- | :--- | :--- | -| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | `{ "user_id": 123, name: { "first": "John", "last": "Doe" } }` | +| `_airbyte_ab_id` | `_airbyte_emitted_at` | `_airbyte_data` | +|:---------------------------------------|:----------------------|:---------------------------------------------------------------| +| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | `{ "user_id": 123, name: { "first": "John", "last": "Doe" } }` | With root level normalization, the output CSV is: -| `_airbyte_ab_id` | `_airbyte_emitted_at` | `user_id` | `name` | -| :--- | :--- | :--- | :--- | -| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | 123 | `{ "first": "John", "last": "Doe" }` | +| `_airbyte_ab_id` | `_airbyte_emitted_at` | `user_id` | `name` | +|:---------------------------------------|:----------------------|:----------|:-------------------------------------| +| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | 123 | `{ "first": "John", "last": "Doe" }` | Output files can be compressed. The default option is GZIP compression. If compression is selected, the output filename will have an extra extension (GZIP: `.csv.gz`). @@ -189,14 +189,14 @@ Output files can be compressed. The default option is GZIP compression. If compr The following configuration is available to configure the Parquet output: -| Parameter | Type | Default | Description | -| :--- | :---: | :---: | :--- | -| `compression_codec` | enum | `UNCOMPRESSED` | **Compression algorithm**. Available candidates are: `UNCOMPRESSED`, `SNAPPY`, `GZIP`, `LZO`, `BROTLI`, `LZ4`, and `ZSTD`. | -| `block_size_mb` | integer | 128 \(MB\) | **Block size \(row group size\)** in MB. This is the size of a row group being buffered in memory. It limits the memory usage when writing. Larger values will improve the IO when reading, but consume more memory when writing. | -| `max_padding_size_mb` | integer | 8 \(MB\) | **Max padding size** in MB. This is the maximum size allowed as padding to align row groups. This is also the minimum size of a row group. | -| `page_size_kb` | integer | 1024 \(KB\) | **Page size** in KB. The page size is for compression. A block is composed of pages. A page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. | -| `dictionary_page_size_kb` | integer | 1024 \(KB\) | **Dictionary Page Size** in KB. There is one dictionary page per column per row group when dictionary encoding is used. The dictionary page size works like the page size but for dictionary. | -| `dictionary_encoding` | boolean | `true` | **Dictionary encoding**. This parameter controls whether dictionary encoding is turned on. | +| Parameter | Type | Default | Description | +|:--------------------------|:-------:|:--------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `compression_codec` | enum | `UNCOMPRESSED` | **Compression algorithm**. Available candidates are: `UNCOMPRESSED`, `SNAPPY`, `GZIP`, `LZO`, `BROTLI`, `LZ4`, and `ZSTD`. 
| +| `block_size_mb` | integer | 128 \(MB\) | **Block size \(row group size\)** in MB. This is the size of a row group being buffered in memory. It limits the memory usage when writing. Larger values will improve the IO when reading, but consume more memory when writing. | +| `max_padding_size_mb` | integer | 8 \(MB\) | **Max padding size** in MB. This is the maximum size allowed as padding to align row groups. This is also the minimum size of a row group. | +| `page_size_kb` | integer | 1024 \(KB\) | **Page size** in KB. The page size is for compression. A block is composed of pages. A page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. | +| `dictionary_page_size_kb` | integer | 1024 \(KB\) | **Dictionary Page Size** in KB. There is one dictionary page per column per row group when dictionary encoding is used. The dictionary page size works like the page size but for dictionary. | +| `dictionary_encoding` | boolean | `true` | **Dictionary encoding**. This parameter controls whether dictionary encoding is turned on. | These parameters are related to the `ParquetOutputFormat`. See the [Java doc](https://www.javadoc.io/doc/org.apache.parquet/parquet-hadoop/1.12.0/org/apache/parquet/hadoop/ParquetOutputFormat.html) for more details. Also see [Parquet documentation](https://parquet.apache.org/docs/file-format/configurations) for their recommended configurations \(512 - 1024 MB block size, 8 KB page size\). @@ -235,37 +235,38 @@ Under the hood, an Airbyte data stream in Json schema is first converted to an A ## CHANGELOG -| Version | Date | Pull Request | Subject | -|:--------| :--- |:------------------------------------------------------------| :--- | -| 0.3.0 | 2023-04-28 | [#25570](https://github.com/airbytehq/airbyte/pull/25570) | Fix: all integer schemas should be converted to Avro longs | -| 0.2.17 | 2023-04-27 | [#25346](https://github.com/airbytehq/airbyte/pull/25346) | Internal code cleanup | -| 0.2.16 | 2023-03-17 | [#23788](https://github.com/airbytehq/airbyte/pull/23788) | S3-Parquet: added handler to process null values in arrays | -| 0.2.15 | 2023-03-10 | [#23466](https://github.com/airbytehq/airbyte/pull/23466) | Changed S3 Avro type from Int to Long | -| 0.2.14 | 2023-11-23 | [\#21682](https://github.com/airbytehq/airbyte/pull/21682) | Add support for buckets with Customer-Managed Encryption Key | -| 0.2.13 | 2023-01-18 | [#21087](https://github.com/airbytehq/airbyte/pull/21087) | Wrap Authentication Errors as Config Exceptions | -| 0.2.12 | 2022-10-18 | [\#17901](https://github.com/airbytehq/airbyte/pull/17901) | Fix logging to GCS | -| 0.2.11 | 2022-09-01 | [\#16243](https://github.com/airbytehq/airbyte/pull/16243) | Fix Json to Avro conversion when there is field name clash from combined restrictions (`anyOf`, `oneOf`, `allOf` fields) | -| 0.2.10 | 2022-08-05 | [\#14801](https://github.com/airbytehq/airbyte/pull/14801) | Fix multiple log bindings | -| 0.2.9 | 2022-06-24 | [\#14114](https://github.com/airbytehq/airbyte/pull/14114) | Remove "additionalProperties": false from specs for connectors with staging | -| 0.2.8 | 2022-06-17 | [\#13753](https://github.com/airbytehq/airbyte/pull/13753) | Deprecate and remove PART_SIZE_MB fields from connectors based on StreamTransferManager | -| 0.2.7 | 2022-06-14 | [\#13483](https://github.com/airbytehq/airbyte/pull/13483) | Added support for int, long, float data types to Avro/Parquet formats. 
| -| 0.2.6 | 2022-05-17 | [12820](https://github.com/airbytehq/airbyte/pull/12820) | Improved 'check' operation performance | -| 0.2.5 | 2022-05-04 | [\#12578](https://github.com/airbytehq/airbyte/pull/12578) | In JSON to Avro conversion, log JSON field values that do not follow Avro schema for debugging. | -| 0.2.4 | 2022-04-22 | [\#12167](https://github.com/airbytehq/airbyte/pull/12167) | Add gzip compression option for CSV and JSONL formats. | -| 0.2.3 | 2022-04-22 | [\#11795](https://github.com/airbytehq/airbyte/pull/11795) | Fix the connection check to verify the provided bucket path. | -| 0.2.2 | 2022-04-05 | [\#11728](https://github.com/airbytehq/airbyte/pull/11728) | Properly clean-up bucket when running OVERWRITE sync mode | -| 0.2.1 | 2022-04-05 | [\#11499](https://github.com/airbytehq/airbyte/pull/11499) | Updated spec and documentation. | -| 0.2.0 | 2022-04-04 | [\#11686](https://github.com/airbytehq/airbyte/pull/11686) | Use serialized buffering strategy to reduce memory consumption; compress CSV and JSONL formats. | -| 0.1.22 | 2022-02-12 | [\#10256](https://github.com/airbytehq/airbyte/pull/10256) | Add JVM flag to exist on OOME. | -| 0.1.21 | 2022-02-12 | [\#10299](https://github.com/airbytehq/airbyte/pull/10299) | Fix connection check to require only the necessary permissions. | -| 0.1.20 | 2022-01-11 | [\#9367](https://github.com/airbytehq/airbyte/pull/9367) | Avro & Parquet: support array field with unknown item type; default any improperly typed field to string. | -| 0.1.19 | 2022-01-10 | [\#9121](https://github.com/airbytehq/airbyte/pull/9121) | Fixed check method for GCS mode to verify if all roles assigned to user | -| 0.1.18 | 2021-12-30 | [\#8809](https://github.com/airbytehq/airbyte/pull/8809) | Update connector fields title/description | -| 0.1.17 | 2021-12-21 | [\#8574](https://github.com/airbytehq/airbyte/pull/8574) | Added namespace to Avro and Parquet record types | -| 0.1.16 | 2021-12-20 | [\#8974](https://github.com/airbytehq/airbyte/pull/8974) | Release a new version to ensure there is no excessive logging. | -| 0.1.15 | 2021-12-03 | [\#8386](https://github.com/airbytehq/airbyte/pull/8386) | Add new GCP regions | -| 0.1.14 | 2021-12-01 | [\#7732](https://github.com/airbytehq/airbyte/pull/7732) | Support timestamp in Avro and Parquet | -| 0.1.13 | 2021-11-03 | [\#7288](https://github.com/airbytehq/airbyte/issues/7288) | Support Json `additionalProperties`. | -| 0.1.2 | 2021-09-12 | [\#5720](https://github.com/airbytehq/airbyte/issues/5720) | Added configurable block size for stream. Each stream is limited to 10,000 by GCS | -| 0.1.1 | 2021-08-26 | [\#5296](https://github.com/airbytehq/airbyte/issues/5296) | Added storing gcsCsvFileLocation property for CSV format. This is used by destination-bigquery \(GCS Staging upload type\) | -| 0.1.0 | 2021-07-16 | [\#4329](https://github.com/airbytehq/airbyte/pull/4784) | Initial release. 
|
+
+| Version | Date | Pull Request | Subject |
+|:--------|:-----------|:-----------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------|
+| 0.3.0 | 2023-04-28 | [\#25570](https://github.com/airbytehq/airbyte/pull/25570) | Fix: all integer schemas should be converted to Avro longs |
+| 0.2.17 | 2023-04-27 | [\#25346](https://github.com/airbytehq/airbyte/pull/25346) | Internal code cleanup |
+| 0.2.16 | 2023-03-17 | [\#23788](https://github.com/airbytehq/airbyte/pull/23788) | S3-Parquet: added handler to process null values in arrays |
+| 0.2.15 | 2023-03-10 | [\#23466](https://github.com/airbytehq/airbyte/pull/23466) | Changed S3 Avro type from Int to Long |
+| 0.2.14 | 2023-11-23 | [\#21682](https://github.com/airbytehq/airbyte/pull/21682) | Add support for buckets with Customer-Managed Encryption Key |
+| 0.2.13 | 2023-01-18 | [\#21087](https://github.com/airbytehq/airbyte/pull/21087) | Wrap Authentication Errors as Config Exceptions |
+| 0.2.12 | 2022-10-18 | [\#17901](https://github.com/airbytehq/airbyte/pull/17901) | Fix logging to GCS |
+| 0.2.11 | 2022-09-01 | [\#16243](https://github.com/airbytehq/airbyte/pull/16243) | Fix Json to Avro conversion when there is field name clash from combined restrictions (`anyOf`, `oneOf`, `allOf` fields) |
+| 0.2.10 | 2022-08-05 | [\#14801](https://github.com/airbytehq/airbyte/pull/14801) | Fix multiple log bindings |
+| 0.2.9 | 2022-06-24 | [\#14114](https://github.com/airbytehq/airbyte/pull/14114) | Remove "additionalProperties": false from specs for connectors with staging |
+| 0.2.8 | 2022-06-17 | [\#13753](https://github.com/airbytehq/airbyte/pull/13753) | Deprecate and remove PART_SIZE_MB fields from connectors based on StreamTransferManager |
+| 0.2.7 | 2022-06-14 | [\#13483](https://github.com/airbytehq/airbyte/pull/13483) | Added support for int, long, float data types to Avro/Parquet formats. |
+| 0.2.6 | 2022-05-17 | [\#12820](https://github.com/airbytehq/airbyte/pull/12820) | Improved 'check' operation performance |
+| 0.2.5 | 2022-05-04 | [\#12578](https://github.com/airbytehq/airbyte/pull/12578) | In JSON to Avro conversion, log JSON field values that do not follow Avro schema for debugging. |
+| 0.2.4 | 2022-04-22 | [\#12167](https://github.com/airbytehq/airbyte/pull/12167) | Add gzip compression option for CSV and JSONL formats. |
+| 0.2.3 | 2022-04-22 | [\#11795](https://github.com/airbytehq/airbyte/pull/11795) | Fix the connection check to verify the provided bucket path. |
+| 0.2.2 | 2022-04-05 | [\#11728](https://github.com/airbytehq/airbyte/pull/11728) | Properly clean-up bucket when running OVERWRITE sync mode |
+| 0.2.1 | 2022-04-05 | [\#11499](https://github.com/airbytehq/airbyte/pull/11499) | Updated spec and documentation. |
+| 0.2.0 | 2022-04-04 | [\#11686](https://github.com/airbytehq/airbyte/pull/11686) | Use serialized buffering strategy to reduce memory consumption; compress CSV and JSONL formats. |
+| 0.1.22 | 2022-02-12 | [\#10256](https://github.com/airbytehq/airbyte/pull/10256) | Add JVM flag to exit on OOME. |
+| 0.1.21 | 2022-02-12 | [\#10299](https://github.com/airbytehq/airbyte/pull/10299) | Fix connection check to require only the necessary permissions. |
+| 0.1.20 | 2022-01-11 | [\#9367](https://github.com/airbytehq/airbyte/pull/9367) | Avro & Parquet: support array field with unknown item type; default any improperly typed field to string. 
| +| 0.1.19 | 2022-01-10 | [\#9121](https://github.com/airbytehq/airbyte/pull/9121) | Fixed check method for GCS mode to verify if all roles assigned to user | +| 0.1.18 | 2021-12-30 | [\#8809](https://github.com/airbytehq/airbyte/pull/8809) | Update connector fields title/description | +| 0.1.17 | 2021-12-21 | [\#8574](https://github.com/airbytehq/airbyte/pull/8574) | Added namespace to Avro and Parquet record types | +| 0.1.16 | 2021-12-20 | [\#8974](https://github.com/airbytehq/airbyte/pull/8974) | Release a new version to ensure there is no excessive logging. | +| 0.1.15 | 2021-12-03 | [\#8386](https://github.com/airbytehq/airbyte/pull/8386) | Add new GCP regions | +| 0.1.14 | 2021-12-01 | [\#7732](https://github.com/airbytehq/airbyte/pull/7732) | Support timestamp in Avro and Parquet | +| 0.1.13 | 2021-11-03 | [\#7288](https://github.com/airbytehq/airbyte/issues/7288) | Support Json `additionalProperties`. | +| 0.1.2 | 2021-09-12 | [\#5720](https://github.com/airbytehq/airbyte/issues/5720) | Added configurable block size for stream. Each stream is limited to 10,000 by GCS | +| 0.1.1 | 2021-08-26 | [\#5296](https://github.com/airbytehq/airbyte/issues/5296) | Added storing gcsCsvFileLocation property for CSV format. This is used by destination-bigquery \(GCS Staging upload type\) | +| 0.1.0 | 2021-07-16 | [\#4329](https://github.com/airbytehq/airbyte/pull/4784) | Initial release. | diff --git a/docs/integrations/destinations/redshift.md b/docs/integrations/destinations/redshift.md index 6f8e0fcd24668..c9ad01ee85317 100644 --- a/docs/integrations/destinations/redshift.md +++ b/docs/integrations/destinations/redshift.md @@ -164,7 +164,7 @@ Each stream will be output into its own raw table in Redshift. Each table will c | 0.3.55 | 2023-01-26 | [\#20631](https://github.com/airbytehq/airbyte/pull/20631) | Added support for destination checkpointing with staging | | 0.3.54 | 2023-01-18 | [\#21087](https://github.com/airbytehq/airbyte/pull/21087) | Wrap Authentication Errors as Config Exceptions | | 0.3.53 | 2023-01-03 | [\#17273](https://github.com/airbytehq/airbyte/pull/17273) | Flatten JSON arrays to fix maximum size check for SUPER field | -| 0.3.52 | 2022-12-30 | [\#20879](https://github.com/airbytehq/airbyte/pull/20879) | Added configurable parameter for number of file buffers (⛔ this version has a bug and will not work; use `0.3.56` instead) | +| 0.3.52 | 2022-12-30 | [\#20879](https://github.com/airbytehq/airbyte/pull/20879) | Added configurable parameter for number of file buffers (⛔ this version has a bug and will not work; use `0.3.56` instead) | | 0.3.51 | 2022-10-26 | [\#18434](https://github.com/airbytehq/airbyte/pull/18434) | Fix empty S3 bucket path handling | | 0.3.50 | 2022-09-14 | [\#15668](https://github.com/airbytehq/airbyte/pull/15668) | Wrap logs in AirbyteLogMessage | | 0.3.49 | 2022-09-01 | [\#16243](https://github.com/airbytehq/airbyte/pull/16243) | Fix Json to Avro conversion when there is field name clash from combined restrictions (`anyOf`, `oneOf`, `allOf` fields) | diff --git a/docs/integrations/destinations/s3.md b/docs/integrations/destinations/s3.md index 7279a420f4f45..1a29de6a72872 100644 --- a/docs/integrations/destinations/s3.md +++ b/docs/integrations/destinations/s3.md @@ -166,12 +166,12 @@ A data sync may create multiple files as the output files can be partitioned by ## Supported sync modes -| Feature | Support | Notes | -| :--- | :---: | :--- | -| Full Refresh Sync | ✅ | Warning: this mode deletes all previously synced data in the configured bucket 
path. | -| Incremental - Append Sync | ✅ | | -| Incremental - Deduped History | ❌ | As this connector does not support dbt, we don't support this sync mode on this destination. | -| Namespaces | ❌ | Setting a specific bucket path is equivalent to having separate namespaces. | +| Feature | Support | Notes | +|:------------------------------|:-------:|:---------------------------------------------------------------------------------------------| +| Full Refresh Sync | ✅ | Warning: this mode deletes all previously synced data in the configured bucket path. | +| Incremental - Append Sync | ✅ | | +| Incremental - Deduped History | ❌ | As this connector does not support dbt, we don't support this sync mode on this destination. | +| Namespaces | ❌ | Setting a specific bucket path is equivalent to having separate namespaces. | The Airbyte S3 destination allows you to sync data to AWS S3 or Minio S3. Each stream is written to its own directory under the bucket. ⚠️ Please note that under "Full Refresh Sync" mode, data in the configured bucket and path will be wiped out before each sync. We recommend you to provision a dedicated S3 resource for this sync to prevent unexpected data deletion from misconfiguration. ⚠️ @@ -222,12 +222,12 @@ Under the hood, an Airbyte data stream in JSON schema is first converted to an A Like most of the other Airbyte destination connectors, usually the output has three columns: a UUID, an emission timestamp, and the data blob. With the CSV output, it is possible to normalize \(flatten\) the data blob to multiple columns. -| Column | Condition | Description | -| :--- | :--- | :--- | -| `_airbyte_ab_id` | Always exists | A uuid assigned by Airbyte to each processed record. | -| `_airbyte_emitted_at` | Always exists. | A timestamp representing when the event was pulled from the data source. | -| `_airbyte_data` | When no normalization \(flattening\) is needed, all data reside under this column as a json blob. | | -| root level fields | When root level normalization \(flattening\) is selected, the root level fields are expanded. | | +| Column | Condition | Description | +|:----------------------|:--------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------| +| `_airbyte_ab_id` | Always exists | A uuid assigned by Airbyte to each processed record. | +| `_airbyte_emitted_at` | Always exists. | A timestamp representing when the event was pulled from the data source. | +| `_airbyte_data` | When no normalization \(flattening\) is needed, all data reside under this column as a json blob. | | +| root level fields | When root level normalization \(flattening\) is selected, the root level fields are expanded. 
| | For example, given the following json object from a source: @@ -243,15 +243,15 @@ For example, given the following json object from a source: With no normalization, the output CSV is: -| `_airbyte_ab_id` | `_airbyte_emitted_at` | `_airbyte_data` | -| :--- | :--- | :--- | -| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | `{ "user_id": 123, name: { "first": "John", "last": "Doe" } }` | +| `_airbyte_ab_id` | `_airbyte_emitted_at` | `_airbyte_data` | +|:---------------------------------------|:----------------------|:---------------------------------------------------------------| +| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | `{ "user_id": 123, name: { "first": "John", "last": "Doe" } }` | With root level normalization, the output CSV is: -| `_airbyte_ab_id` | `_airbyte_emitted_at` | `user_id` | `name` | -| :--- | :--- | :--- | :--- | -| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | 123 | `{ "first": "John", "last": "Doe" }` | +| `_airbyte_ab_id` | `_airbyte_emitted_at` | `user_id` | `name` | +|:---------------------------------------|:----------------------|:----------|:-------------------------------------| +| `26d73cde-7eb1-4e1e-b7db-a4c03b4cf206` | 1622135805000 | 123 | `{ "first": "John", "last": "Doe" }` | Output files can be compressed. The default option is GZIP compression. If compression is selected, the output filename will have an extra extension (GZIP: `.csv.gz`). @@ -303,14 +303,14 @@ Output files can be compressed. The default option is GZIP compression. If compr The following configuration is available to configure the Parquet output: -| Parameter | Type | Default | Description | -| :--- | :---: | :---: | :--- | -| `compression_codec` | enum | `UNCOMPRESSED` | **Compression algorithm**. Available candidates are: `UNCOMPRESSED`, `SNAPPY`, `GZIP`, `LZO`, `BROTLI`, `LZ4`, and `ZSTD`. | -| `block_size_mb` | integer | 128 \(MB\) | **Block size \(row group size\)** in MB. This is the size of a row group being buffered in memory. It limits the memory usage when writing. Larger values will improve the IO when reading, but consume more memory when writing. | -| `max_padding_size_mb` | integer | 8 \(MB\) | **Max padding size** in MB. This is the maximum size allowed as padding to align row groups. This is also the minimum size of a row group. | -| `page_size_kb` | integer | 1024 \(KB\) | **Page size** in KB. The page size is for compression. A block is composed of pages. A page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. | -| `dictionary_page_size_kb` | integer | 1024 \(KB\) | **Dictionary Page Size** in KB. There is one dictionary page per column per row group when dictionary encoding is used. The dictionary page size works like the page size but for dictionary. | -| `dictionary_encoding` | boolean | `true` | **Dictionary encoding**. This parameter controls whether dictionary encoding is turned on. | +| Parameter | Type | Default | Description | +|:--------------------------|:-------:|:--------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `compression_codec` | enum | `UNCOMPRESSED` | **Compression algorithm**. Available candidates are: `UNCOMPRESSED`, `SNAPPY`, `GZIP`, `LZO`, `BROTLI`, `LZ4`, and `ZSTD`. 
| +| `block_size_mb` | integer | 128 \(MB\) | **Block size \(row group size\)** in MB. This is the size of a row group being buffered in memory. It limits the memory usage when writing. Larger values will improve the IO when reading, but consume more memory when writing. | +| `max_padding_size_mb` | integer | 8 \(MB\) | **Max padding size** in MB. This is the maximum size allowed as padding to align row groups. This is also the minimum size of a row group. | +| `page_size_kb` | integer | 1024 \(KB\) | **Page size** in KB. The page size is for compression. A block is composed of pages. A page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. | +| `dictionary_page_size_kb` | integer | 1024 \(KB\) | **Dictionary Page Size** in KB. There is one dictionary page per column per row group when dictionary encoding is used. The dictionary page size works like the page size but for dictionary. | +| `dictionary_encoding` | boolean | `true` | **Dictionary encoding**. This parameter controls whether dictionary encoding is turned on. | These parameters are related to the `ParquetOutputFormat`. See the [Java doc](https://www.javadoc.io/doc/org.apache.parquet/parquet-hadoop/1.12.0/org/apache/parquet/hadoop/ParquetOutputFormat.html) for more details. Also see [Parquet documentation](https://parquet.apache.org/docs/file-format/configurations/) for their recommended configurations \(512 - 1024 MB block size, 8 KB page size\). @@ -339,13 +339,13 @@ In order for everything to work correctly, it is also necessary that the user wh | Version | Date | Pull Request | Subject | |:--------|:-----------|:-----------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------| -| 0.4.0 | 2023-04-28 | [#25570](https://github.com/airbytehq/airbyte/pull/25570) | Fix: all integer schemas should be converted to Avro longs | -| 0.3.25 | 2023-04-27 | [#25346](https://github.com/airbytehq/airbyte/pull/25346) | Internal code cleanup | -| 0.3.23 | 2023-03-30 | [#24736](https://github.com/airbytehq/airbyte/pull/24736) | Improve behavior when throttled by AWS API | -| 0.3.22 | 2023-03-17 | [#23788](https://github.com/airbytehq/airbyte/pull/23788) | S3-Parquet: added handler to process null values in arrays | -| 0.3.21 | 2023-03-10 | [#23466](https://github.com/airbytehq/airbyte/pull/23466) | Changed S3 Avro type from Int to Long | -| 0.3.20 | 2023-02-23 | [#21355](https://github.com/airbytehq/airbyte/pull/21355) | Add root level flattening option to JSONL output. 
| -| 0.3.19 | 2023-01-18 | [#21087](https://github.com/airbytehq/airbyte/pull/21087) | Wrap Authentication Errors as Config Exceptions | +| 0.4.0 | 2023-04-28 | [\#25570](https://github.com/airbytehq/airbyte/pull/25570) | Fix: all integer schemas should be converted to Avro longs | +| 0.3.25 | 2023-04-27 | [\#25346](https://github.com/airbytehq/airbyte/pull/25346) | Internal code cleanup | +| 0.3.23 | 2023-03-30 | [\#24736](https://github.com/airbytehq/airbyte/pull/24736) | Improve behavior when throttled by AWS API | +| 0.3.22 | 2023-03-17 | [\#23788](https://github.com/airbytehq/airbyte/pull/23788) | S3-Parquet: added handler to process null values in arrays | +| 0.3.21 | 2023-03-10 | [\#23466](https://github.com/airbytehq/airbyte/pull/23466) | Changed S3 Avro type from Int to Long | +| 0.3.20 | 2023-02-23 | [\#21355](https://github.com/airbytehq/airbyte/pull/21355) | Add root level flattening option to JSONL output. | +| 0.3.19 | 2023-01-18 | [\#21087](https://github.com/airbytehq/airbyte/pull/21087) | Wrap Authentication Errors as Config Exceptions | | 0.3.18 | 2022-12-15 | [\#20088](https://github.com/airbytehq/airbyte/pull/20088) | New data type support v0/v1 | | 0.3.17 | 2022-10-15 | [\#18031](https://github.com/airbytehq/airbyte/pull/18031) | Fix integration tests to use bucket path | | 0.3.16 | 2022-10-03 | [\#17340](https://github.com/airbytehq/airbyte/pull/17340) | Enforced encrypted only traffic to S3 buckets and check logic | @@ -370,7 +370,7 @@ In order for everything to work correctly, it is also necessary that the user wh | 0.2.11 | 2022-03-23 | [\#11173](https://github.com/airbytehq/airbyte/pull/11173) | Added support for AWS Glue crawler | | 0.2.10 | 2022-03-07 | [\#10856](https://github.com/airbytehq/airbyte/pull/10856) | `check` method now tests for listObjects permissions on the target bucket | | 0.2.7 | 2022-02-14 | [\#10318](https://github.com/airbytehq/airbyte/pull/10318) | Prevented double slashes in S3 destination path | -| 0.2.6 | 2022-02-14 | [10256](https://github.com/airbytehq/airbyte/pull/10256) | Add `-XX:+ExitOnOutOfMemoryError` JVM option | +| 0.2.6 | 2022-02-14 | [\#10256](https://github.com/airbytehq/airbyte/pull/10256) | Add `-XX:+ExitOnOutOfMemoryError` JVM option | | 0.2.5 | 2022-01-13 | [\#9399](https://github.com/airbytehq/airbyte/pull/9399) | Use instance profile authentication if credentials are not provided | | 0.2.4 | 2022-01-12 | [\#9415](https://github.com/airbytehq/airbyte/pull/9415) | BigQuery Destination : Fix GCS processing of Facebook data | | 0.2.3 | 2022-01-11 | [\#9367](https://github.com/airbytehq/airbyte/pull/9367) | Avro & Parquet: support array field with unknown item type; default any improperly typed field to string. 
| diff --git a/docs/integrations/destinations/snowflake.md b/docs/integrations/destinations/snowflake.md index 6d135be05fb9f..62a6335b66ba1 100644 --- a/docs/integrations/destinations/snowflake.md +++ b/docs/integrations/destinations/snowflake.md @@ -273,6 +273,7 @@ Otherwise, make sure to grant the role the required permissions in the desired n | Version | Date | Pull Request | Subject | |:----------------|:-----------|:-----------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------| +| 1.0.2 | 2023-05-05 | [\#25649](https://github.com/airbytehq/airbyte/pull/25649) | Splits base class dependencies | | 1.0.1 | 2023-04-29 | [\#25570](https://github.com/airbytehq/airbyte/pull/25570) | Internal library update | | 1.0.0 | 2023-05-02 | [\#25739](https://github.com/airbytehq/airbyte/pull/25739) | Removed Azure Blob Storage as a loading method | | 0.4.63 | 2023-04-27 | [\#25346](https://github.com/airbytehq/airbyte/pull/25346) | Added FlushBufferFunction interface | diff --git a/settings.gradle b/settings.gradle index 543a9bd03cedd..7dad7715ab4c4 100644 --- a/settings.gradle +++ b/settings.gradle @@ -103,9 +103,11 @@ if (!System.getenv().containsKey("SUB_BUILD") || System.getenv().get("SUB_BUILD" include ':airbyte-cdk:python' include ':airbyte-integrations:bases:base' include ':airbyte-integrations:bases:base-java' + include ':airbyte-integrations:bases:base-java-async' include ':airbyte-integrations:bases:base-java-s3' include ':airbyte-integrations:bases:base-normalization' include ':airbyte-integrations:bases:bases-destination-jdbc' // needs to be lexicographically after base-java and base-normalization to avoid race condition + include ':airbyte-integrations:bases:bases-destination-jdbc-async' include ':airbyte-integrations:bases:base-standard-source-test-file' include ':airbyte-integrations:bases:connector-acceptance-test' include ':airbyte-integrations:bases:standard-destination-test'