Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -284,9 +284,13 @@ def get_openlineage_facets_on_complete(self, task_instance) -> OperatorLineage:
"""
from airflow.providers.openlineage.extractors import OperatorLineage

if isinstance(self.run_id, int) and self.wait_for_termination is True:
return generate_openlineage_events_from_dbt_cloud_run(operator=self, task_instance=task_instance)
return OperatorLineage()
if not isinstance(self.run_id, int):
self.log.info("Skipping OpenLineage event extraction: `self.run_id` is not set.")
return OperatorLineage()
if not self.wait_for_termination:
self.log.info("Skipping OpenLineage event extraction: `self.wait_for_termination` is False.")
return OperatorLineage()
return generate_openlineage_events_from_dbt_cloud_run(operator=self, task_instance=task_instance)


class DbtCloudGetJobRunArtifactOperator(BaseOperator):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,39 +17,41 @@
from __future__ import annotations

import asyncio
import logging
import re
from contextlib import suppress
from typing import TYPE_CHECKING

from packaging.version import parse

from airflow import __version__ as airflow_version
from airflow.providers.dbt.cloud.version_compat import AIRFLOW_V_2_10_PLUS, AIRFLOW_V_3_0_PLUS

if TYPE_CHECKING:
from packaging.version import Version

from airflow.models.taskinstance import TaskInstance
from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator
from airflow.providers.dbt.cloud.sensors.dbt import DbtCloudJobRunSensor
from airflow.providers.openlineage.extractors.base import OperatorLineage


_AIRFLOW_VERSION: Version = parse(parse(airflow_version).base_version)
log = logging.getLogger(__name__)


def _get_logical_date(task_instance):
    """Return the logical date passed to the OpenLineage run-id builder.

    On Airflow 3+ the value is read from the ``dag_run`` entry of the task
    instance's template context; when that run's ``logical_date`` is falsy,
    its ``run_after`` is returned instead. On older versions the value is read
    from the task instance itself, preferring ``logical_date`` and falling
    back to ``execution_date`` when the attribute does not exist.

    :param task_instance: task instance whose logical date is needed.
    :return: the resolved date value.
    """
    # todo: remove when min airflow version >= 3.0
    if AIRFLOW_V_3_0_PLUS:
        dagrun = task_instance.get_template_context()["dag_run"]
        return dagrun.logical_date or dagrun.run_after

    # Pre-3.0: not every task instance object exposes ``logical_date``.
    if hasattr(task_instance, "logical_date"):
        return task_instance.logical_date
    return task_instance.execution_date


def _get_try_number(val):
    """Return the try number of ``val`` adjusted for the running Airflow version.

    On Airflow 2.10+ ``val.try_number`` is returned unchanged; on older
    versions it is decremented by one.
    NOTE(review): presumably older versions report an already-incremented
    try number at this point — confirm against the Airflow 2.10 release notes.

    :param val: object exposing a ``try_number`` attribute (the caller passes
        a task instance).
    :return: the version-adjusted try number.
    """
    # todo: remove when min airflow version >= 2.10.0
    if AIRFLOW_V_2_10_PLUS:
        return val.try_number
    return val.try_number - 1


def generate_openlineage_events_from_dbt_cloud_run(
Expand Down Expand Up @@ -87,6 +89,7 @@ def generate_openlineage_events_from_dbt_cloud_run(
from airflow.providers.openlineage.plugins.listener import get_openlineage_listener

# if no account_id set this will fallback
log.debug("Retrieving information about DBT job run.")
job_run = operator.hook.get_job_run(
run_id=operator.run_id, account_id=operator.account_id, include_related=["run_steps,job"]
).json()["data"]
Expand All @@ -98,6 +101,7 @@ def generate_openlineage_events_from_dbt_cloud_run(
execute_steps = job["execute_steps"]
run_steps = job_run["run_steps"]

log.debug("Filtering only DBT invocation steps for further processing.")
# filter only dbt invocation steps
steps = []
for run_step in run_steps:
Expand All @@ -110,8 +114,15 @@ def generate_openlineage_events_from_dbt_cloud_run(

# catalog is available only if docs are generated
catalog = None
with suppress(Exception):
try:
log.debug("Retrieving information about catalog artifact from DBT.")
catalog = operator.hook.get_job_run_artifact(operator.run_id, path="catalog.json").json()["data"]
except Exception: # type: ignore
log.info(
"Openlineage could not find DBT catalog artifact, usually available when docs are generated."
"Proceeding with metadata extraction. "
"If you see error logs above about `HTTP error: Not Found` it's safe to ignore them."
)

async def get_artifacts_for_steps(steps, artifacts):
"""Get artifacts for a list of steps concurrently."""
Expand All @@ -127,16 +138,37 @@ async def get_artifacts_for_steps(steps, artifacts):
return await asyncio.gather(*tasks)

# get artifacts for steps concurrently
log.debug("Retrieving information about artifacts for all job steps from DBT.")
step_artifacts = asyncio.run(
get_artifacts_for_steps(steps=steps, artifacts=["manifest.json", "run_results.json"])
)

log.debug("Preparing OpenLineage parent job information to be included in DBT events.")
# generate same run id of current task instance
parent_run_id = OpenLineageAdapter.build_task_instance_run_id(
dag_id=task_instance.dag_id,
task_id=operator.task_id,
logical_date=_get_logical_date(task_instance),
try_number=_get_try_number(task_instance),
map_index=task_instance.map_index,
)

parent_job = ParentRunMetadata(
run_id=parent_run_id,
job_name=f"{task_instance.dag_id}.{task_instance.task_id}",
job_namespace=namespace(),
)
client = get_openlineage_listener().adapter.get_or_create_openlineage_client()

# process each step in loop, sending generated events in the same order as steps
for artifacts in step_artifacts:
for counter, artifacts in enumerate(step_artifacts, 1):
log.debug("Parsing information about artifact no. %s.", counter)

# process manifest
manifest = artifacts["manifest.json"]

if not artifacts.get("run_results.json", None):
log.debug("No run results found for artifact no. %s. Skipping.", counter)
continue

processor = DbtCloudArtifactProcessor(
Expand All @@ -150,26 +182,14 @@ async def get_artifacts_for_steps(steps, artifacts):
catalog=catalog,
)

# generate same run id of current task instance
parent_run_id = OpenLineageAdapter.build_task_instance_run_id(
dag_id=task_instance.dag_id,
task_id=operator.task_id,
logical_date=_get_logical_date(task_instance),
try_number=_get_try_number(task_instance),
map_index=task_instance.map_index,
)

parent_job = ParentRunMetadata(
run_id=parent_run_id,
job_name=f"{task_instance.dag_id}.{task_instance.task_id}",
job_namespace=namespace(),
)
processor.dbt_run_metadata = parent_job

events = processor.parse().events()

client = get_openlineage_listener().adapter.get_or_create_openlineage_client()
log.debug("Found %s OpenLineage events for artifact no. %s.", len(events), counter)

for event in events:
client.emit(event=event)
log.debug("Emitted all OpenLineage events for artifact no. %s.", counter)

log.info("OpenLineage has successfully finished processing information about DBT job run.")
return OperatorLineage()
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# NOTE! THIS FILE IS COPIED MANUALLY IN OTHER PROVIDERS DELIBERATELY TO AVOID ADDING UNNECESSARY
# DEPENDENCIES BETWEEN PROVIDERS. IF YOU WANT TO ADD CONDITIONAL CODE IN YOUR PROVIDER THAT DEPENDS
# ON AIRFLOW VERSION, PLEASE COPY THIS FILE TO THE ROOT PACKAGE OF YOUR PROVIDER AND IMPORT
# THOSE CONSTANTS FROM IT RATHER THAN IMPORTING THEM FROM ANOTHER PROVIDER OR TEST CODE
#
from __future__ import annotations


def get_base_airflow_version_tuple() -> tuple[int, int, int]:
    """Return the installed Airflow version as a ``(major, minor, micro)`` tuple.

    Both imports are performed inside the function, so they run when the
    function is called rather than when this module is first loaded.
    """
    from packaging.version import Version

    from airflow import __version__

    parsed = Version(__version__)
    major, minor, micro = parsed.major, parsed.minor, parsed.micro
    return major, minor, micro


# Resolve the installed Airflow version once and derive every flag from it.
_BASE_AIRFLOW_VERSION = get_base_airflow_version_tuple()

# True when the installed Airflow is 2.10.0 or newer.
AIRFLOW_V_2_10_PLUS = _BASE_AIRFLOW_VERSION >= (2, 10, 0)
# True when the installed Airflow is 3.0.0 or newer.
AIRFLOW_V_3_0_PLUS = _BASE_AIRFLOW_VERSION >= (3, 0, 0)
1 change: 1 addition & 0 deletions tests/always/test_project_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def test_providers_modules_should_have_tests(self):
"providers/common/messaging/tests/unit/common/messaging/providers/test_base_provider.py",
"providers/common/messaging/tests/unit/common/messaging/providers/test_sqs.py",
"providers/databricks/tests/unit/databricks/test_version_compat.py",
"providers/dbt/cloud/tests/unit/dbt/cloud/test_version_compat.py",
"providers/edge/tests/unit/edge/models/test_edge_job.py",
"providers/edge/tests/unit/edge/models/test_edge_logs.py",
"providers/edge/tests/unit/edge/models/test_edge_worker.py",
Expand Down