#!/usr/bin/env python3
"""
This script publishes all user-defined functions in a directory as persistent
UDFs in the {dirname} dataset.
The {dirname}_ prefix will be stripped from names of published UDFs.
"""
from argparse import ArgumentParser
import os
import re
import sys

from google.cloud import bigquery
from google.cloud import storage

# sys.path needs to be modified to enable package imports from parent
# and sibling directories. Also see:
# https://stackoverflow.com/questions/6323860/sibling-package-imports/23542795#23542795
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from bigquery_etl.parse_udf import read_udf_dirs, accumulate_dependencies  # noqa: E402
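
# read_udf_dirs parses the raw UDF definitions in a directory tree, and
# accumulate_dependencies resolves the other UDFs that a given UDF depends on
# (as used in main below).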

# Matches references to temporary UDFs, e.g. udf_foo, udf_js_foo, or
# udf_legacy_foo; the captured group is the name without the prefix.
UDF_RE = re.compile(r"udf_(?:js_|legacy_)?([a-zA-Z0-9_]+)")
# Matches the library option of a CREATE FUNCTION statement pointing at GCS,
# capturing the file name of the JavaScript dependency.
OPTIONS_LIB_RE = re.compile(r'library = "gs://[^"]+/([^"]+)"')

parser = ArgumentParser(description=__doc__)
parser.add_argument(
    "--project-id", default="moz-fx-data-derived-datasets", help="The project ID."
)
parser.add_argument(
    "--dataset",
    default="udf",
    help="The name of the dataset the persistent UDFs will be stored in.",
)
parser.add_argument(
    "--udf-dir",
    default="udf/",
    help="The directory where declarations of temporary UDFs are stored.",
)
parser.add_argument(
    "--gcs-bucket",
    default="moz-fx-data-prod-bigquery-etl",
    help="The GCS bucket to which dependency files are uploaded.",
)
parser.add_argument(
    "--gcs-path",
    default="",
    help="The path within the GCS bucket to which dependency files are uploaded.",
)
parser.add_argument(
    "--dep-dir",
    default="udf/lib/",
    help="The directory where JavaScript dependency files for UDFs are stored.",
)


def main():
    """Publish persistent UDFs."""
    args = parser.parse_args()
    raw_udfs = read_udf_dirs(args.udf_dir)
    published_udfs = []
    client = bigquery.Client(args.project_id)

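    # Upload JavaScript dependencies to GCS first so that published UDFs can
    # reference them by their gs:// URLs.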
    if args.dep_dir:
        push_dependencies_to_gcs(args.gcs_bucket, args.gcs_path, args.dep_dir)

    for raw_udf in raw_udfs:
        # get all dependencies for the UDF and publish each one as a
        # persistent UDF, dependencies first
        udfs_to_publish = accumulate_dependencies([], raw_udfs, raw_udf)
        udfs_to_publish.append(raw_udf)

        for dep in udfs_to_publish:
            if dep not in published_udfs:
                publish_persistent_udf(
                    raw_udfs[dep],
                    client,
                    args.dataset,
                    args.project_id,
                    args.gcs_bucket,
                    args.gcs_path,
                )
                published_udfs.append(dep)


def publish_persistent_udf(raw_udf, client, dataset, project_id, gcs_bucket, gcs_path):
    """Transform a temporary UDF into a persistent UDF and publish it."""
    for definition in raw_udf.definitions:
        # Within a standard SQL function, references to other entities require
        # explicit project IDs.
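        # For example, a reference like udf_foo(...) is rewritten to
        # `<project_id>`.<dataset>.foo(...) -- foo here is a hypothetical name.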
        query_with_renamed_udfs = UDF_RE.sub(
            "`" + project_id + "`." + dataset + "." + r"\1", definition
        )
        query_with_renamed_udfs = query_with_renamed_udfs.replace(
            "CREATE TEMP FUNCTION", "CREATE OR REPLACE FUNCTION"
        )

        # adjust paths for dependencies stored in GCS
        query = OPTIONS_LIB_RE.sub(
            fr'library = "gs://{gcs_bucket}/{gcs_path}\1"', query_with_renamed_udfs
        )
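        # run the CREATE OR REPLACE FUNCTION statement and wait for completion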
        client.query(query).result()


def push_dependencies_to_gcs(bucket, path, dep_dir):
    """Upload the JavaScript dependency files for UDFs to the GCS bucket."""
    client = storage.Client()
    gcs_bucket = client.get_bucket(bucket)
    for root, _dirs, files in os.walk(dep_dir):
        for filename in files:
            blob = gcs_bucket.blob(path + filename)
            blob.upload_from_filename(os.path.join(root, filename))


if __name__ == "__main__":
    main()