script/publish_persistent_udfs

#!/usr/bin/env python3

"""
This script publishes all user-defined functions in a directory as persistent
UDFs in the {dirname} dataset.

The {dirname}_ prefix will be stripped from names of published UDFs.
"""

from argparse import ArgumentParser
import os
import sys
import re

from google.cloud import bigquery
from gcloud import storage

# sys.path needs to be modified to enable package imports from parent
# and sibling directories. Also see:
# https://stackoverflow.com/questions/6323860/sibling-package-imports/23542795#23542795
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from bigquery_etl.parse_udf import read_udf_dirs, accumulate_dependencies  # noqa E402


# Matches references to UDFs such as ``udf_js.foo``, ``udf_foo`` or
# ``legacy.bar``. Group 1 is the prefix (``udf_js``, ``udf`` or ``legacy``),
# group 2 is the function name. The name class is ``A-Z`` (not ``A-z``) so that
# punctuation between "Z" and "a" in ASCII -- "[", "\", "]", "^" and the
# backtick -- is not swallowed into the function name.
UDF_RE = re.compile(r"(udf_js|udf|legacy)(?:\.|_)([a-zA-Z0-9_]+)")

# Matches a ``library = "gs://..."`` option. Group 1 is the part of the URI
# that follows the last "/".
OPTIONS_LIB_RE = re.compile(r'library = "gs://[^"]+/([^"]+)"')

# File paths of UDFs that must never be published.
SKIP = {"udf/main_summary_scalars.sql"}

# Command-line interface: every option is declared in the table below (flag
# plus the keyword arguments for ``add_argument``) and registered in order.
parser = ArgumentParser(description=__doc__)
for _flag, _settings in (
    (
        "--project-id",
        {"default": "moz-fx-data-derived-datasets", "help": "The project ID."},
    ),
    (
        "--udf-dirs",
        {
            "nargs": "+",
            "default": ["udf/", "udf_js/"],
            "help": "The directories where declarations of UDFs are stored.",
        },
    ),
    (
        "--gcs-bucket",
        {
            "default": "moz-fx-data-prod-bigquery-etl",
            "help": "The GCS bucket where dependency files are uploaded to.",
        },
    ),
    (
        "--gcs-path",
        {
            "default": "",
            "help": "The GCS path in the bucket where dependency files are uploaded to.",
        },
    ),
    (
        "--dep-dir",
        {
            "default": "udf_js/lib/",
            "help": "The directory JavaScript dependency files for UDFs are stored.",
        },
    ),
):
    parser.add_argument(_flag, **_settings)
# Keep the module namespace free of the loop helpers.
del _flag, _settings


def main():
    """Publish the UDFs found in the configured directories.

    Parses the command-line arguments, reads the UDFs from ``--udf-dirs``,
    uploads the files under ``--dep-dir`` to GCS (skipped when the option is
    empty), and then publishes each UDF together with whatever
    ``accumulate_dependencies`` returns for it. Each UDF is published at most
    once, and UDFs whose file path is listed in ``SKIP`` are never published.
    """
    args = parser.parse_args()

    raw_udfs = read_udf_dirs(*args.udf_dirs)
    # Only used for "already published?" checks, so a set keeps each lookup
    # constant-time instead of scanning a growing list.
    published_udfs = set()
    client = bigquery.Client(args.project_id)

    if args.dep_dir:
        push_dependencies_to_gcs(
            args.gcs_bucket, args.gcs_path, args.dep_dir, args.project_id
        )

    for raw_udf in raw_udfs:
        # Collect the dependencies of this UDF; the UDF itself goes last so it
        # is published after everything accumulate_dependencies returned.
        udfs_to_publish = accumulate_dependencies([], raw_udfs, raw_udf)
        udfs_to_publish.append(raw_udf)

        for dep in udfs_to_publish:
            # Skip anything already published or explicitly excluded.
            if dep in published_udfs or raw_udfs[dep].filepath in SKIP:
                continue

            publish_persistent_udf(
                raw_udfs[dep],
                client,
                args.project_id,
                args.gcs_bucket,
                args.gcs_path,
            )
            published_udfs.add(dep)


def publish_persistent_udf(raw_udf, client, project_id, gcs_bucket, gcs_path):
    """Rewrite every definition of ``raw_udf`` and run it through ``client``.

    For each entry of ``raw_udf.definitions`` the UDF references matched by
    ``UDF_RE`` are qualified with the project ID, the library URIs matched by
    ``OPTIONS_LIB_RE`` are pointed at the configured GCS location, and the
    resulting query is executed and awaited.

    Args:
        raw_udf: Object exposing a ``definitions`` iterable of SQL strings.
        client: Object whose ``query(sql).result()`` runs a statement
            (``main`` passes a ``bigquery.Client``).
        project_id: Project used to qualify references to other UDFs.
        gcs_bucket: Bucket name placed in rewritten library URIs.
        gcs_path: Prefix placed between the bucket and the file name. It is
            concatenated without a separator, so a non-empty value should end
            with "/".
    """

    # Callables are used as replacements (rather than template strings) so
    # that backslashes in project_id / gcs_bucket / gcs_path are inserted
    # verbatim instead of being parsed as regex escapes or group references.
    def qualify_udf(match):
        # Within a standard SQL function, references to other entities require
        # explicit project IDs
        return f"`{project_id}`.{match.group(1)}.{match.group(2)}"

    def relocate_library(match):
        # adjust paths for dependencies stored in GCS
        return f'library = "gs://{gcs_bucket}/{gcs_path}{match.group(1)}"'

    for definition in raw_udf.definitions:
        query_with_renamed_udfs = UDF_RE.sub(qualify_udf, definition)
        query = OPTIONS_LIB_RE.sub(relocate_library, query_with_renamed_udfs)

        # Block until the statement has finished so failures surface here.
        client.query(query).result()


def push_dependencies_to_gcs(bucket, path, dep_dir, project_id):
    """Upload every file found under ``dep_dir`` to a GCS bucket.

    ``dep_dir`` is walked recursively. Each file is stored under the object
    name ``path + filename``, i.e. the local sub-directory structure is not
    reflected in the object name.

    Args:
        bucket: Name of the destination bucket.
        path: Prefix prepended (without a separator) to each file name.
        dep_dir: Local directory containing the dependency files.
        project_id: Project the storage client is created for.
    """
    storage_client = storage.Client(project_id)
    target_bucket = storage_client.get_bucket(bucket)

    for current_dir, _subdirs, filenames in os.walk(dep_dir):
        for name in filenames:
            destination = target_bucket.blob(path + name)
            local_file = os.path.join(current_dir, name)
            destination.upload_from_filename(local_file)


# Run the publisher only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()