Merged

Commits (155)
e8411f1
test
May 1, 2025
df1c5f1
testing local connection to Opensearch
May 1, 2025
a4025ca
Fix OpenSearch issues
aguest-kc May 29, 2025
2c6da1b
Disable the OpenSearch Dashboards security
aguest-kc Aug 7, 2025
445d412
[DEV-13726] Add DuckDB dependency
aguest-kc Nov 5, 2025
9a0f694
[DEV-13726] Add DuckDB download strategy
aguest-kc Nov 5, 2025
1c48b93
[DEV-13726] Update number formatting and import order
aguest-kc Nov 5, 2025
5cf9413
[DEV-13726] Add DuckDB type hints and imports
aguest-kc Nov 5, 2025
55a2ea9
[DEV-13726] Make Spark downloads work with DuckDB
aguest-kc Nov 5, 2025
b4f9340
[DEV-13726] Clean up write CSV function
aguest-kc Nov 5, 2025
51c4762
[DEV-13726] Pin DuckDB version and install extensions
aguest-kc Nov 5, 2025
1a9a320
[DEV-13726] Time the entire download process
aguest-kc Nov 5, 2025
28c5185
[DEV-13726] Remove DuckDB flag from viewset
aguest-kc Nov 5, 2025
8091b75
[DEV-13726] Manually set memory limit for DuckDB
aguest-kc Nov 6, 2025
ee83d50
[DEV-13726] Update AWS config
aguest-kc Nov 6, 2025
f8b857d
[DEV-13726] Use env vars for AWS secret
aguest-kc Nov 6, 2025
d86e1a1
Merge pull request #4538 from fedspendingtransparency/mod/dev-13706-t…
sethstoudenmier Nov 7, 2025
ee645b9
[DEV-13726] Update the Broker DB URL env variable name
aguest-kc Nov 12, 2025
7b05e11
[DEV-13726] Update Broker env var name
aguest-kc Nov 12, 2025
4ebd337
Merge pull request #4541 from fedspendingtransparency/staging
loreleitrimberger Nov 12, 2025
cb264a3
Merge branch 'qat' into ftr/dev-13726-duckdb-file-b-downloads
aguest-kc Nov 13, 2025
56b7bea
[DEV-13912] fix cache for downloads
loreleitrimberger Nov 13, 2025
b4e4ec3
[DEV-13726] Add comment
aguest-kc Nov 13, 2025
89b7983
[DEV-13912] add test
loreleitrimberger Nov 13, 2025
0c5a730
[DEV-13912] fix test
loreleitrimberger Nov 13, 2025
7018c84
Merge pull request #4544 from fedspendingtransparency/fix/dev-13988-f…
sethstoudenmier Nov 13, 2025
58ea169
[DEV-13912] fix test
loreleitrimberger Nov 13, 2025
d9bf9b6
[DEV-13912] format
loreleitrimberger Nov 14, 2025
4077a84
[DEV-13912] update test
loreleitrimberger Nov 14, 2025
3cbf841
[DEV-13912] update test
loreleitrimberger Nov 14, 2025
56da84c
[DEV-13912] update test
loreleitrimberger Nov 14, 2025
37c4a5b
[DEV-13726] Update S3 auth and CSV generation
aguest-kc Nov 17, 2025
cf75a9c
[DEV-13726] Fix typo in DuckDB secret
aguest-kc Nov 17, 2025
ea2cb08
[DEV-13726] Remove test CSV file row limit
aguest-kc Nov 17, 2025
6a40043
Merge branch 'qat' into ftr/dev-13726-duckdb-file-b-downloads
aguest-kc Nov 18, 2025
8b01fc4
[DEV-13726] Consolidate SQL statements
aguest-kc Nov 18, 2025
4b72ec8
[DEV-13929] add hash key for incremental updates
loreleitrimberger Nov 18, 2025
39e614f
[DEV-13726] Update federal account dataframe
aguest-kc Nov 19, 2025
3b5bc75
[DEV-13726] Update federal account dataframe
aguest-kc Nov 19, 2025
cd7639a
[DEV-13939] fix df, add test
loreleitrimberger Nov 20, 2025
785c672
[DEV-13939] add submission_period to select
loreleitrimberger Nov 20, 2025
b3e6b96
[DEV-13939] fix agemcy_name column ambiguity
loreleitrimberger Nov 20, 2025
94ca534
[DEV-13939] test updates
loreleitrimberger Nov 20, 2025
787059a
[DEV-13939] test updates
loreleitrimberger Nov 20, 2025
de11d86
[DEV-13939] test updates
loreleitrimberger Nov 20, 2025
c8ebf2d
[DEV-13939] test updates
loreleitrimberger Nov 20, 2025
35d92ac
[DEV-13939] test updates
loreleitrimberger Nov 20, 2025
1d393a4
[DEV-13939] test updates
loreleitrimberger Nov 20, 2025
ca977dd
[DEV-13939] test updates
loreleitrimberger Nov 20, 2025
5fdd82d
[DEV-13939] test updates
loreleitrimberger Nov 20, 2025
edbed8c
[DEV-13939] format
loreleitrimberger Nov 21, 2025
24881dc
Merge branch 'qat' into ftr/dev-13726-duckdb-file-b-downloads
aguest-kc Nov 21, 2025
ba1ca86
[DEV-13726] Flake8 fixes and comment clean up
aguest-kc Nov 21, 2025
29c25f6
[DEV-13726] black formatting fixes
aguest-kc Nov 21, 2025
f1a54f0
[DEV-13939] update award_financial_download
loreleitrimberger Nov 24, 2025
5184932
[DEV-13939] update award_financial_download
loreleitrimberger Nov 24, 2025
d2057f1
[DEV-13939] update award_financial_download
loreleitrimberger Nov 24, 2025
f0aeff1
[DEV-13939] rearrange tests
loreleitrimberger Nov 25, 2025
e656000
Merge branch 'qat' into ftr/dev-13939-file-a-b-hash
loreleitrimberger Nov 25, 2025
e94dd70
[DEV-13939] fix sam_recipient test
loreleitrimberger Nov 25, 2025
3db951a
[DEV-13939] update incremental
loreleitrimberger Nov 26, 2025
1444a70
Merge branch 'qat' into ftr/dev-13726-duckdb-file-b-downloads
aguest-kc Dec 1, 2025
6c2d449
[DEV-13939] add merge_hash_key check
loreleitrimberger Dec 1, 2025
325b000
[DEV-13726] Update SQL functions
aguest-kc Dec 1, 2025
c18412b
[DEV-13939] update award_financial_download to remove url_encode
loreleitrimberger Dec 1, 2025
74c1e09
[DEV-13939] update award_financial_download to remove url_encode
loreleitrimberger Dec 1, 2025
3ad2e48
[DEV-13939] remove pyarrow
loreleitrimberger Dec 1, 2025
dce5833
[DEV-13726] Add spark parameter
aguest-kc Dec 1, 2025
52960bd
[DEV-13726] Black format fix
aguest-kc Dec 1, 2025
ea2bacd
[DEV-13939] add col to award_financial_downloads
loreleitrimberger Dec 1, 2025
0f1a396
Merge pull request #4551 from fedspendingtransparency/staging
sethstoudenmier Dec 2, 2025
561ca6d
[DEV-13912] fix json_request sorting
loreleitrimberger Dec 3, 2025
2fa4767
[DEV-13912] fix json_request sorting
loreleitrimberger Dec 3, 2025
777bce2
[DEV-13726] Revert default Spark strategy to Databricks
aguest-kc Dec 4, 2025
faa7e16
[DEV-13726] Use LocalStrategy instead of DatabricksStrategy
aguest-kc Dec 4, 2025
8b15fa4
[DEV-13726] Only use Spark for File A downloads
aguest-kc Dec 4, 2025
700762a
[DEV-13939] update tests
loreleitrimberger Dec 5, 2025
a0e1d12
Merge branch 'qat' into ftr/dev-13939-file-a-b-hash
loreleitrimberger Dec 5, 2025
ee829d2
Merge pull request #4547 from fedspendingtransparency/ftr/dev-13939-f…
loreleitrimberger Dec 5, 2025
2f31e1e
Merge branch 'qat' into ftr/dev-13912-fix-download-cache
loreleitrimberger Dec 5, 2025
0a44a7f
Merge pull request #4543 from fedspendingtransparency/ftr/dev-13912-f…
loreleitrimberger Dec 5, 2025
990e37b
Merge branch 'qat' into ftr/dev-13726-duckdb-file-b-downloads
aguest-kc Dec 8, 2025
98339e2
[DEV-13938] Add partitionBy columns
aguest-kc Dec 9, 2025
5e8628c
test upgrade to postgres16
boozallendanny Dec 9, 2025
89e6b21
test
May 1, 2025
e061f2b
testing local connection to Opensearch
May 1, 2025
12caae5
Fix OpenSearch issues
aguest-kc May 29, 2025
5492c40
Disable the OpenSearch Dashboards security
aguest-kc Aug 7, 2025
87bad6a
Merge branch 'mod/esUpgrade' of https://github.com/fedspendingtranspa…
aguest-kc Dec 9, 2025
d369952
install from postgresql repo
boozallendanny Dec 9, 2025
31dad16
Set OpenSearch memory to 2GB
aguest-kc Dec 9, 2025
a1a45ec
Merge pull request #4384 from fedspendingtransparency/mod/esUpgrade
aguest-kc Dec 9, 2025
d25e6a9
[DEV-14150] Add partitioning and enable CDF on transaction_download
sethstoudenmier Dec 9, 2025
c896de6
Merge branch 'qat' into ftr/dev-13938-custom-account-paritioning
aguest-kc Dec 10, 2025
8a2ee5e
[DEV-14150] Formatting
sethstoudenmier Dec 10, 2025
3064ef5
Merge branch 'qat' of https://github.com/fedspendingtransparency/usas…
sethstoudenmier Dec 10, 2025
d8fadcd
[DEV-13938] Rename table spec property
aguest-kc Dec 10, 2025
0ce41ba
[DEV-13938] Update table specs to match
aguest-kc Dec 10, 2025
6522b80
[DEV-13726] Move DuckDB setup to setup_spark_session() method
aguest-kc Dec 10, 2025
ac7e90f
[DEV-13726] Convert to UNIX line endings
aguest-kc Dec 10, 2025
df1927e
[DEV-14150] Add placeholder to table_spec
sethstoudenmier Dec 11, 2025
8a9a4f8
[DEV-14150] Handle the updated table_spec
sethstoudenmier Dec 11, 2025
8e402f3
[DEV-13726] Revert Spark table logic
aguest-kc Dec 11, 2025
d12f8a8
[DEV-13726] Replaced hardcoded values with variables
aguest-kc Dec 11, 2025
6fb3e58
[DEV-13726] Remove case that would never be reached
aguest-kc Dec 11, 2025
c0adbd1
Merge branch 'qat' into ftr/dev-13726-duckdb-file-b-downloads
aguest-kc Dec 11, 2025
cd118f9
Merge pull request #4554 from fedspendingtransparency/mod/dev-14150-t…
sethstoudenmier Dec 12, 2025
41fab45
[DEV-13726] Explain why DuckDB version is pinned
aguest-kc Dec 12, 2025
0756469
[DEV-13726] Create a DuckDB Dockerfile
aguest-kc Dec 12, 2025
b8aea3d
[DEV-13726] Use CONFIG more
aguest-kc Dec 12, 2025
dd5c153
[DEV-13726] Log as exceptions and raise exception
aguest-kc Dec 12, 2025
a3086e1
[DEV-14093] Initial work towards triggering EMR job
sethstoudenmier Dec 12, 2025
a3d2a97
[DEV-13726] Add file_number in Pandas
aguest-kc Dec 12, 2025
2645a8a
Merge branch 'qat' into ftr/dev-13726-duckdb-file-b-downloads
aguest-kc Dec 12, 2025
454e6ba
[DEV-13726] Add `rel` definition back and flake8 fix
aguest-kc Dec 15, 2025
5ff2806
[DEV-14093] cleanup
sethstoudenmier Dec 15, 2025
012b5a1
Merge branch 'qat' into ftr/dev-13938-custom-account-paritioning
aguest-kc Dec 15, 2025
e63e52d
Merge pull request #4548 from fedspendingtransparency/ftr/dev-13726-d…
aguest-kc Dec 15, 2025
dd2a0ab
[DEV-13939] add enableChangeDataFeed for tables
loreleitrimberger Dec 15, 2025
2b6c35c
[DEV-14093] handle merge conflict
sethstoudenmier Dec 15, 2025
0294722
[DEV-13938] Flake8 fix
aguest-kc Dec 15, 2025
b2a612c
Merge branch 'qat' into ftr/dev-13938-custom-account-paritioning
aguest-kc Dec 15, 2025
58eac42
Merge branch 'qat' into ftr/dev-13939-file-a-b-hash
loreleitrimberger Dec 15, 2025
9629626
Merge pull request #4559 from fedspendingtransparency/ftr/dev-13939-f…
loreleitrimberger Dec 15, 2025
7ba8372
[DEV-14106] - Add award categories to file c download
zachflanders-frb Dec 15, 2025
6f89392
Merge branch 'qat' into ftr/dev-13938-custom-account-paritioning
aguest-kc Dec 15, 2025
62dc6b8
[DEV-13938] Flake8 fixes
aguest-kc Dec 15, 2025
831c1e8
[DEV-14106] - Update groupby to include is_fpds
zachflanders-frb Dec 15, 2025
1654891
[DEV-14106] - fixing tests
zachflanders-frb Dec 15, 2025
bd8216c
[DEV-13938] Remove partitionBy in INSERT function
aguest-kc Dec 16, 2025
528ec55
Use DuckDB v1.4.3
aguest-kc Dec 16, 2025
553ceb6
Use DuckDB v1.4.3
aguest-kc Dec 16, 2025
e113bbf
[DEV-13938] Remove partitionBy() in load
aguest-kc Dec 16, 2025
7f106a8
[DEV-14106] - Add tests, update mixin
zachflanders-frb Dec 16, 2025
7febe71
Merge branch 'qat' into ftr/dev-14106-file-c-download-award-categories
zachflanders-frb Dec 16, 2025
c19512d
Merge pull request #4556 from fedspendingtransparency/ftr/dev-13938-c…
aguest-kc Dec 16, 2025
59a3d4f
Merge branch 'qat' into ftr/dev-14106-file-c-download-award-categories
zachflanders-frb Dec 16, 2025
b868eeb
typo: replace paramater with parameter
thsmale Dec 9, 2025
e7276e3
Merge branch 'qat' into test/duckdb-1.4.3
aguest-kc Dec 16, 2025
a11b50d
Merge pull request #4561 from fedspendingtransparency/test/duckdb-1.4.3
aguest-kc Dec 16, 2025
897b6b2
Merge branch 'qat' into ftr/dev-14106-file-c-download-award-categories
zachflanders-frb Dec 16, 2025
35a6be3
Merge pull request #4560 from fedspendingtransparency/ftr/dev-14106-f…
zachflanders-frb Dec 16, 2025
e9959f5
Merge branch 'qat' into fix/dev-14183-parameter-typo
zachflanders-frb Dec 16, 2025
76bcb26
Merge pull request #4562 from fedspendingtransparency/fix/dev-14183-p…
zachflanders-frb Dec 17, 2025
aa9cae0
Merge branch 'qat' into mod/postgresql16
zachflanders-frb Dec 17, 2025
37c832d
[DEV-13912] update download response to be sorted
loreleitrimberger Dec 17, 2025
581da18
Merge pull request #4553 from fedspendingtransparency/mod/postgresql16
zachflanders-frb Dec 17, 2025
a6efeef
Merge branch 'qat' of https://github.com/fedspendingtransparency/usas…
loreleitrimberger Dec 17, 2025
9bfade2
Merge branch 'qat' into ftr/dev-14093-trigger-emr-job
sethstoudenmier Dec 17, 2025
e3ff3af
[DEV-13912] sort everything, readd unsorted columns
loreleitrimberger Dec 17, 2025
c22f957
Merge pull request #4558 from fedspendingtransparency/ftr/dev-14093-t…
sethstoudenmier Dec 17, 2025
ecedc0a
[DEV-13912] fix check for columns
loreleitrimberger Dec 18, 2025
8c3bb8e
[DEV-13912] re-add sorted_json_request]
loreleitrimberger Dec 18, 2025
8f5d8d7
Merge branch 'qat' into ftr/dev-13912-fix-download-cache
loreleitrimberger Dec 18, 2025
bf527b0
Merge pull request #4564 from fedspendingtransparency/ftr/dev-13912-f…
zachflanders-frb Dec 18, 2025
16 changes: 12 additions & 4 deletions Dockerfile
@@ -10,17 +10,25 @@ FROM python:3.10.12-slim-bullseye

WORKDIR /dockermount

##### Install postgres 16
RUN apt-get update && apt-get install -y wget gnupg lsb-release \
&& wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - \
&& echo "deb http://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main" \
> /etc/apt/sources.list.d/pgdg.list \
&& apt-get update \
&& apt-get install -y postgresql-16


RUN apt update && apt install -y \
curl \
gcc \
libpq-dev \
postgresql-13
libpq-dev

##### Copy python packages
COPY . /dockermount

RUN python3 -m pip install -r requirements/requirements.txt && \
python3 -m pip install -r requirements/requirements-server.txt && \
python3 -m pip install ansible==2.9.15 awscli==1.34.19

##### Ensure Python STDOUT gets sent to container logs
# Ensure Python STDOUT gets sent to container logs
ENV PYTHONUNBUFFERED=1
12 changes: 12 additions & 0 deletions Dockerfile.duckdb
@@ -0,0 +1,12 @@
# Dockerfile for downloads using DuckDB

FROM usaspending-backend:latest

ENV DUCKDB_VERSION=1.4.3

# Install DuckDB extensions
RUN mkdir -p /root/.duckdb/extensions/v$DUCKDB_VERSION/linux_amd64 && \
curl http://extensions.duckdb.org/v$DUCKDB_VERSION/linux_amd64/delta.duckdb_extension.gz | gunzip > /root/.duckdb/extensions/v$DUCKDB_VERSION/linux_amd64/delta.duckdb_extension && \
curl http://extensions.duckdb.org/v$DUCKDB_VERSION/linux_amd64/aws.duckdb_extension.gz | gunzip > /root/.duckdb/extensions/v$DUCKDB_VERSION/linux_amd64/aws.duckdb_extension && \
curl http://extensions.duckdb.org/v$DUCKDB_VERSION/linux_amd64/httpfs.duckdb_extension.gz | gunzip > /root/.duckdb/extensions/v$DUCKDB_VERSION/linux_amd64/httpfs.duckdb_extension && \
curl http://extensions.duckdb.org/v$DUCKDB_VERSION/linux_amd64/postgres_scanner.duckdb_extension.gz | gunzip > /root/.duckdb/extensions/v$DUCKDB_VERSION/linux_amd64/postgres_scanner.duckdb_extension
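
Because the extensions are vendored into the image at build time, a container can load them without any network access. A minimal runtime check might look like the following sketch, using the duckdb package pinned in requirements-app.txt (illustrative only, not part of this PR):

import duckdb

# Extensions resolve from ~/.duckdb/extensions/v<version>/linux_amd64,
# which is exactly where the Dockerfile above placed them.
con = duckdb.connect()
for ext in ("delta", "aws", "httpfs"):
    con.execute(f"LOAD {ext};")  # succeeds offline because the files are pre-installed
print(con.execute("SELECT extension_name FROM duckdb_extensions() WHERE loaded").fetchall())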
22 changes: 12 additions & 10 deletions docker-compose.yml
@@ -167,7 +167,7 @@ services:
- usaspending # must pass --profile usaspending to docker compose for this to come up
- test
- ci
image: docker.elastic.co/elasticsearch/elasticsearch:7.1.1
image: opensearchproject/opensearch:2.9.0
container_name: usaspending-es
environment:
- node.name=usaspending-es
@@ -177,35 +177,36 @@
- network.host=0.0.0.0
- transport.host=localhost
- bootstrap.memory_lock=true
- "ES_JAVA_OPTS=-Xms2048m -Xmx2048m" # Ensure Docker is allocated plenty of memory, otherwise this will fail
- plugins.security.disabled=true
- "OPENSEARCH_JAVA_OPTS=-Xms2g -Xmx2g" # Ensure Docker is allocated plenty of memory, otherwise this will fail
# Inject plugin install, then resume with original entrypoint command
command: >
/bin/sh -c "
if [ ! -d /usr/share/elasticsearch/plugins/mapper-murmur3 ]; then
# Certificate problem workaround when on VPN - wget without checking cert, then install from local filesystem
wget --no-check-certificate https://artifacts.elastic.co/downloads/elasticsearch-plugins/mapper-murmur3/mapper-murmur3-7.1.1.zip
./bin/elasticsearch-plugin install file:///usr/share/elasticsearch/mapper-murmur3-7.1.1.zip
if [ ! -d /usr/share/opensearch/plugins/mapper-murmur3 ]; then
/usr/share/opensearch/bin/opensearch-plugin install mapper-murmur3
fi
/usr/local/bin/docker-entrypoint.sh"
/usr/share/opensearch/opensearch-docker-entrypoint.sh"
ulimits:
memlock:
soft: -1
hard: -1
volumes:
- type: volume
source: local_es_data
target: /usr/share/elasticsearch/data
target: /usr/share/opensearch/data
ports:
- 9200:9200

usaspending-kibana-es:
profiles:
- usaspending # must pass --profile usaspending to docker compose for this to come up
image: docker.elastic.co/kibana/kibana-oss:7.1.1
image: opensearchproject/opensearch-dashboards:2.9.0
container_name: usaspending-kibana-es
# OPENSEARCH_HOSTS should match the port for "usaspending-es"
environment:
- ELASTICSEARCH_HOSTS="http://docker.for.mac.localhost:9200"
- OPENSEARCH_HOSTS="http://usaspending-es:9200"
- DISABLE_SECURITY_DASHBOARDS_PLUGIN=true
- DISABLE_SECURITY_PLUGIN=true
ports:
- 5601:5601

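With the security plugins disabled on both containers, plain HTTP is enough to smoke-test the stack from the host. A hedged example using the port mappings above (requests is just one convenient client, not an asserted project dependency):

import requests

# OpenSearch is published on localhost:9200 with security disabled,
# so no TLS or credentials are involved.
info = requests.get("http://localhost:9200").json()
print(info["version"]["number"])  # should report the 2.9.0 image pinned above
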
@@ -351,6 +352,7 @@ services:
# make docker-compose-run profiles="--profile spark" args="--rm -e MINIO_HOST=minio -e JDBC_URL -e COMPONENT_NAME='My Spark Prototype Script' spark-submit \
# --packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1 \
# /project/usaspending_api/etl/tests/path_to_your_spark_prototype_script.py"

spark-submit:
profiles:
- spark # must pass --profile spark to docker compose for this to come up
1 change: 1 addition & 0 deletions requirements/requirements-app.txt
@@ -16,6 +16,7 @@ djangorestframework==3.15.*
docutils==0.20.1
drf-api-tracking==1.8.4
drf-extensions==0.7.*
duckdb==1.4.3 # Pinned because DuckDB extensions have to be manually installed for each specific version
elasticsearch-dsl==7.4.*
elasticsearch==7.10.*
et-xmlfile==1.1.0
@@ -14,7 +14,7 @@ This endpoint returns federal budgetary resources by fiscal year and fiscal peri
+ `fiscal_year` (optional, number)
The fiscal year to retrieve, 2017 or later.
+ `fiscal_period` (optional, number)
The fiscal period. If this optional parameter is provided then `fiscal_year` is a required parameter. If `fiscal_period` is provided without `fiscal_year`, a 400 error is returned. Valid values: 2-12 (2 = November ... 12 = September). For retrieving quarterly data, provide the period which equals 'quarter * 3' (e.g. Q2 = P6). If neither paramater is provided, the entire available history will be returned.
The fiscal period. If this optional parameter is provided then `fiscal_year` is a required parameter. If `fiscal_period` is provided without `fiscal_year`, a 400 error is returned. Valid values: 2-12 (2 = November ... 12 = September). For retrieving quarterly data, provide the period which equals 'quarter * 3' (e.g. Q2 = P6). If neither parameter is provided, the entire available history will be returned.

+ Response 200 (application/json)

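For orientation, the quarter-to-period arithmetic described above as a tiny helper (hypothetical, not part of this endpoint or codebase):

def quarter_to_fiscal_period(quarter: int) -> int:
    """Map a fiscal quarter (1-4) to the period that closes it, e.g. Q2 -> P6."""
    if not 1 <= quarter <= 4:
        raise ValueError("quarter must be between 1 and 4")
    return quarter * 3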
201 changes: 181 additions & 20 deletions usaspending_api/common/etl/spark.py
@@ -7,11 +7,16 @@

import logging
import math
import os
import shutil
import time
from collections import namedtuple
from itertools import chain
from typing import List

import duckdb
from duckdb.experimental.spark.sql import SparkSession as DuckDBSparkSession
from duckdb.experimental.spark.sql.dataframe import DataFrame as DuckDBDataFrame
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, concat, concat_ws, expr, lit, regexp_replace, to_date, transform, when
from pyspark.sql.types import ArrayType, DecimalType, StringType, StructType
@@ -48,6 +53,7 @@
ZipsGrouped,
)
from usaspending_api.reporting.models import ReportingAgencyMissingTas, ReportingAgencyOverview
from usaspending_api.settings import CSV_LOCAL_PATH, IS_LOCAL, USASPENDING_AWS_REGION
from usaspending_api.submissions.models import DABSSubmissionWindowSchedule, SubmissionAttributes

MAX_PARTITIONS = CONFIG.SPARK_MAX_PARTITIONS
@@ -555,31 +561,123 @@ def _generate_global_view_sql_strings(tables: List[str], jdbc_url: str) -> List[
return sql_strings


def create_ref_temp_views(spark: SparkSession, create_broker_views: bool = False):
def create_ref_temp_views(spark: SparkSession | DuckDBSparkSession, create_broker_views: bool = False):
"""Create global temporary Spark reference views that sit atop remote PostgreSQL RDS tables
Setting create_broker_views to True will create views for all tables listed in _BROKER_REF_TABLES
Note: They will all be listed under global_temp.{table_name}

Args:
spark (SparkSession | DuckDBSparkSession): Spark session
create_broker_views (bool): Should the temporary views, using the Broker tables, be created
Default: False
"""

# Create USAS temp views
rds_ref_tables = build_ref_table_name_list()
rds_sql_strings = _generate_global_view_sql_strings(
tables=rds_ref_tables,
jdbc_url=get_usas_jdbc_url(),
)
logger.info(f"Creating the following tables under the global_temp database: {rds_ref_tables}")
for sql_statement in rds_sql_strings:
spark.sql(sql_statement)

# Create Broker temp views
if create_broker_views:
broker_sql_strings = _generate_global_view_sql_strings(
tables=_BROKER_REF_TABLES,
jdbc_url=get_broker_jdbc_url(),
)
logger.info(f"Creating the following Broker tables under the global_temp database: {_BROKER_REF_TABLES}")
for sql_statement in broker_sql_strings:
spark.sql(sql_statement)

match isinstance(spark, DuckDBSparkSession):
case True:
logger.info("Creating ref temp views using DuckDB")

if IS_LOCAL:
spark.sql(
f"""
CREATE OR REPLACE SECRET (
TYPE s3,
PROVIDER config,
KEY_ID '{CONFIG.AWS_ACCESS_KEY}',
SECRET '{CONFIG.AWS_SECRET_KEY}',
ENDPOINT '{CONFIG.AWS_S3_ENDPOINT}',
URL_STYLE 'path',
USE_SSL 'false'
);
"""
)
else:
# DuckDB will prepend the HTTP or HTTPS so we need to strip it from the AWS endpoint URL
endpoint_url = CONFIG.AWS_S3_ENDPOINT.replace("http://", "").replace("https://", "")
spark.sql(
f"""
CREATE OR REPLACE SECRET (
TYPE s3,
REGION '{USASPENDING_AWS_REGION}',
ENDPOINT '{endpoint_url}',
PROVIDER 'credential_chain'
);
"""
)

_download_delta_tables = [
{"schema": "rpt", "table_name": "account_balances_download"},
{"schema": "rpt", "table_name": "object_class_program_activity_download"},
]

# The DuckDB Delta extension is needed to interact with DeltaLake tables
spark.sql("LOAD delta; CREATE SCHEMA IF NOT EXISTS rpt;")
for table in _download_delta_tables:
s3_path = (
f"s3://{CONFIG.SPARK_S3_BUCKET}/{CONFIG.DELTA_LAKE_S3_PATH}/{table['schema']}/{table['table_name']}"
)
try:
spark.sql(
f"""
CREATE OR REPLACE TABLE {table["schema"]}.{table["table_name"]} AS
SELECT * FROM delta_scan('{s3_path}');
"""
)
logger.info(f"Successfully created table {table['schema']}.{table['table_name']}")
except duckdb.IOException:
logger.exception(f"Failed to create table {table['table_name']}")
raise RuntimeError(f"Failed to create table {table['table_name']}")

# The DuckDB Postgres extension is needed to connect to the USAS Postgres DB
spark.sql("LOAD postgres; CREATE SCHEMA IF NOT EXISTS global_temp;")
spark.sql(f"ATTACH '{CONFIG.DATABASE_URL}' AS usas (TYPE postgres, READ_ONLY);")

for table in rds_ref_tables:
try:
spark.sql(f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM usas.public.{table};")
except duckdb.CatalogException:
logger.exception(f"Failed to create view {table} for {table}")
raise RuntimeError(f"Failed to create view {table} for {table}")

if create_broker_views:
spark.sql(
f"""
ATTACH '{CONFIG.BROKER_DB}' AS broker (TYPE postgres, READ_ONLY);
"""
)
logger.info(
f"Creating the following Broker tables under the global_temp database: {_BROKER_REF_TABLES}"
)
for table in _BROKER_REF_TABLES:
try:
spark.sql(f"CREATE OR REPLACE VIEW global_temp.{table} AS SELECT * FROM broker.public.{table};")
except duckdb.CatalogException:
logger.exception(f"Failed to create view {table} for {table}")
raise RuntimeError(f"Failed to create view {table} for {table}")
case False:
logger.info("Creating ref temp views using Spark")

rds_sql_strings = _generate_global_view_sql_strings(
tables=rds_ref_tables,
jdbc_url=get_usas_jdbc_url(),
)

for sql_statement in rds_sql_strings:
spark.sql(sql_statement)

if create_broker_views:
broker_sql_strings = _generate_global_view_sql_strings(
tables=_BROKER_REF_TABLES,
jdbc_url=get_broker_jdbc_url(),
)
logger.info(
f"Creating the following Broker tables under the global_temp database: {_BROKER_REF_TABLES}"
)
for sql_statement in broker_sql_strings:
spark.sql(sql_statement)

logger.info("Created the reference views in the global_temp database")

@@ -595,9 +693,10 @@ def write_csv_file(
) -> int:
"""Write DataFrame data to CSV file parts.
Args:
spark: passed-in active SparkSession
df: the DataFrame wrapping the data source to be dumped to CSV.
parts_dir: Path to dir that will contain the outputted parts files from partitions
spark: Passed-in active SparkSession
df: The DataFrame wrapping the data source to be dumped to CSV.
parts_dir: Path to dir that will contain the outputted parts files from partitions
num_partitions: Indicates the number of partitions to use when writing the Dataframe
overwrite: Whether to replace the file CSV files if they already exist by that name
max_records_per_file: Suggestion to Spark of how many records to put in each written CSV file part,
if it will end up writing multiple files.
@@ -635,6 +734,68 @@
return df_record_count


def write_csv_file_duckdb(
df: DuckDBDataFrame,
download_file_name: str,
temp_csv_directory_path: str = CSV_LOCAL_PATH,
max_records_per_file: int = EXCEL_ROW_LIMIT,
logger: logging.Logger | None = None,
delimiter: str = ",",
) -> tuple[int, list[str]]:
"""Write DataFrame data to CSV file parts.
Args:
df: The DataFrame wrapping the data source to be dumped to CSV.
download_file_name: Name of the download being generated.
temp_csv_directory_path: Directory that will contain the individual CSV files before zipping.
Defaults to CSV_LOCAL_PATH
max_records_per_file: Max number of records to put in each written CSV file.
Defaults to EXCEL_ROW_LIMIT
logger: Logging instance to use.
Defaults to None
delimiter: Character used to separate columns in the CSV
Defaults to ","
Returns:
record count of the DataFrame that was used to populate the CSV file(s)
list of full path(s) to the temp CSV file(s)
"""
start = time.time()
logger = logger or logging.getLogger(__name__)  # default is None, but logger is used unconditionally below
_pandas_df = df.toPandas()
_pandas_df["file_number"] = (_pandas_df.index // max_records_per_file) + 1
df_record_count = len(_pandas_df)
rel = duckdb.from_df(_pandas_df)

full_file_paths = []

logger.info(f"Writing source data DataFrame to csv files for file {download_file_name}")
rel.to_csv(
file_name=f"{temp_csv_directory_path}{download_file_name}",
sep=delimiter,
escapechar='"',
header=True,
partition_by=["file_number"],
write_partition_columns=False, # Don't include the columns that are used for partitioning in the CSV
overwrite=True,
)

# Move and rename the CSV files to match the expected format
_partition_dirs = [
f"{temp_csv_directory_path}{download_file_name}/{d}"
for d in os.listdir(f"{temp_csv_directory_path}{download_file_name}")
]
for dir in _partition_dirs:
_old_csv_path = f"{dir}/{os.listdir(dir)[0]}"
_new_csv_path = (
f"{temp_csv_directory_path}{download_file_name}/{download_file_name}_{dir.split('=')[1].zfill(2)}.csv"
)
shutil.move(_old_csv_path, _new_csv_path)
full_file_paths.append(_new_csv_path)
os.rmdir(dir)

logger.info(f"{temp_csv_directory_path}{download_file_name} contains {df_record_count:,} rows of data")
logger.info(f"Wrote source data DataFrame to {len(full_file_paths)} CSV files in {(time.time() - start):3f}s")
return df_record_count, full_file_paths
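
A hypothetical invocation for orientation — the DataFrame variable, file name, and logger below are placeholders, not values taken from this codebase:

import logging

# `duckdb_df` would come from a DuckDBSparkSession query upstream.
row_count, csv_paths = write_csv_file_duckdb(
    df=duckdb_df,
    download_file_name="FY2025_File_B",
    logger=logging.getLogger(__name__),
)
# csv_paths -> [".../FY2025_File_B/FY2025_File_B_01.csv", ...]; each part is
# capped at EXCEL_ROW_LIMIT rows via the file_number partitioning above.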


def _merge_file_parts(fs, out_stream, conf, hadoop, partial_merged_file_path, part_file_list):
"""Read-in files in alphabetical order and append them one by one to the merged file"""
