feat(firestore-bigquery-export): prepare RC (#2206)
* chore(firestore-bigquery-changetracker): bump version

* fix(firestore-bigquery-export): added ts-expect-error and TODOs in the import script

* feat: try to immediately write to bq first

* chore: remove legacy backfill code

* feat: add max enqueue attempts param

* test: add flags to test, remove unused resource

* feat: add backup to gcs

* chore(firestore-bigquery-export): temporarily disable GCS

* chore: bump ext version

* fix(firestore-bigquery-export): comment out unused role for now and use logging

* fix(firestore-bigquery-export): implemented RC changes including logging keys

* chore(firestore-bigquery-export): update README and CHANGELOG

* chore(firestore-bigquery-export): update CHANGELOG
cabljac committed Nov 6, 2024
1 parent f7561e5 commit ea44778
Showing 16 changed files with 673 additions and 292 deletions.
8 changes: 8 additions & 0 deletions _emulator/.firebaserc
@@ -1,5 +1,13 @@
{
"projects": {
"default": "demo-test"
},
"targets": {},
"etags": {
"dev-extensions-testing": {
"extensionInstances": {
"firestore-bigquery-export": "02acbd8b443b9635716d52d65758a78db1e51140191caecaaf60d932d314a62a"
}
}
}
}
10 changes: 10 additions & 0 deletions firestore-bigquery-export/CHANGELOG.md
@@ -1,3 +1,13 @@
## Version 0.1.56

feat - improve sync strategy by immediately writing to BigQuery, and using Cloud Tasks only as a last resort

refactor - improve observability/logging of events

chore - remove legacy backfill code

fix - improved usage of the types from the change tracker package

## Version 0.1.55

feat - log failed queued tasks
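The headline change in 0.1.56 is the sync strategy: the extension now attempts a direct BigQuery write first and falls back to a Cloud Tasks queue only if that write fails. A minimal sketch of the pattern, assuming a hypothetical writeToBigQuery helper (the real extension records events through the firestore-bigquery-change-tracker package):

import { getFunctions } from "firebase-admin/functions";

// Hypothetical helper standing in for the change tracker's BigQuery write.
async function writeToBigQuery(event: Record<string, unknown>): Promise<void> {
  // ... insert the changelog row via the BigQuery client ...
}

export async function recordEvent(event: Record<string, unknown>): Promise<void> {
  try {
    // Fast path: write the changelog row to BigQuery immediately.
    await writeToBigQuery(event);
  } catch (err) {
    // Last resort: enqueue for the task-triggered syncBigQuery function to retry.
    await getFunctions().taskQueue("syncBigQuery").enqueue({ event });
  }
}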
8 changes: 2 additions & 6 deletions firestore-bigquery-export/README.md
@@ -126,8 +126,6 @@ To install an extension, your project must be on the [Blaze (pay as you go) plan

* Collection path: What is the path of the collection that you would like to export? You may use `{wildcard}` notation to match a subcollection of all documents in a collection (for example: `chatrooms/{chatid}/posts`). Parent Firestore Document IDs from `{wildcards}` can be returned in `path_params` as a JSON formatted string.

* Enable logging failed exports: If enabled, the extension will log event exports that failed to enqueue to Cloud Logging, to mitigate data loss.

* Enable Wildcard Column field with Parent Firestore Document IDs: If enabled, creates a column containing a JSON object of all wildcard ids from a documents path.

* Dataset ID: What ID would you like to use for your BigQuery dataset? This extension will create the dataset, if it doesn't already exist.
@@ -158,18 +156,16 @@ essential for the script to insert data into an already partitioned table.)

* Exclude old data payloads: If enabled, table rows will never contain old data (document snapshot before the Firestore onDocumentUpdate event: `change.before.data()`). The reduction in data should be more performant, and avoid potential resource limitations.

* Use Collection Group query: Do you want to use a [collection group](https://firebase.google.com/docs/firestore/query-data/queries#collection-group-query) query for importing existing documents? You have to enable collectionGroup query if your import path contains subcollections. Warning: A collectionGroup query will target every collection in your Firestore project that matches the 'Existing documents collection'. For example, if you have 10,000 documents with a subcollection named: landmarks, this will query every document in 10,000 landmarks collections.

* Cloud KMS key name: Instead of Google managing the key encryption keys that protect your data, you control and manage key encryption keys in Cloud KMS. If this parameter is set, the extension will specify the KMS key name when creating the BQ table. See the PREINSTALL.md for more details.

* Maximum number of enqueue attempts: This parameter will set the maximum number of attempts to enqueue a document to cloud tasks for export to BigQuery. If the maximum number of attempts is reached, the failed export will be handled according to the `LOG_FAILED_EXPORTS` parameter.



**Cloud Functions:**

* **fsexportbigquery:** Listens for document changes in your specified Cloud Firestore collection, then exports the changes into BigQuery.

* **fsimportexistingdocs:** Imports existing documents from the specified collection into BigQuery. Imported documents will have a special changelog with the operation of `IMPORT` and the timestamp of epoch.

* **syncBigQuery:** A task-triggered function that is called to sync data to BigQuery

* **initBigQuerySync:** Runs configuration for syncing with BigQuery
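The `MAX_ENQUEUE_ATTEMPTS` parameter described above bounds how often the fallback path retries. A sketch of that bounded loop, under assumed names (enqueueWithRetries is illustrative, not the extension's actual function):

import { getFunctions } from "firebase-admin/functions";

// Illustrative bounded-retry enqueue; maxAttempts comes from
// MAX_ENQUEUE_ATTEMPTS (an integer between 1 and 10, default 3).
async function enqueueWithRetries(
  payload: Record<string, unknown>,
  maxAttempts: number
): Promise<void> {
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      await getFunctions().taskQueue("syncBigQuery").enqueue(payload);
      return;
    } catch (err) {
      if (attempt === maxAttempts) {
        // Out of attempts: log the failed export so data loss is visible.
        console.error("Failed to enqueue event for BigQuery export", err);
      }
    }
  }
}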
131 changes: 36 additions & 95 deletions firestore-bigquery-export/extension.yaml
@@ -13,7 +13,7 @@
# limitations under the License.

name: firestore-bigquery-export
version: 0.1.55
version: 0.1.56
specVersion: v1beta

displayName: Stream Firestore to BigQuery
@@ -48,6 +48,9 @@ roles:
- role: datastore.user
reason: Allows the extension to write updates to the database.

# - role: storage.objectAdmin
# reason: Allows the extension to create objects in the storage bucket.

resources:
- name: fsexportbigquery
type: firebaseextensions.v1beta.function
@@ -60,19 +63,6 @@
eventType: providers/cloud.firestore/eventTypes/document.write
resource: projects/${param:PROJECT_ID}/databases/(default)/documents/${param:COLLECTION_PATH}/{documentId}

- name: fsimportexistingdocs
type: firebaseextensions.v1beta.function
description:
Imports existing documents from the specified collection into BigQuery.
Imported documents will have a special changelog with the operation of
`IMPORT` and the timestamp of epoch.
properties:
runtime: nodejs18
taskQueueTrigger:
retryConfig:
maxAttempts: 15
minBackoffSeconds: 60

- name: syncBigQuery
type: firebaseextensions.v1beta.function
description: >-
@@ -206,19 +196,6 @@ params:
default: posts
required: true

- param: LOG_FAILED_EXPORTS
label: Enable logging failed exports
description: >-
If enabled, the extension will log event exports that failed to enqueue to
Cloud Logging, to mitigate data loss.
type: select
options:
- label: Yes
value: yes
- label: No
value: no
required: true

- param: WILDCARD_IDS
label: Enable Wildcard Column field with Parent Firestore Document IDs
description: >-
@@ -409,74 +386,6 @@ params:
- label: No
value: no

# - param: DO_BACKFILL
# label: Import existing Firestore documents into BigQuery?
# description: >-
# Do you want to import existing documents from your Firestore collection
# into BigQuery? These documents will have each have a special changelog
# with the operation of `IMPORT` and the timestamp of epoch. This ensures
# that any operation on an imported document supersedes the import record.
# type: select
# required: true
# default: no
# options:
# - label: Yes
# value: yes
# - label: No
# value: no

# - param: IMPORT_COLLECTION_PATH
# label: Existing Documents Collection
# description: >-
# Specify the path of the Cloud Firestore Collection you would like to
# import from. This may or may not be the same Collection for which you plan
# to mirror changes. If you want to use a collectionGroup query, provide the
# collection name value here, and set 'Use Collection Group query' to true.
# You may use `{wildcard}` notation with an enabled collectionGroup query to
# match a subcollection of all documents in a collection (e.g.,
# `chatrooms/{chatid}/posts`).
# type: string
# validationRegex: "^[^/]+(/[^/]+/[^/]+)*$"
# validationErrorMessage:
# Firestore collection paths must be an odd number of segments separated by
# slashes, e.g. "path/to/collection".
# example: posts
# required: false

- param: USE_COLLECTION_GROUP_QUERY
label: Use Collection Group query
description: >-
Do you want to use a [collection
group](https://firebase.google.com/docs/firestore/query-data/queries#collection-group-query)
query for importing existing documents? You have to enable collectionGroup
query if your import path contains subcollections. Warning: A
collectionGroup query will target every collection in your Firestore
project that matches the 'Existing documents collection'. For example, if
you have 10,000 documents with a subcollection named: landmarks, this will
query every document in 10,000 landmarks collections.
type: select
default: no
options:
- label: Yes
value: yes
- label: No
value: no

# - param: DOCS_PER_BACKFILL
# label: Docs per backfill
# description: >-
# When importing existing documents, how many should be imported at once?
# The default value of 200 should be ok for most users. If you are using a
# transform function or have very large documents, you may need to set this
# to a lower number. If the lifecycle event function times out, lower this
# value.
# type: string
# example: 200
# validationRegex: "^[1-9][0-9]*$"
# validationErrorMessage: Must be a positive integer.
# default: 200
# required: true

- param: KMS_KEY_NAME
label: Cloud KMS key name
description: >-
@@ -491,6 +400,38 @@
'projects/PROJECT_NAME/locations/KEY_RING_LOCATION/keyRings/KEY_RING_ID/cryptoKeys/KEY_ID'.
required: false

- param: MAX_ENQUEUE_ATTEMPTS
label: Maximum number of enqueue attempts
description: >-
This parameter will set the maximum number of attempts to enqueue a
document to cloud tasks for export to BigQuery. If the maximum number of
attempts is reached, the failed export will be handled according to the
`LOG_FAILED_EXPORTS` parameter.
type: string
validationRegex: ^(10|[1-9])$
validationErrorMessage: Please select an integer between 1 and 10
default: 3

# - param: BACKUP_TO_GCS
# label: Backup to GCS
# description: >-
# If enabled, failed BigQuery updates will be written to a GCS bucket.
# type: select
# options:
# - label: Yes
# value: yes
# - label: No
# value: no
# default: no
# required: true

# - param: BACKUP_GCS_BUCKET
# label: Backup GCS Bucket Name
# description: >-
# This (optional) parameter will allow you to specify a GCS bucket for which
# failed BigQuery updates will be written to, if this feature is enabled.
# type: string

events:
- type: firebase.extensions.firestore-counter.v1.onStart
description:
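The `validationRegex` for `MAX_ENQUEUE_ATTEMPTS`, `^(10|[1-9])$`, accepts exactly the strings "1" through "10": a single non-zero digit, or the literal "10". A quick illustrative check:

const maxAttemptsPattern = /^(10|[1-9])$/;

console.log(["1", "9", "10"].every((s) => maxAttemptsPattern.test(s))); // true
console.log(["0", "11", "07", ""].some((s) => maxAttemptsPattern.test(s))); // false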
@@ -2,7 +2,10 @@

exports[`extension config config loaded from environment variables 1`] = `
Object {
"backupBucketName": "undefined.appspot.com",
"backupCollectionId": undefined,
"backupDir": "_firestore-bigquery-export",
"backupToGCS": false,
"bqProjectId": undefined,
"clustering": Array [
"data",
@@ -12,23 +15,20 @@ Object {
"databaseId": "(default)",
"datasetId": "my_dataset",
"datasetLocation": undefined,
"doBackfill": false,
"docsPerBackfill": 200,
"excludeOldData": false,
"importCollectionPath": undefined,
"initialized": false,
"instanceId": undefined,
"kmsKeyName": "test",
"location": "us-central1",
"logFailedExportData": false,
"maxDispatchesPerSecond": 10,
"maxEnqueueAttempts": 3,
"tableId": "my_table",
"timePartitioning": null,
"timePartitioningField": undefined,
"timePartitioningFieldType": undefined,
"timePartitioningFirestoreField": undefined,
"transformFunction": "",
"useCollectionGroupQuery": false,
"useNewSnapshotQuerySyntax": false,
"wildcardIds": false,
}
8 changes: 4 additions & 4 deletions firestore-bigquery-export/functions/__tests__/e2e.test.ts
@@ -2,9 +2,9 @@ import * as admin from "firebase-admin";
import { BigQuery } from "@google-cloud/bigquery";

/** Set defaults */
const bqProjectId = "dev-extensions-testing";
const datasetId = "firestore_export";
const tableId = "bq_e2e_test_raw_changelog";
const bqProjectId = process.env.BQ_PROJECT_ID || "dev-extensions-testing";
const datasetId = process.env.DATASET_ID || "firestore_export";
const tableId = process.env.TABLE_ID || "bq_e2e_test_raw_changelog";

/** Init resources */
admin.initializeApp({ projectId: bqProjectId });
@@ -34,7 +34,7 @@ describe("e2e", () => {

/** Get the latest record from this table */
const [changeLogQuery] = await bq.createQueryJob({
query: `SELECT * FROM \`${bqProjectId}.${datasetId}.${tableId}\` ORDER BY timestamp DESC \ LIMIT 1`,
query: `SELECT * FROM \`${bqProjectId}.${datasetId}.${tableId}\` ORDER BY timestamp DESC LIMIT 1`,
});

const [rows] = await changeLogQuery.getQueryResults();
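Beyond making the project, dataset, and table IDs overridable via environment variables, the change also removes a stray backslash that the old template literal injected into the SQL. For value-level inputs, the BigQuery client additionally supports named query parameters; a sketch (table identifiers themselves cannot be parameterized):

import { BigQuery } from "@google-cloud/bigquery";

const bq = new BigQuery({ projectId: "dev-extensions-testing" });

async function latestChangelogRow() {
  const [job] = await bq.createQueryJob({
    query:
      "SELECT * FROM `dev-extensions-testing.firestore_export.bq_e2e_test_raw_changelog` " +
      "ORDER BY timestamp DESC LIMIT @limit",
    params: { limit: 1 }, // named parameter, bound server-side
  });
  const [rows] = await job.getQueryResults();
  return rows[0];
}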
@@ -37,6 +37,7 @@ jest.mock("firebase-admin/functions", () => ({
}));

jest.mock("../src/logs", () => ({
...jest.requireActual("../src/logs"),
start: jest.fn(() =>
logger.log("Started execution of extension with configuration", config)
),
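Spreading `jest.requireActual` turns this into a partial mock: every export of `../src/logs` keeps its real implementation except the ones explicitly overridden. Without the spread, any log helper the code under test calls but the mock does not define would be `undefined`. The general pattern:

// Partial module mock: real exports by default, targeted overrides on top.
jest.mock("../src/logs", () => ({
  ...jest.requireActual("../src/logs"),
  start: jest.fn(), // observe calls to start() without losing the other helpers
}));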
8 changes: 4 additions & 4 deletions firestore-bigquery-export/functions/package-lock.json


16 changes: 8 additions & 8 deletions firestore-bigquery-export/functions/package.json
@@ -13,31 +13,31 @@
"author": "Jan Wyszynski <wyszynski@google.com>",
"license": "Apache-2.0",
"dependencies": {
"@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.37",
"@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.38",
"@google-cloud/bigquery": "^7.6.0",
"@types/chai": "^4.1.6",
"@types/express-serve-static-core": "4.17.30",
"@types/jest": "29.5.0",
"@types/node": "^20.4.4",
"chai": "^4.2.0",
"firebase-admin": "^12.0.0",
"firebase-functions": "^4.9.0",
"firebase-functions-test": "^0.3.3",
"generate-schema": "^2.6.0",
"inquirer": "^6.4.0",
"jest": "29.5.0",
"jest-config": "29.5.0",
"lodash": "^4.17.14",
"nyc": "^14.0.0",
"rimraf": "^2.6.3",
"sql-formatter": "^2.3.3",
"ts-jest": "29.1.2",
"ts-node": "^9.0.0",
"typescript": "^4.8.4",
"@types/jest": "29.5.0",
"jest": "29.5.0",
"jest-config": "29.5.0",
"ts-jest": "29.1.2"
"typescript": "^4.8.4"
},
"private": true,
"devDependencies": {
"mocked-env": "^1.3.2",
"faker": "^5.1.0"
"faker": "^5.1.0",
"mocked-env": "^1.3.2"
}
}
12 changes: 8 additions & 4 deletions firestore-bigquery-export/functions/src/config.ts
@@ -32,13 +32,10 @@ export function clustering(clusters: string | undefined) {
}

export default {
logFailedExportData: process.env.LOG_FAILED_EXPORTS === "yes",
bqProjectId: process.env.BIGQUERY_PROJECT_ID,
databaseId: "(default)",
collectionPath: process.env.COLLECTION_PATH,
datasetId: process.env.DATASET_ID,
doBackfill: process.env.DO_BACKFILL === "yes",
docsPerBackfill: parseInt(process.env.DOCS_PER_BACKFILL) || 200,
tableId: process.env.TABLE_ID,
location: process.env.LOCATION,
initialized: false,
@@ -63,5 +60,12 @@
process.env.MAX_DISPATCHES_PER_SECOND || "10"
),
kmsKeyName: process.env.KMS_KEY_NAME,
useCollectionGroupQuery: process.env.USE_COLLECTION_GROUP_QUERY === "yes",
maxEnqueueAttempts: isNaN(parseInt(process.env.MAX_ENQUEUE_ATTEMPTS))
? 3
: parseInt(process.env.MAX_ENQUEUE_ATTEMPTS),
// backup bucket defaults to default firebase cloud storage bucket
backupToGCS: process.env.BACKUP_TO_GCS === "yes" ? true : false,
backupBucketName:
process.env.BACKUP_GCS_BUCKET || `${process.env.PROJECT_ID}.appspot.com`,
backupDir: `_${process.env.INSTANCE_ID || "firestore-bigquery-export"}`,
};
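The new config entries fail safe: `maxEnqueueAttempts` falls back to 3 whenever the environment variable is missing or not a number, and the backup bucket defaults to the project's default Cloud Storage bucket. A small standalone check of the parsing behavior (illustrative only, not part of the extension):

// parseInt("") yields NaN, so an unset or malformed
// MAX_ENQUEUE_ATTEMPTS falls back to the default of 3.
function parseMaxEnqueueAttempts(raw: string | undefined): number {
  const parsed = parseInt(raw ?? "");
  return Number.isNaN(parsed) ? 3 : parsed;
}

console.log(parseMaxEnqueueAttempts(undefined)); // 3
console.log(parseMaxEnqueueAttempts("7"));       // 7
console.log(parseMaxEnqueueAttempts("abc"));     // 3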