forked from great-expectations/great_expectations
-
Notifications
You must be signed in to change notification settings - Fork 0
/
azure-pipelines-expectation-gallery.yml
283 lines (248 loc) · 12.7 KB
/
azure-pipelines-expectation-gallery.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
# This file is responsible for configuring the `expectation_gallery` pipeline (https://dev.azure.com/great-expectations/great_expectations/_build)
#
# The pipeline is run under the following conditions:
# - On the develop branch when a weekly release is being cut
# - On the develop branch as scheduled by the below cron job
# - When a run is queued manually
#
# The sole purpose of this pipeline is to build and publish the Expectation gallery. As such, it is designed to run quickly and frequently
# to ensure that the gallery is kept up to date with changes.
# Scheduled trigger: one run per day at minute 0 of hour 7, on `develop` only.
schedules:
- cron: "0 7 * * *"
  displayName: Scheduled Runs
  branches:
    include:
    - develop
  # With `always: false` the scheduled run only triggers if the state of the
  # codebase has changed since the last scheduled run.
  always: false
# Service containers for the database backends. The `build_gallery` job attaches
# them by name through its `services:` mapping. Port mappings and image tags are
# quoted so that no YAML parser can re-type them.
resources:
  containers:
  # PostgreSQL with trust authentication (no password) and a `test_ci` database.
  - container: postgres
    image: "postgres:11"
    ports:
    - "5432:5432"
    env:
      POSTGRES_DB: test_ci
      POSTGRES_HOST_AUTH_METHOD: trust
  # MySQL with an empty root password and a `test_ci` database.
  - container: mysql
    image: "mysql:8.0.20"
    ports:
    - "3306:3306"
    env:
      MYSQL_ALLOW_EMPTY_PASSWORD: "yes"
      MYSQL_DATABASE: test_ci
  # SQL Server 2019. The `sa` password below is repeated in the job's
  # 'Configure mssql' step, so the two must be kept in sync.
  - container: mssql
    image: "mcr.microsoft.com/mssql/server:2019-latest"
    env:
      ACCEPT_EULA: "Y"
      MSSQL_SA_PASSWORD: 'ReallyStrongPwd1234%^&*'
      MSSQL_DB: test_ci
      MSSQL_PID: Developer
    ports:
    - "1433:1433"
  # Trino; container port 8080 is published on port 8088.
  - container: trino
    image: "trinodb/trino:400"
    ports:
    - "8088:8080"
# Flags describing why this run was started. The `deploy_gallery_staging` stage
# runs when any one of them is true: a release tag build, a scheduled run on
# develop (see the cron job above), or a manually queued run.
variables:
# True when the build source is a tag (refs/tags/...).
- name: isRelease
  value: $[startsWith(variables['Build.SourceBranch'], 'refs/tags/')]
# True when the run was started by the schedule and the source branch is develop.
- name: isScheduled
  value: $[and(eq(variables['Build.SourceBranch'], 'refs/heads/develop'), eq(variables['Build.Reason'], 'Schedule'))]
# True when the run was queued by hand.
- name: isManual
  value: $[eq(variables['Build.Reason'], 'Manual')]
stages:
# Stage 1: runs the ChangedFiles task against three named groups of path globs.
- stage: scope_check
  pool:
    # NOTE(review): confirm this hosted image is still offered by Azure Pipelines.
    vmImage: ubuntu-20.04
  jobs:
  - job: changes
    steps:
    - task: ChangedFiles@1
      name: CheckChanges
      inputs:
        verbose: true
        # Each [Name] header opens a group of path globs. Presumably the task
        # exposes one result per group under the step name `CheckChanges` -
        # verify against the ChangedFiles task documentation. Nothing else in
        # this file reads those results.
        rules: |
          [ContribChanged]
          contrib/**
          [GEChanged]
          great_expectations/**/*.py
          pyproject.toml
          setup.cfg
          tests/**
          [ScriptsChanged]
          assets/scripts/**
# Stage 2: builds the Expectation gallery JSON, prints diagnostics from the build
# log, uploads the JSON files to S3 and refreshes the Algolia indexes.
- stage: deploy_gallery_staging
  dependsOn: scope_check
  # Runs only for scheduled, release-tag or manually queued builds.
  # NOTE(review): a custom condition replaces the implicit succeeded() check, so
  # this stage is not gated on the outcome of scope_check - confirm intended.
  condition: or(eq(variables.isScheduled, true), eq(variables.isRelease, true), eq(variables.isManual, true))
  pool:
    vmImage: ubuntu-20.04
  jobs:
  - job: build_gallery
    timeoutInMinutes: 420
    variables:
      python.version: '3.8'
      GE_pytest_opts: ''
    # Attach the service containers declared under `resources.containers`.
    services:
      postgres: postgres
      mysql: mysql
      mssql: mssql
      trino: trino
    steps:
    # ---- Python environment -------------------------------------------------
    - task: UsePythonVersion@0
      inputs:
        versionSpec: $(python.version)
      displayName: Use Python $(python.version)
    - bash: python -m pip install --upgrade pip==21.3.1
      displayName: Update pip
    - script: |
        pip install \
          "google-cloud-bigquery-storage" \
          "cryptography==38.0.4" \
          --requirement reqs/requirements-dev-all-contrib-expectations.txt \
          --editable "contrib/cli" \
          --editable ".[dev]" \
          --constraint constraints-dev.txt
      displayName: Install dependencies

    # ---- Database services --------------------------------------------------
    # Poll MySQL once per second until it answers a query. The wait is capped
    # at 300 attempts so that a container which never becomes healthy fails
    # this step quickly instead of blocking the job until its 420-minute
    # timeout.
    - script: |
        printf 'Waiting for MySQL database to accept connections'
        attempts=0
        until mysql --host=localhost --protocol=TCP --port=3306 --user=root --password='' --execute "SHOW DATABASES"; do
          attempts=$((attempts + 1))
          if [ "$attempts" -ge 300 ]; then
            printf '\nMySQL did not accept connections after %s attempts\n' "$attempts"
            exit 1
          fi
          printf '.'
          sleep 1;
        done;
      displayName: Wait for MySQL database to initialise
    # Strip ONLY_FULL_GROUP_BY from the server-wide sql_mode.
    - script: |
        echo "SET GLOBAL sql_mode=(SELECT REPLACE(@@sql_mode,'ONLY_FULL_GROUP_BY',''));" > mysql_setup_script.sql
        mysql --host=localhost --protocol=TCP --port=3306 --user=root --password='' --reconnect < mysql_setup_script.sql
      displayName: Configure mysql
    # The password must match MSSQL_SA_PASSWORD on the `mssql` container resource.
    - script: |
        sqlcmd -U sa -P "ReallyStrongPwd1234%^&*" -Q "CREATE DATABASE test_ci;" -o create_db_output.txt
      displayName: Configure mssql
    # Fixed 30-second pause; the active readiness probe is kept commented out.
    - script: |
        printf 'Waiting for Trino database to accept connections'
        sleep 30
        # until trino --execute "SHOW CATALOGS"; do
        #   printf '.'
        #   sleep 1;
        # done;
      displayName: Wait for Trino database to initialise

    # ---- Credentials --------------------------------------------------------
    # The downloaded file's path is read later as $(gcp_authkey.secureFilePath).
    - task: DownloadSecureFile@1
      name: gcp_authkey
      displayName: Download Google Service Account
      inputs:
        secureFile: superconductive-service-acct_ge-oss-ci-cd.json
        retryCount: '2'

    # ---- Gallery build ------------------------------------------------------
    # Pandas-only smoke build. Fails the step when quick_check.json lists fewer
    # than 297 Expectations or fewer than 6 distinct packages.
    - bash: |
        python ./build_gallery.py --backends "pandas" --outfile-name "quick_check.json"
        exp_count=$(grep "^ *\"expect_" quick_check.json | wc -l)
        package_summary=$(grep "\"package\":" quick_check.json | sort | uniq -c | sort -nr)
        touch gallery-tracebacks.txt
        cat gallery-tracebacks.txt
        echo -e "\n\n$exp_count Expectations in quick_check.json\n"
        echo -e "$package_summary"
        [[ $exp_count -lt 297 ]] && echo "Fewer than 297 Expectations" && exit 1
        [[ $(echo -e "$package_summary" | wc -l) -lt 6 ]] && echo "Fewer than 6 packages" && exit 1
        echo "ok"
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Build Gallery Quick Check
    # Full build. The combined stdout/stderr log is saved and then sliced into
    # the helper files that the "Show ..." steps below print.
    # NOTE(review): the step's exit status is that of the final `sort`, so a
    # failing build_gallery.py does not fail this step - confirm intended.
    - bash: |
        python ./build_gallery.py --outfile-name "expectation_library_v2--staging.json" 2>&1 | tee output--build_gallery.txt
        grep -o "Took .* seconds to .*" output--build_gallery.txt | sort -k2,2nr > testing-times.txt
        grep -o "ERROR - (.*" output--build_gallery.txt | sort > testing-error-messages.txt
        touch gallery-tracebacks.txt
        grep -o "Expectation type.*" output--build_gallery.txt | sort > gallery-exp-types.txt
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Build Gallery
      env:
        # snowflake credentials
        SNOWFLAKE_ACCOUNT: $(SNOWFLAKE_ACCOUNT)
        SNOWFLAKE_USER: $(SNOWFLAKE_USER)
        SNOWFLAKE_PW: $(SNOWFLAKE_PW)
        SNOWFLAKE_DATABASE: $(SNOWFLAKE_DATABASE)
        SNOWFLAKE_SCHEMA: $(SNOWFLAKE_SCHEMA)
        SNOWFLAKE_WAREHOUSE: $(SNOWFLAKE_WAREHOUSE)
        SNOWFLAKE_ROLE: $(SNOWFLAKE_ROLE)
        # redshift credentials
        REDSHIFT_USERNAME: $(REDSHIFT_USERNAME)
        REDSHIFT_PASSWORD: $(REDSHIFT_PASSWORD)
        REDSHIFT_HOST: $(REDSHIFT_HOST)
        REDSHIFT_PORT: $(REDSHIFT_PORT)
        REDSHIFT_DATABASE: $(REDSHIFT_DATABASE)
        REDSHIFT_SSLMODE: $(REDSHIFT_SSLMODE)
        # AWS credentials
        AWS_ACCESS_KEY_ID: $(AWS_ACCESS_KEY_ID)
        AWS_SECRET_ACCESS_KEY: $(AWS_SECRET_ACCESS_KEY)
        AWS_DEFAULT_REGION: $(AWS_DEFAULT_REGION)
        ATHENA_DB_NAME: $(ATHENA_DB_NAME)
        ATHENA_STAGING_S3: $(ATHENA_STAGING_S3)
        ATHENA_DATA_BUCKET: $(ATHENA_DATA_BUCKET)
        ATHENA_TEN_TRIPS_DB_NAME: $(ATHENA_TEN_TRIPS_DB_NAME)
        # GCP credentials
        GOOGLE_APPLICATION_CREDENTIALS: $(gcp_authkey.secureFilePath)
        GE_TEST_GCP_PROJECT: $(GE_TEST_GCP_PROJECT)
        GE_TEST_BIGQUERY_DATASET: $(GE_TEST_BIGQUERY_DATASET)
        # Azure credentials
        AZURE_CREDENTIAL: $(AZURE_CREDENTIAL)
        AZURE_ACCESS_KEY: $(AZURE_ACCESS_KEY)

    # ---- Diagnostics from the build log -------------------------------------
    # A bare `grep` exits non-zero when nothing matches, which fails its step;
    # only the steps with an `|| echo ...` fallback tolerate an empty result.
    - bash: grep -o "coverage_score:.*" output--build_gallery.txt | sort -k2,2nr
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show coverage scores
    - bash: |
        cut -d " " -f 3,4 gallery-exp-types.txt | uniq -c | sort -nr
        echo
        cut -d " " -f 3,4,6 gallery-exp-types.txt | sort
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show Expectation types and counts
    - bash: grep -o "Implemented engines.*" output--build_gallery.txt
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show implemented engines
    - bash: cat checklists.txt
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show full checklist summary
    # Keep the checklist headings and items, then drop the four item kinds
    # listed in the second grep.
    - bash: grep -E "(^expect|Completeness checklist|^ *[A-z]|^ *-|-----)" checklists.txt | grep -vE '(No validate_configuration|Using default validate_configuration|Has a full suite|Has passed a manual)'
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show checklist issues
    - bash: cat testing-error-messages.txt
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show testing errors
    - bash: grep --color -n -i warning -B 2 output--build_gallery.txt || echo "No warnings found"
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show testing warnings
    - bash: cat gallery-tracebacks.txt
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show gallery tracebacks
    - bash: grep "to df.to_sql" testing-times.txt || echo "No df.to_sql calls were made"
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show DataFrame to SQL times
    - bash: grep "to run" testing-times.txt
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show testing times grouped by backend and Expectation
    - bash: grep "to evaluate_json" testing-times.txt
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Show testing times for individual tests

    # ---- Publish ------------------------------------------------------------
    - bash: python ./build_package_gallery.py
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Build Package Gallery
    # Upload every *.json file in assets/scripts to the public bucket under
    # static/gallery/ with a public-read ACL.
    - task: S3Upload@1
      inputs:
        regionName: us-east-2
        awsCredentials: aws-ci-great-expectations
        bucketName: superconductive-public
        sourceFolder: $(Build.SourcesDirectory)/assets/scripts
        globExpressions: '*.json'
        targetFolder: static/gallery/
        filesAcl: public-read
    - task: NodeTool@0
      inputs:
        versionSpec: '16.16'
    - bash: bash ./trigger_algolia.sh
      workingDirectory: $(Build.SourcesDirectory)/assets/scripts/
      displayName: Update Algolia indexes from S3
      env:
        # algolia credentials
        ALGOLIA_ACCOUNT: $(ALGOLIA_ACCOUNT)
        ALGOLIA_EXPECTATION_INDEX: $(ALGOLIA_EXPECTATION_INDEX)
        ALGOLIA_PACKAGE_EXPEC_INDEX: $(ALGOLIA_PACKAGE_EXPEC_INDEX)
        ALGOLIA_PACKAGE_INDEX: $(ALGOLIA_PACKAGE_INDEX)
        ALGOLIA_WRITE_KEY: $(ALGOLIA_WRITE_KEY)
        # Build Gallery Paths
        ALGOLIA_S3_PACKAGES_URL: $(ALGOLIA_S3_PACKAGES_URL)
        ALGOLIA_S3_EXPECTATIONS_URL: $(ALGOLIA_S3_EXPECTATIONS_URL)
        # replica indices from expectations for sorting
        ALGOLIA_EXPEC_REPLICA_ALPHA_ASC_INDEX: $(ALGOLIA_EXPEC_REPLICA_ALPHA_ASC_INDEX)
        ALGOLIA_EXPEC_REPLICA_ALPHA_DSC_INDEX: $(ALGOLIA_EXPEC_REPLICA_ALPHA_DSC_INDEX)
        ALGOLIA_EXPEC_REPLICA_COVERAGE_ASC_INDEX: $(ALGOLIA_EXPEC_REPLICA_COVERAGE_ASC_INDEX)
        ALGOLIA_EXPEC_REPLICA_COVERAGE_DSC_INDEX: $(ALGOLIA_EXPEC_REPLICA_COVERAGE_DSC_INDEX)
        ALGOLIA_EXPEC_REPLICA_CREATED_ASC_INDEX: $(ALGOLIA_EXPEC_REPLICA_CREATED_ASC_INDEX)
        ALGOLIA_EXPEC_REPLICA_CREATED_DSC_INDEX: $(ALGOLIA_EXPEC_REPLICA_CREATED_DSC_INDEX)
        ALGOLIA_EXPEC_REPLICA_UPDATED_ASC_INDEX: $(ALGOLIA_EXPEC_REPLICA_UPDATED_ASC_INDEX)
        ALGOLIA_EXPEC_REPLICA_UPDATED_DSC_INDEX: $(ALGOLIA_EXPEC_REPLICA_UPDATED_DSC_INDEX)
        # replica indices from package expectations for sorting
        ALGOLIA_PACK_EXPEC_REPLICA_ALPHA_ASC_INDEX: $(ALGOLIA_PACK_EXPEC_REPLICA_ALPHA_ASC_INDEX)
        ALGOLIA_PACK_EXPEC_REPLICA_ALPHA_DSC_INDEX: $(ALGOLIA_PACK_EXPEC_REPLICA_ALPHA_DSC_INDEX)
        ALGOLIA_PACK_EXPEC_REPLICA_COVERAGE_ASC_INDEX: $(ALGOLIA_PACK_EXPEC_REPLICA_COVERAGE_ASC_INDEX)
        ALGOLIA_PACK_EXPEC_REPLICA_COVERAGE_DSC_INDEX: $(ALGOLIA_PACK_EXPEC_REPLICA_COVERAGE_DSC_INDEX)
    # Disabled step, kept for reference:
    # - bash: |
    #     echo "About to trigger webhook: $GALLERY_BUILD_HOOK"
    #     curl -X POST -d {} $GALLERY_BUILD_HOOK
    #   displayName: 'Trigger gallery build'
    #   env:
    #     GALLERY_BUILD_HOOK: $(gallerywebhook)