forked from GoogleCloudPlatform/professional-services
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add bigquery table access pattern analysis (GoogleCloudPlatform…
…#676) * feat: add bigquery table access pattern analysis * fix: README formatting * fix: dummy table example in README * feat: add Apache license headers * feat: add apache license to JS files * feat: add blank dir and file for template * fix: definition of a pipeline Co-authored-by: Agnes Natasya <agnesnatasya@google.com>
- Loading branch information
1 parent
def3751
commit c945f2a
Showing
20 changed files
with
1,842 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
419 changes: 419 additions & 0 deletions
419
examples/bigquery-table-access-pattern-analysis/README.md
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file added
BIN
+78.9 KB
examples/bigquery-table-access-pattern-analysis/assets/architecture.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+112 KB
examples/bigquery-table-access-pattern-analysis/assets/pipeline-definition.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+13.7 MB
examples/bigquery-table-access-pattern-analysis/assets/pipeline-example.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+199 KB
examples/bigquery-table-access-pattern-analysis/assets/subquery-definition.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+173 KB
examples/bigquery-table-access-pattern-analysis/assets/subquery-standardised.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
130 changes: 130 additions & 0 deletions
130
examples/bigquery-table-access-pattern-analysis/bq_routines/getPipelineTypeAndSchedule.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
// Copyright 2021 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
Array.prototype.median = function () { | ||
return this.slice().sort((a, b) => a - b)[Math.floor(this.length / 2)]; | ||
}; | ||
|
||
var toHoursUnit = function(time) { | ||
return time/1000/60/60; | ||
} | ||
|
||
var toDayUnit = function(time) { | ||
return time/1000/60/60/24; | ||
} | ||
|
||
function getPipelineTypeAndSchedule(runHistory) { | ||
if (runHistory.length < 3) { | ||
return { | ||
pipelineType: 'AD HOC', | ||
schedule: 'NON DETERMINISTICALLY' | ||
}; | ||
} | ||
|
||
// Sort the array | ||
runHistory.sort((a,b) => new Date(a).getTime() - new Date(b).getTime()); | ||
|
||
// Get the time differences between the schedule | ||
var timeDifferences = runHistory.map(function(time, index) { | ||
if (time != runHistory[0]) { | ||
return (time - runHistory[index -1]); | ||
} | ||
}) | ||
.filter(x => x); | ||
|
||
// Get the time differences between the schedule in hours | ||
var timeDifferencesInHour = timeDifferences.map(timeDifference => toHoursUnit(timeDifference)); | ||
|
||
// Set the schedule if it is more than serveral times every hour | ||
var schedule = null; | ||
var timeDifferencesInHoursMedian = timeDifferencesInHour.median(); | ||
if (timeDifferencesInHoursMedian <= 0.5) { | ||
var elementsFollowingPattern = timeDifferencesInHour.filter(x => x <= 0.5); | ||
if (elementsFollowingPattern.length > 0.5 * timeDifferencesInHour.length && elementsFollowingPattern.length > 20) { | ||
schedule = 'SEVERAL TIMES EVERY HOUR'; | ||
} | ||
} | ||
|
||
// Set the schedule if it is hourly | ||
if (timeDifferencesInHoursMedian > 0.5 && timeDifferencesInHoursMedian <= 1.5) { | ||
var elementsFollowingPattern = timeDifferencesInHour.filter(x => x > 0.5 && x <= 1.5) | ||
if (elementsFollowingPattern.length > 0.5 * timeDifferencesInHour.length && elementsFollowingPattern.length > 3) { | ||
schedule = 'HOURLY'; | ||
} | ||
} | ||
|
||
// Set the schedule if it is daily | ||
if (timeDifferencesInHoursMedian >= 23 && timeDifferencesInHoursMedian <= 25) { | ||
var elementsFollowingPattern = timeDifferencesInHour.filter(x => x <= 23 && x >= 25) | ||
if (elementsFollowingPattern.length > 0.5 * timeDifferencesInHour.length && elementsFollowingPattern.length > 3) { | ||
schedule = 'DAILY'; | ||
} | ||
} | ||
|
||
// Set the schedule if it is monthly | ||
var timeDifferencesInDays = timeDifferences.map(timeDifference => timeDifference/1000/60/60/24); | ||
var timeDifferencesInDaysMedian = timeDifferencesInDays.median(); | ||
if (timeDifferencesInDaysMedian >= 27 && timeDifferencesInDaysMedian <= 32) { | ||
var elementsFollowingPattern = timeDifferencesInDays.filter(x => x >=27 && x <= 32); | ||
if (elementsFollowingPattern.length > 0.5 * timeDifferencesInDays.length && elementsFollowingPattern.length > 2) { | ||
schedule = 'MONTHLY'; | ||
} | ||
} | ||
|
||
if (schedule == null) { | ||
return { | ||
pipelineType: 'AD HOC', | ||
schedule: 'NON DETERMINISTICALLY' | ||
}; | ||
} | ||
|
||
var pipelineType = null; | ||
var timeGapFromNowToLastRun = new Date() - runHistory[runHistory.length - 1]; | ||
if (schedule == 'SEVERAL TIMES EVERY HOUR') { | ||
if (toHoursUnit(timeGapFromNowToLastRun) > 1.5) { | ||
pipelineType = 'DEAD'; | ||
} else { | ||
pipelineType = 'LIVE'; | ||
} | ||
} | ||
|
||
if (schedule == 'HOURLY') { | ||
if (toHoursUnit(timeGapFromNowToLastRun) > 3) { | ||
pipelineType = 'DEAD'; | ||
} else { | ||
pipelineType = 'LIVE'; | ||
} | ||
} | ||
|
||
if (schedule == 'DAILY') { | ||
if (toDayUnit(timeGapFromNowToLastRun) > 3) { | ||
pipelineType = 'DEAD'; | ||
} else { | ||
pipelineType = 'LIVE'; | ||
} | ||
} | ||
|
||
if (schedule == 'MONTHLY') { | ||
if (toDayUnit(timeGapFromNowToLastRun) > 50) { | ||
pipelineType = 'DEAD'; | ||
} else { | ||
pipelineType = 'LIVE'; | ||
} | ||
} | ||
|
||
return { | ||
pipelineType: pipelineType, | ||
schedule: schedule | ||
} | ||
} |
53 changes: 53 additions & 0 deletions
53
.../bigquery-table-access-pattern-analysis/bq_routines/getTablesInvolvedInPipelineOfTable.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
// Copyright 2021 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
function getDirectDestinations(table, allSourceDestTablePairs) { | ||
return allSourceDestTablePairs.filter(sourceDestTablePair => sourceDestTablePair.sourceTable == table).map(sourceDestTablePair => sourceDestTablePair.destinationTable).filter(x => x); | ||
} | ||
|
||
function getDirectSources(table, allSourceDestTablePairs) { | ||
return allSourceDestTablePairs.filter(sourceDestTablePair => sourceDestTablePair.destinationTable == table).map(sourceDestTablePair => sourceDestTablePair.sourceTable).filter(x => x); | ||
} | ||
|
||
function getTablesInvolvedInPipelineOfTable(table, allSourceDestTablePairs) { | ||
var tablesInvolved = [table]; | ||
var visitedDestinations = []; | ||
var tablesToBeExplored = [table]; | ||
while (tablesToBeExplored.length != 0) { | ||
var currentTable = tablesToBeExplored.pop(); | ||
var directDestinations = getDirectDestinations(currentTable, allSourceDestTablePairs); | ||
visitedDestinations.push(currentTable); | ||
for (const directDestination of directDestinations) { | ||
if (!visitedDestinations.includes(directDestination) && directDestination != currentTable) { | ||
tablesInvolved.push(directDestination); | ||
tablesToBeExplored.push(directDestination); | ||
} | ||
} | ||
} | ||
|
||
var visitedSources = []; | ||
tablesToBeExplored = [table]; | ||
while (tablesToBeExplored.length != 0) { | ||
var currentTable = tablesToBeExplored.pop(); | ||
var directSources = getDirectSources(currentTable, allSourceDestTablePairs); | ||
visitedSources.push(currentTable); | ||
for (const directSource of directSources) { | ||
if (!visitedSources.includes(directSource) && directSource != currentTable) { | ||
tablesInvolved.push(directSource); | ||
tablesToBeExplored.push(directSource); | ||
} | ||
} | ||
} | ||
return tablesInvolved.filter(x => x); | ||
} |
193 changes: 193 additions & 0 deletions
193
examples/bigquery-table-access-pattern-analysis/pipeline-output_only.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"### License\n", | ||
"Copyright 2021 Google LLC\n", | ||
"\n", | ||
"Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n", | ||
"\n", | ||
" http://www.apache.org/licenses/LICENSE-2.0\n", | ||
"\n", | ||
"Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." | ||
], | ||
"metadata": {} | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"# Analysis Result\n", | ||
"\n", | ||
"### Get the tables with highest discrepancy on write vs read frequency throughout the data warehouse\n", | ||
"This will list down tables with the highest discrepancy on write vs read frequency." | ||
], | ||
"metadata": {} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"source": [ | ||
"import src.pipeline_analysis as pipeline_analysis\n", | ||
"import ipywidgets as widgets\n", | ||
"from IPython.display import display\n", | ||
"import pandas as pd\n", | ||
"\n", | ||
"limited_imbalance_tables = []\n", | ||
"def get_limited_imbalance_tables_df(limit):\n", | ||
" global limited_imbalance_tables\n", | ||
" limited_imbalance_tables_df = pipeline_analysis.get_tables_read_write_frequency_df(limit)\n", | ||
" limited_imbalance_tables = limited_imbalance_tables_df['Table'].tolist()\n", | ||
" return limited_imbalance_tables_df\n", | ||
"\n", | ||
"limited_imbalance_tables_df = get_limited_imbalance_tables_df(5)\n", | ||
"display(limited_imbalance_tables_df)" | ||
], | ||
"outputs": [ | ||
{ | ||
"output_type": "display_data", | ||
"data": { | ||
"text/plain": [ | ||
" Table Read Frequency \\\n", | ||
"0 data-analytics-pocs.atos.temp_updates 0 \n", | ||
"1 data-analytics-pocs.atos.max_timestamp_temp 0 \n", | ||
"2 data-analytics-pocs.atos.identity_4m_new 0 \n", | ||
"3 data-analytics-pocs._c4108c8641cb849fa23d12f19... 0 \n", | ||
"4 data-analytics-pocs.public.bigquery_audit_log 0 \n", | ||
"\n", | ||
" Write Frequency \n", | ||
"0 12613 \n", | ||
"1 8131 \n", | ||
"2 7950 \n", | ||
"3 2057 \n", | ||
"4 1698 " | ||
], | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>Table</th>\n", | ||
" <th>Read Frequency</th>\n", | ||
" <th>Write Frequency</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>data-analytics-pocs.atos.temp_updates</td>\n", | ||
" <td>0</td>\n", | ||
" <td>12613</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>data-analytics-pocs.atos.max_timestamp_temp</td>\n", | ||
" <td>0</td>\n", | ||
" <td>8131</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>data-analytics-pocs.atos.identity_4m_new</td>\n", | ||
" <td>0</td>\n", | ||
" <td>7950</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>data-analytics-pocs._c4108c8641cb849fa23d12f19...</td>\n", | ||
" <td>0</td>\n", | ||
" <td>2057</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>data-analytics-pocs.public.bigquery_audit_log</td>\n", | ||
" <td>0</td>\n", | ||
" <td>1698</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
] | ||
}, | ||
"metadata": {} | ||
} | ||
], | ||
"metadata": { | ||
"scrolled": true | ||
} | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"#### Get the pipeline graph data of the table\n", | ||
"This will generate a pipeline graph file, in HTML format, under `pipeline_graph` directory. It may take sometime for this to run and generate." | ||
], | ||
"metadata": {} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"source": [ | ||
"def visualise_table_pipelines(table):\n", | ||
" pipeline_analysis.display_pipelines_of_table(table)\n", | ||
"\n", | ||
"visualise_table_pipelines(\"data-analytics-pocs.public.bigquery_audit_log\")" | ||
], | ||
"outputs": [], | ||
"metadata": { | ||
"scrolled": true | ||
} | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"#### Display the pipeline graph of the table\n", | ||
"Display the pipeline graph of the table. The thickness of the edges indicates the frequency compared to the rest of the edges in the current graph.\n", | ||
"\n", | ||
"The result will be something like this, when you run the notebook, you will be able to click on the different nodes of the graph, each representing different tbales that are part of the pipeline of this table of interest. When you click on a node, it will display more information for this table.\n", | ||
"\n", | ||
"" | ||
], | ||
"metadata": {} | ||
} | ||
], | ||
"metadata": { | ||
"environment": { | ||
"name": "common-cpu.m76", | ||
"type": "gcloud", | ||
"uri": "gcr.io/deeplearning-platform-release/base-cpu:m76" | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.