forked from GoogleCloudPlatform/professional-services
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into jakePR185
- Loading branch information
Showing
41 changed files
with
74,310 additions
and
90 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
99 changes: 99 additions & 0 deletions
99
examples/dataflow-data-generator/data-generator-pipeline/data_generator/ParquetUtil.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
# Copyright 2018 Google Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import pyarrow as pa | ||
import logging | ||
import datetime | ||
from TimeUtil import datetime_to_epoch_timestamp, date_to_epoch_date, \ | ||
time_to_epoch_time | ||
|
||
|
||
def get_pyarrow_translated_schema(string_schema):
    """
    Converts string schema dict to pyarrow schema for writing to parquet.

    :param string_schema: iterable of field dicts. Each dict must provide a
        'name' and a 'type' key; the type is looked up in the conversion
        table below. Any other keys (such as 'mode') are ignored.
    :return: pyarrow schema with one field per entry of string_schema, in
        the same order.
    :raises ValueError: if a field's type maps to None in the conversion
        table or is not present in the table at all.
    """
    # A value of None marks a type that cannot be written to parquet by this
    # module. Each key is listed exactly once.
    type_conversions = {
        'STRING': pa.string(),
        'NUMERIC': pa.int64(),
        'BYTE': None,
        'INTEGER': pa.int64(),
        'FLOAT': pa.float64(),
        'BOOLEAN': pa.bool_(),
        'TIMESTAMP': pa.timestamp('us'),
        'DATE': pa.date32(),
        'TIME': pa.time64('us'),
        'DATETIME': pa.timestamp('us'),
        'GEOGRAPHY': None,
        'RECORD': None
    }
    pa_schema_list = []
    for field in string_schema:
        field_type = field['type']
        field_name = field['name']
        # .get() routes types missing from the table through the same
        # descriptive ValueError as the explicitly unsupported ones, instead
        # of surfacing a bare KeyError.
        converted_field_type = type_conversions.get(field_type)
        if converted_field_type is None:
            error_message = 'Error: json schema included a {0:s} field. ' \
                            'BYTE, GEOGRAPHY, and RECORD types cannot ' \
                            'currently be used when outputting to ' \
                            'parquet.'.format(field_type)
            logging.error(error_message)
            raise ValueError(error_message)

        # NOTE(review): the field's 'mode' is deliberately not translated
        # into pa.field's nullable argument, so REQUIRED fields get
        # pyarrow's default nullability. Confirm before enabling it.
        pa_schema_list.append(
            pa.field(
                name=field_name,
                type=converted_field_type
            )
        )

    return pa.schema(pa_schema_list)
|
||
|
||
def fix_record_for_parquet(record, schema):
    """
    Converts TIMESTAMP, DATETIME, DATE, and TIME types to their respective
    types for parquet compatibility.

    TIMESTAMP and DATETIME values are replaced by the int returned from
    datetime_to_epoch_timestamp, DATE values by the int returned from
    date_to_epoch_date, and TIME strings by datetime.time objects. Values
    that are None, and TIME values that already are datetime.time objects,
    are left untouched. The record is modified in place.

    :param record: record of data from beam pipeline
    :param schema: string schema dict.
    :return: single-element list holding the record with converted
        TIMESTAMP, DATETIME, DATE, and/or TIME fields.
    """
    temporal_types = ("TIMESTAMP", "DATETIME", "DATE", "TIME")
    for field in schema:
        field_name = field["name"]
        field_type = field["type"]
        if field_type not in temporal_types:
            continue

        value = record[field_name]
        if value is None:
            # Nothing to convert for a null value; passing None to the
            # converters below would raise TypeError.
            continue

        if field_type in ("TIMESTAMP", "DATETIME"):
            record[field_name] = int(datetime_to_epoch_timestamp(value))
        elif field_type == "DATE":
            record[field_name] = int(date_to_epoch_date(value))
        elif not isinstance(value, datetime.time):
            # TIME strings may or may not carry fractional seconds, so try
            # the whole-second layout first and fall back to the fractional
            # one.
            try:
                record[field_name] = datetime.datetime.strptime(
                    value,
                    '%H:%M:%S'
                ).time()
            except ValueError:
                record[field_name] = datetime.datetime.strptime(
                    value,
                    '%H:%M:%S.%f'
                ).time()

    return [record]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
97 changes: 97 additions & 0 deletions
97
examples/dataflow-data-generator/data-generator-pipeline/data_generator/TimeUtil.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
# Copyright 2018 Google Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import datetime | ||
|
||
|
||
def datetime_to_epoch_timestamp(timestamp, micros=True):
    """
    This is a convenience function for converting datetime objects to
    timestamps in either milliseconds or microseconds since the Unix
    Epoch.

    Args:
        timestamp: (datetime.datetime or str) value to be converted. Strings
            are parsed as '%Y-%m-%dT%H:%M:%S', with an optional '.%f'
            fractional part. Datetimes must be naive, because they are
            subtracted from a naive epoch.
        micros: (bool) should we use microsecond precision. Defaults to
            True; pass False for millisecond precision.

    Returns:
        Integer count of microseconds (or milliseconds) since
        1970-01-01T00:00:00. Sub-millisecond remainders are truncated
        toward zero when micros is False.

    Raises:
        ValueError: if a string timestamp matches neither accepted format.
    """
    _UNIX_EPOCH = datetime.datetime(1970, 1, 1)
    _SECONDS_PER_DAY = 24 * 60 * 60
    _MICROSECONDS_PER_MILLISECOND = 10 ** 3
    _MICROSECONDS_PER_SECOND = 10 ** 6
    # (str, unicode) on Python 2 and (str, str) on Python 3, so text input
    # is parsed on either interpreter without naming `unicode` directly.
    _STRING_TYPES = (str, type(u''))

    if isinstance(timestamp, _STRING_TYPES):
        try:
            timestamp = datetime.datetime.strptime(timestamp,
                                                   '%Y-%m-%dT%H:%M:%S')
        except ValueError:
            timestamp = datetime.datetime.strptime(timestamp,
                                                   '%Y-%m-%dT%H:%M:%S.%f')

    delta = timestamp - _UNIX_EPOCH

    # Work in integers throughout: going through float total_seconds() and
    # truncating afterwards can drop a unit (1.001 * 1000 evaluates to
    # 1000.9999999999999).
    micros_since_epoch = (
        (delta.days * _SECONDS_PER_DAY + delta.seconds)
        * _MICROSECONDS_PER_SECOND
        + delta.microseconds
    )

    if micros:
        return micros_since_epoch

    # Truncate toward zero so pre-epoch values keep the rounding direction
    # of an int() conversion rather than flooring.
    millis = abs(micros_since_epoch) // _MICROSECONDS_PER_MILLISECOND
    return millis if micros_since_epoch >= 0 else -millis
|
||
|
||
def date_to_epoch_date(date):
    """
    This is a convenience function for converting dates to the number of
    whole days since the Unix Epoch.

    Args:
        date: (datetime.date, datetime.datetime or str) value to be
            converted. Strings are parsed as '%Y-%m-%d'. For datetimes only
            the calendar date is used.

    Returns:
        Integer number of days between 1970-01-01 and the given date
        (negative for earlier dates).

    Raises:
        ValueError: if a string date does not match '%Y-%m-%d'.
    """
    _UNIX_EPOCH = datetime.date(1970, 1, 1)
    # (str, unicode) on Python 2 and (str, str) on Python 3, so text input
    # is parsed on either interpreter without naming `unicode` directly.
    _STRING_TYPES = (str, type(u''))

    if isinstance(date, _STRING_TYPES):
        date = datetime.datetime.strptime(date, '%Y-%m-%d')

    # datetime.datetime subclasses datetime.date but the two cannot be
    # subtracted from each other, so reduce datetimes to their date part and
    # do the arithmetic on plain dates.
    if isinstance(date, datetime.datetime):
        date = date.date()

    days_since_epoch = (date - _UNIX_EPOCH).days

    return int(days_since_epoch)
|
||
|
||
def time_to_epoch_time(time, micros=True):
    """
    This is a convenience function for converting times of day to the
    number of milliseconds or microseconds elapsed since midnight.

    Args:
        time: (datetime.time or str) value to be converted. Strings are
            parsed as '%H:%M:%S', with an optional '.%f' fractional part.
        micros: (bool) should we use microsecond precision. Defaults to
            True; pass False for millisecond precision.

    Returns:
        Integer count of microseconds (or milliseconds) since 00:00:00.
        Sub-millisecond remainders are dropped when micros is False.

    Raises:
        ValueError: if a string time matches neither accepted format.
    """
    _SECONDS_PER_MINUTE = 60
    _MINUTES_PER_HOUR = 60
    _MICROSECONDS_PER_MILLISECOND = 10 ** 3
    _MICROSECONDS_PER_SECOND = 10 ** 6
    # (str, unicode) on Python 2 and (str, str) on Python 3, so text input
    # is parsed on either interpreter without naming `unicode` directly.
    _STRING_TYPES = (str, type(u''))

    if isinstance(time, _STRING_TYPES):
        try:
            time = datetime.datetime.strptime(time, '%H:%M:%S').time()
        except ValueError:
            time = datetime.datetime.strptime(time, '%H:%M:%S.%f').time()

    # Read the components directly and stay in integers: a float seconds
    # value multiplied and then truncated can drop a unit (1.001 * 1000
    # evaluates to 1000.9999999999999).
    seconds_since_midnight = (
        (time.hour * _MINUTES_PER_HOUR + time.minute) * _SECONDS_PER_MINUTE
        + time.second
    )
    micros_since_midnight = (
        seconds_since_midnight * _MICROSECONDS_PER_SECOND + time.microsecond
    )

    if micros:
        return micros_since_midnight
    return micros_since_midnight // _MICROSECONDS_PER_MILLISECOND
Oops, something went wrong.