Skip to content

Commit

Permalink
Fix dotted column names in Hive
Browse files Browse the repository at this point in the history
This has the side effect of changing the return value of ResultProxy.keys() and RowProxy.keys(), unless using hive_raw_colnames. After this diff, those methods will return undotted names.
  • Loading branch information
jingw committed Sep 1, 2017
1 parent 75a8337 commit 1ef9226
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 34 deletions.
24 changes: 24 additions & 0 deletions pyhive/sqlalchemy_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,33 @@ def visit_DATETIME(self, type_):
return 'TIMESTAMP'


class HiveExecutionContext(default.DefaultExecutionContext):
    """Execution context that copes with Hive's dotted result column names.

    Follows the same approach as SQLiteExecutionContext, which works around the same issue:
    http://docs.sqlalchemy.org/en/latest/dialects/sqlite.html#dotted-column-names

    To keep the column names exactly as Hive reports them, set the ``hive_raw_colnames``
    execution option::

        engine = create_engine('hive://...', execution_options={'hive_raw_colnames': True})
    """

    @util.memoized_property
    def _preserve_raw_colnames(self):
        """Value of the ``hive_raw_colnames`` execution option (``False`` when unset)."""
        # Ideally, this would also gate on hive.resultset.use.unique.column.names
        raw_requested = self.execution_options.get('hive_raw_colnames', False)
        return raw_requested

    def _translate_colname(self, colname):
        """Split a possibly table-qualified column name into ``(name, original)``.

        When hive.resultset.use.unique.column.names is true (the default), Hive returns column
        names as "tablename.colname" in cursor.description.

        :param colname: column name as reported by the cursor.
        :returns: ``(text after the last dot, colname)`` for a dotted name when raw names are
            not requested; otherwise ``(colname, None)``.
        """
        # Nothing to adjust: raw mode was requested, or the name carries no prefix.
        if self._preserve_raw_colnames or '.' not in colname:
            return colname, None
        _, _, bare_name = colname.rpartition('.')
        return bare_name, colname


class HiveDialect(default.DefaultDialect):
name = b'hive'
driver = b'thrift'
execution_ctx_cls = HiveExecutionContext
preparer = HiveIdentifierPreparer
statement_compiler = HiveCompiler
supports_views = True
Expand Down
32 changes: 16 additions & 16 deletions pyhive/tests/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,28 +38,28 @@ def connect(self):
def test_description(self, cursor):
    """Check that cursor.description reports the table-qualified column name.

    :param cursor: DB-API cursor connected to the test Hive instance.
    """
    cursor.execute('SELECT * FROM one_row')

    # Only the prefixed form is asserted; the earlier unprefixed expectation was a dead
    # assignment that got overwritten before use, so it has been dropped.
    desc = [('one_row.number_of_rows', 'INT_TYPE', None, None, None, None, True)]
    self.assertEqual(cursor.description, desc)

@with_cursor
def test_complex(self, cursor):
cursor.execute('SELECT * FROM one_row_complex')
self.assertEqual(cursor.description, [
('boolean', 'BOOLEAN_TYPE', None, None, None, None, True),
('tinyint', 'TINYINT_TYPE', None, None, None, None, True),
('smallint', 'SMALLINT_TYPE', None, None, None, None, True),
('int', 'INT_TYPE', None, None, None, None, True),
('bigint', 'BIGINT_TYPE', None, None, None, None, True),
('float', 'FLOAT_TYPE', None, None, None, None, True),
('double', 'DOUBLE_TYPE', None, None, None, None, True),
('string', 'STRING_TYPE', None, None, None, None, True),
('timestamp', 'TIMESTAMP_TYPE', None, None, None, None, True),
('binary', 'BINARY_TYPE', None, None, None, None, True),
('array', 'ARRAY_TYPE', None, None, None, None, True),
('map', 'MAP_TYPE', None, None, None, None, True),
('struct', 'STRUCT_TYPE', None, None, None, None, True),
('union', 'UNION_TYPE', None, None, None, None, True),
('decimal', 'DECIMAL_TYPE', None, None, None, None, True),
('one_row_complex.boolean', 'BOOLEAN_TYPE', None, None, None, None, True),
('one_row_complex.tinyint', 'TINYINT_TYPE', None, None, None, None, True),
('one_row_complex.smallint', 'SMALLINT_TYPE', None, None, None, None, True),
('one_row_complex.int', 'INT_TYPE', None, None, None, None, True),
('one_row_complex.bigint', 'BIGINT_TYPE', None, None, None, None, True),
('one_row_complex.float', 'FLOAT_TYPE', None, None, None, None, True),
('one_row_complex.double', 'DOUBLE_TYPE', None, None, None, None, True),
('one_row_complex.string', 'STRING_TYPE', None, None, None, None, True),
('one_row_complex.timestamp', 'TIMESTAMP_TYPE', None, None, None, None, True),
('one_row_complex.binary', 'BINARY_TYPE', None, None, None, None, True),
('one_row_complex.array', 'ARRAY_TYPE', None, None, None, None, True),
('one_row_complex.map', 'MAP_TYPE', None, None, None, None, True),
('one_row_complex.struct', 'STRUCT_TYPE', None, None, None, None, True),
('one_row_complex.union', 'UNION_TYPE', None, None, None, None, True),
('one_row_complex.decimal', 'DECIMAL_TYPE', None, None, None, None, True),
])
rows = cursor.fetchall()
expected = [(
Expand Down
24 changes: 24 additions & 0 deletions pyhive/tests/test_sqlalchemy_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,30 @@ class TestSqlAlchemyHive(unittest.TestCase, SqlAlchemyTestCase):
def create_engine(self):
    """Return a new engine for the Hive URL used by this test case."""
    # Inside the method body the bare name resolves to the module-level create_engine,
    # not to this method.
    hive_url = 'hive://localhost:10000/default'
    return create_engine(hive_url)

@with_engine_connection
def test_dotted_column_names(self, engine, connection):
    """When Hive returns a dotted column name, the non-dotted version should be available as
    an attribute and as a key, and the dotted version should remain available as well.
    """
    result = connection.execute('SELECT * FROM one_row')
    row = result.fetchone()

    # keys() and membership expose only the short name ...
    assert row.keys() == ['number_of_rows']
    assert 'number_of_rows' in row

    # ... which works for both attribute and item access ...
    assert row.number_of_rows == 1
    assert row['number_of_rows'] == 1

    # ... while the original dotted name still resolves to the same value.
    assert getattr(row, 'one_row.number_of_rows') == 1
    assert row['one_row.number_of_rows'] == 1

@with_engine_connection
def test_dotted_column_names_raw(self, engine, connection):
    """With hive_raw_colnames enabled, dotted column names from Hive are left unmodified."""
    raw_connection = connection.execution_options(hive_raw_colnames=True)
    result = raw_connection.execute('SELECT * FROM one_row')
    row = result.fetchone()

    # Only the dotted name is exposed; no short alias is added.
    assert row.keys() == ['one_row.number_of_rows']
    assert 'number_of_rows' not in row

    assert getattr(row, 'one_row.number_of_rows') == 1
    assert row['one_row.number_of_rows'] == 1

@with_engine_connection
def test_reflect_select(self, engine, connection):
"""reflecttable should be able to fill in a table from the name"""
Expand Down
9 changes: 0 additions & 9 deletions scripts/travis-conf/hive/hive-site-ldap.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,6 @@
<name>fs.defaultFS</name>
<value>file:///</value>
</property>
<!--
TODO tests rely on having result set column names unprefixed
This could be improved by having an option to strip out prefixes when it would not result in
ambiguity.
-->
<property>
<name>hive.resultset.use.unique.column.names</name>
<value>false</value>
</property>
<property>
<name>hive.server2.authentication</name>
<value>LDAP</value>
Expand Down
9 changes: 0 additions & 9 deletions scripts/travis-conf/hive/hive-site.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,4 @@
<name>fs.defaultFS</name>
<value>file:///</value>
</property>
<!--
TODO tests rely on having result set column names unprefixed
This could be improved by having an option to strip out prefixes when it would not result in
ambiguity.
-->
<property>
<name>hive.resultset.use.unique.column.names</name>
<value>false</value>
</property>
</configuration>

0 comments on commit 1ef9226

Please sign in to comment.