Skip to content

Commit 7974a1e

Browse files
authored
HBASE-11676 Scan FORMATTER is not applied for columns using non-printable name in shell (#2161)
- In HBase::Table, the instance variable @converters is used to map column names to converters. This patch fixes how HBase::Table#_get_internal and HBase::Table#_scan_internal generate the column name key used to access @converters. - Refactor parsing of family:qualifier:converter specifications so that the code is more readable and reusable. As part of this change, I added two private methods and marked HBase::Table#set_converter as deprecated for removal in HBase 4.0.0. - Add unit testing for the fixed bug Signed-off-by: stack <stack@apache.org>
1 parent 4471a64 commit 7974a1e

File tree

2 files changed

+93
-24
lines changed

2 files changed

+93
-24
lines changed

hbase-shell/src/main/ruby/hbase/table.rb

Lines changed: 71 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -449,18 +449,23 @@ def _get_internal(row, *args)
449449
# Print out results. Result can be Cell or RowResult.
450450
res = {}
451451
result.listCells.each do |c|
452-
family = convert_bytes_with_position(c.getFamilyArray,
453-
c.getFamilyOffset, c.getFamilyLength, converter_class, converter)
454-
qualifier = convert_bytes_with_position(c.getQualifierArray,
455-
c.getQualifierOffset, c.getQualifierLength, converter_class, converter)
452+
# Get the family and qualifier of the cell without escaping non-printable characters. It is crucial that
453+
# column is constructed in this consistent way to that it can be used as a key.
454+
family_bytes = org.apache.hadoop.hbase.util.Bytes.copy(c.getFamilyArray, c.getFamilyOffset, c.getFamilyLength)
455+
qualifier_bytes = org.apache.hadoop.hbase.util.Bytes.copy(c.getQualifierArray, c.getQualifierOffset, c.getQualifierLength)
456+
column = "#{family_bytes}:#{qualifier_bytes}"
456457

457-
column = "#{family}:#{qualifier}"
458458
value = to_string(column, c, maxlength, converter_class, converter)
459459

460+
# Use the FORMATTER to determine how column is printed
461+
family = convert_bytes(family_bytes, converter_class, converter)
462+
qualifier = convert_bytes(qualifier_bytes, converter_class, converter)
463+
formatted_column = "#{family}:#{qualifier}"
464+
460465
if block_given?
461-
yield(column, value)
466+
yield(formatted_column, value)
462467
else
463-
res[column] = value
468+
res[formatted_column] = value
464469
end
465470
end
466471

@@ -604,19 +609,24 @@ def _scan_internal(args = {}, scan = nil)
604609
is_stale |= row.isStale
605610

606611
row.listCells.each do |c|
607-
family = convert_bytes_with_position(c.getFamilyArray,
608-
c.getFamilyOffset, c.getFamilyLength, converter_class, converter)
609-
qualifier = convert_bytes_with_position(c.getQualifierArray,
610-
c.getQualifierOffset, c.getQualifierLength, converter_class, converter)
612+
# Get the family and qualifier of the cell without escaping non-printable characters. It is crucial that
613+
# column is constructed in this consistent way to that it can be used as a key.
614+
family_bytes = org.apache.hadoop.hbase.util.Bytes.copy(c.getFamilyArray, c.getFamilyOffset, c.getFamilyLength)
615+
qualifier_bytes = org.apache.hadoop.hbase.util.Bytes.copy(c.getQualifierArray, c.getQualifierOffset, c.getQualifierLength)
616+
column = "#{family_bytes}:#{qualifier_bytes}"
611617

612-
column = "#{family}:#{qualifier}"
613618
cell = to_string(column, c, maxlength, converter_class, converter)
614619

620+
# Use the FORMATTER to determine how column is printed
621+
family = convert_bytes(family_bytes, converter_class, converter)
622+
qualifier = convert_bytes(qualifier_bytes, converter_class, converter)
623+
formatted_column = "#{family}:#{qualifier}"
624+
615625
if block_given?
616-
yield(key, "column=#{column}, #{cell}")
626+
yield(key, "column=#{formatted_column}, #{cell}")
617627
else
618628
res[key] ||= {}
619-
res[key][column] = cell
629+
res[key][formatted_column] = cell
620630
end
621631
end
622632
# One more row processed
@@ -729,11 +739,15 @@ def is_meta_table?
729739
org.apache.hadoop.hbase.TableName::META_TABLE_NAME.equals(@table.getName)
730740
end
731741

732-
# Returns family and (when has it) qualifier for a column name
742+
# Given a column specification in the format FAMILY[:QUALIFIER[:CONVERTER]]
743+
# 1. Save the converter for the given column
744+
# 2. Return a 2-element Array with [family, qualifier or nil], discarding the converter if provided
745+
#
746+
# @param [String] column specification
733747
def parse_column_name(column)
734-
split = org.apache.hadoop.hbase.CellUtil.parseColumn(column.to_java_bytes)
735-
set_converter(split) if split.length > 1
736-
[split[0], split.length > 1 ? split[1] : nil]
748+
spec = parse_column_format_spec(column)
749+
set_column_converter(spec.family, spec.qualifier, spec.converter) unless spec.converter.nil?
750+
[spec.family, spec.qualifier]
737751
end
738752

739753
def toISO8601(millis)
@@ -806,9 +820,46 @@ def convert_bytes_with_position(bytes, offset, len, converter_class, converter_m
806820
eval(converter_class).method(converter_method).call(bytes, offset, len)
807821
end
808822

823+
# store the information designating what part of a column should be printed, and how
824+
ColumnFormatSpec = Struct.new(:family, :qualifier, :converter)
825+
826+
##
827+
# Parse the column specification for formatting used by shell commands like :scan
828+
#
829+
# Strings should be structured as follows:
830+
# FAMILY:QUALIFIER[:CONVERTER]
831+
# Where:
832+
# - FAMILY is the column family
833+
# - QUALIFIER is the column qualifier. Non-printable characters should be left AS-IS and should NOT BE escaped.
834+
# - CONVERTER is optional and is the name of a converter (like toLong) to apply
835+
#
836+
# @param [String] column
837+
# @return [ColumnFormatSpec] family, qualifier, and converter as Java bytes
838+
private def parse_column_format_spec(column)
839+
split = org.apache.hadoop.hbase.CellUtil.parseColumn(column.to_java_bytes)
840+
family = split[0]
841+
qualifier = nil
842+
converter = nil
843+
if split.length > 1
844+
parts = org.apache.hadoop.hbase.CellUtil.parseColumn(split[1])
845+
qualifier = parts[0]
846+
if parts.length > 1
847+
converter = parts[1]
848+
end
849+
end
850+
851+
ColumnFormatSpec.new(family, qualifier, converter)
852+
end
853+
854+
private def set_column_converter(family, qualifier, converter)
855+
@converters["#{String.from_java_bytes(family)}:#{String.from_java_bytes(qualifier)}"] = String.from_java_bytes(converter)
856+
end
857+
809858
# if the column spec contains CONVERTER information, to get rid of :CONVERTER info from column pair.
810859
# 1. return back normal column pair as usual, i.e., "cf:qualifier[:CONVERTER]" to "cf" and "qualifier" only
811860
# 2. register the CONVERTER information based on column spec - "cf:qualifier"
861+
#
862+
# Deprecated for removal in 4.0.0
812863
def set_converter(column)
813864
family = String.from_java_bytes(column[0])
814865
parts = org.apache.hadoop.hbase.CellUtil.parseColumn(column[1])
@@ -817,6 +868,8 @@ def set_converter(column)
817868
column[1] = parts[0]
818869
end
819870
end
871+
extend Gem::Deprecate
872+
deprecate :set_converter, "4.0.0", nil, nil
820873

821874
#----------------------------------------------------------------------------------------------
822875
# Get the split points for the table

hbase-shell/src/test/ruby/hbase/table_test.rb

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ def setup
239239
@test_ts = 12345678
240240
@test_table.put(1, "x:a", 1)
241241
@test_table.put(1, "x:b", 2, @test_ts)
242+
@test_table.put(1, "x:\x11", [921].pack("N"))
242243

243244
@test_table.put(2, "x:a", 11)
244245
@test_table.put(2, "x:b", 12, @test_ts)
@@ -333,9 +334,10 @@ def teardown
333334
end
334335

335336
define_test "get should work with hash columns spec and an array of strings COLUMN parameter" do
336-
res = @test_table._get_internal('1', COLUMN => [ 'x:a', 'x:b' ])
337+
res = @test_table._get_internal('1', COLUMN => [ "x:\x11", 'x:a', 'x:b' ])
337338
assert_not_nil(res)
338339
assert_kind_of(Hash, res)
340+
assert_not_nil(res['x:\x11'])
339341
assert_not_nil(res['x:a'])
340342
assert_not_nil(res['x:b'])
341343
end
@@ -356,6 +358,18 @@ def teardown
356358
assert_not_nil(res['x:b'])
357359
end
358360

361+
define_test "get should work with non-printable columns and values" do
362+
res = @test_table._get_internal('1', COLUMNS => [ "x:\x11" ])
363+
assert_not_nil(res)
364+
assert_kind_of(Hash, res)
365+
assert_match(/value=\\x00\\x00\\x03\\x99/, res[ 'x:\x11' ])
366+
367+
res = @test_table._get_internal('1', COLUMNS => [ "x:\x11:toInt" ])
368+
assert_not_nil(res)
369+
assert_kind_of(Hash, res)
370+
assert_match(/value=921/, res[ 'x:\x11' ])
371+
end
372+
359373
define_test "get should work with hash columns spec and TIMESTAMP only" do
360374
res = @test_table._get_internal('1', TIMESTAMP => @test_ts)
361375
assert_not_nil(res)
@@ -412,10 +426,10 @@ def teardown
412426
assert_not_nil(res['x:b'])
413427
end
414428

415-
define_test "get with a block should yield (column, value) pairs" do
429+
define_test "get with a block should yield (formatted column, value) pairs" do
416430
res = {}
417431
@test_table._get_internal('1') { |col, val| res[col] = val }
418-
assert_equal(res.keys.sort, [ 'x:a', 'x:b' ])
432+
assert_equal([ 'x:\x11', 'x:a', 'x:b' ], res.keys.sort)
419433
end
420434

421435
define_test "get should support COLUMNS with value CONVERTER information" do
@@ -709,12 +723,14 @@ def teardown
709723
define_test "scan should support COLUMNS with value CONVERTER information" do
710724
@test_table.put(1, "x:c", [1024].pack('N'))
711725
@test_table.put(1, "x:d", [98].pack('N'))
726+
@test_table.put(1, "x:\x11", [712].pack('N'))
712727
begin
713-
res = @test_table._scan_internal COLUMNS => ['x:c:toInt', 'x:d:c(org.apache.hadoop.hbase.util.Bytes).toInt']
728+
res = @test_table._scan_internal COLUMNS => ['x:c:toInt', 'x:d:c(org.apache.hadoop.hbase.util.Bytes).toInt', "x:\x11:toInt"]
714729
assert_not_nil(res)
715730
assert_kind_of(Hash, res)
716-
assert_not_nil(/value=1024/.match(res['1']['x:c']))
717-
assert_not_nil(/value=98/.match(res['1']['x:d']))
731+
assert_match(/value=1024/, res['1']['x:c'])
732+
assert_match(/value=98/, res['1']['x:d'])
733+
assert_match(/value=712/, res['1']['x:\x11'])
718734
ensure
719735
# clean up newly added columns for this test only.
720736
@test_table.deleteall(1, 'x:c')

0 commit comments

Comments
 (0)