diff --git a/velox/common/encode/Base32.cpp b/velox/common/encode/Base32.cpp
new file mode 100644
index 000000000000..d6cd715b74e4
--- /dev/null
+++ b/velox/common/encode/Base32.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/common/encode/Base32.h"
+
+#include
+
+namespace facebook::velox::encoding {
+
+// Number of symbols in the Base32 alphabet. Passed to checkReverseIndex() in
+// the compile-time table check further down in this file.
+constexpr static int kBase = 32;
+
+// Constants defining the sizes of the binary and encoded blocks used by
+// Base32 encoding. One binary block and one encoded block carry the same
+// 40 bits of payload.
+// Size of a binary block in bytes (5 bytes = 40 bits).
+constexpr static int kBinaryBlockByteSize = 5;
+// Size of an encoded block in characters. Each character carries 5 bits, so
+// 8 characters hold the 40 bits of one binary block.
+constexpr static int kEncodedBlockByteSize = 8;
+
+// Encoding alphabet, indexed by 5-bit value: 0-25 map to 'A'-'Z' and 26-31
+// map to '2'-'7'. Only the first 32 entries are listed here.
+constexpr Charset kBase32Charset = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+                                    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+                                    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+                                    'Y', 'Z', '2', '3', '4', '5', '6', '7'};
+
+// Reverse lookup from a character code (0-255) to its index in
+// kBase32Charset. Entries 50-55 ('2'-'7') hold 26-31 and entries 65-90
+// ('A'-'Z') hold 0-25; every other entry, including the lower-case letters,
+// is 255.
+constexpr ReverseIndex kBase32ReverseIndexTable = {
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 26,  27,  28,  29,  30,  31,  255, 255, 255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
+
+/// Verify that for each of the 32 entries in kBase32Charset, the corresponding
+/// entry in kBase32ReverseIndexTable is correct.
+// The last alphabet index is kBase - 1; using kBase here keeps the check
+// independent of how many elements the Charset type can hold.
+static_assert(
+    checkForwardIndex(kBase - 1, kBase32Charset, kBase32ReverseIndexTable),
+    "kBase32Charset has incorrect entries");
+
+/// Verify that for every entry in kBase32ReverseIndexTable, the corresponding
+/// entry in kBase32Charset is correct.
+static_assert(
+    checkReverseIndex(
+        sizeof(kBase32ReverseIndexTable) - 1,
+        kBase,
+        kBase32Charset,
+        kBase32ReverseIndexTable),
+    "kBase32ReverseIndexTable has incorrect entries.");
+
+// static
+size_t Base32::calculateEncodedSize(size_t size, bool withPadding) {
+  if (size == 0) {
+    return 0;
+  }
+
+  constexpr size_t kBinarySize = kBinaryBlockByteSize;
+  constexpr size_t kEncodedSize = kEncodedBlockByteSize;
+
+  const size_t fullBlocks = size / kBinarySize;
+  const size_t tailBytes = size % kBinarySize;
+  if (tailBytes == 0) {
+    return fullBlocks * kEncodedSize;
+  }
+
+  if (withPadding) {
+    // A partial block is filled with '=' up to a whole encoded block.
+    return (fullBlocks + 1) * kEncodedSize;
+  }
+
+  // Without padding only the characters that carry data are emitted for the
+  // partial block: ceil(tailBytes * 8 / 5), i.e. 1 -> 2, 2 -> 4, 3 -> 5 and
+  // 4 -> 7. This matches what encodeImpl() writes when padding is disabled.
+  return fullBlocks * kEncodedSize + (tailBytes * 8 + 4) / 5;
+}
+
+// static
+void Base32::encode(const char* data, size_t len, char* output) {
+  encodeImpl(folly::StringPiece(data, len), kBase32Charset, true, output);
+}
+
+/// Encodes 'data' into 'out' using 'charset'. Every 5 input bytes (40 bits)
+/// produce 8 output characters of 5 bits each. A trailing group of 1 to 4
+/// bytes is zero-extended, produces 2, 4, 5 or 7 characters, and is followed
+/// by '=' up to 8 characters when 'include_pad' is true. Nothing is written
+/// for empty input and no terminating null is appended.
+template <class T>
+/* static */ void Base32::encodeImpl(
+    const T& data,
+    const Charset& charset,
+    bool include_pad,
+    char* out) {
+  auto len = data.size();
+  if (len == 0) {
+    return;
+  }
+
+  constexpr size_t kBinarySize = kBinaryBlockByteSize;
+  constexpr size_t kEncodedSize = kEncodedBlockByteSize;
+
+  auto wp = out;
+  auto it = data.begin();
+
+  // Reads the next input byte as an unsigned 64-bit value. Widening before
+  // shifting matters: 'uint8_t(x) << 24' is evaluated in int, turns negative
+  // for bytes >= 0x80 and would sign-extend into the bits of the previous
+  // byte when merged into the 64-bit accumulator.
+  auto nextByte = [&it]() -> uint64_t { return static_cast<uint8_t>(*it++); };
+
+  // Full blocks: 5 input bytes are packed into the low 40 bits of 'curr' and
+  // emitted as 8 groups of 5 bits, most significant group first.
+  for (; len >= kBinarySize; len -= kBinarySize) {
+    uint64_t curr = nextByte() << 32;
+    curr |= nextByte() << 24;
+    curr |= nextByte() << 16;
+    curr |= nextByte() << 8;
+    curr |= nextByte();
+
+    *wp++ = charset[(curr >> 35) & 0x1f];
+    *wp++ = charset[(curr >> 30) & 0x1f];
+    *wp++ = charset[(curr >> 25) & 0x1f];
+    *wp++ = charset[(curr >> 20) & 0x1f];
+    *wp++ = charset[(curr >> 15) & 0x1f];
+    *wp++ = charset[(curr >> 10) & 0x1f];
+    *wp++ = charset[(curr >> 5) & 0x1f];
+    *wp++ = charset[curr & 0x1f];
+  }
+
+  if (len > 0) {
+    // Trailing 1 to 4 bytes: pack them as above, treating the missing bytes
+    // as zero.
+    const size_t tailBytes = len;
+    uint64_t curr = 0;
+    for (size_t i = 0; i < tailBytes; ++i) {
+      curr |= nextByte() << (32 - 8 * i);
+    }
+
+    // Only the groups that contain at least one input bit are emitted.
+    const size_t symbols = (tailBytes * 8 + 4) / 5;
+    for (size_t i = 0; i < symbols; ++i) {
+      *wp++ = charset[(curr >> (35 - 5 * i)) & 0x1f];
+    }
+
+    if (include_pad) {
+      for (size_t i = symbols; i < kEncodedSize; ++i) {
+        *wp++ = kPadding;
+      }
+    }
+  }
+}
+
+} // namespace facebook::velox::encoding
diff --git a/velox/common/encode/Base32.h b/velox/common/encode/Base32.h
new file mode 100644
index 000000000000..72d223659e03
--- /dev/null
+++ b/velox/common/encode/Base32.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include "velox/common/encode/EncoderUtils.h"
+
+namespace facebook::velox::encoding {
+
+class Base32 {
+ public:
+  /// Returns the number of characters needed to encode 'size' input bytes.
+  /// With 'withPadding' the result is rounded up to a multiple of 8; without
+  /// it only the characters that carry data are counted.
+  static size_t calculateEncodedSize(size_t size, bool withPadding = true);
+
+  /// Encodes 'size' bytes from 'data' with the 'A'-'Z', '2'-'7' alphabet and
+  /// writes the '='-padded result to 'output', which must have room for at
+  /// least calculateEncodedSize(size) characters. No null is appended.
+  static void encode(const char* data, size_t size, char* output);
+
+ private:
+  /// Encodes 'data' with 'charset' into 'out', appending '=' padding only
+  /// when 'include_pad' is true.
+  template <class T>
+  static void encodeImpl(
+      const T& data,
+      const Charset& charset,
+      bool include_pad,
+      char* out);
+};
+
+} // namespace facebook::velox::encoding
diff --git a/velox/common/encode/CMakeLists.txt b/velox/common/encode/CMakeLists.txt
index 501c690c476b..d6d5562999ea 100644
--- a/velox/common/encode/CMakeLists.txt
+++ b/velox/common/encode/CMakeLists.txt
@@ -16,5 +16,7 @@ if(${VELOX_BUILD_TESTING})
   add_subdirectory(tests)
 endif()
 
-velox_add_library(velox_encode Base64.cpp)
-velox_link_libraries(velox_encode PUBLIC Folly::folly)
+velox_add_library(velox_encode Base32.cpp Base64.cpp)
+velox_link_libraries(
+  velox_encode
+  PUBLIC Folly::folly)
diff --git a/velox/common/encode/tests/Base32Test.cpp b/velox/common/encode/tests/Base32Test.cpp
new file mode 100644
index 000000000000..3b2968308225
--- /dev/null
+++ b/velox/common/encode/tests/Base32Test.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/common/encode/Base32.h"
+#include <gtest/gtest.h>
+#include "velox/common/base/tests/GTestUtils.h"
+
+namespace facebook::velox::encoding {
+
+class Base32Test : public ::testing::Test {};
+
+TEST_F(Base32Test, calculateEncodedSizeProperSize) {
+  // Without padding only the characters that carry data are counted, i.e.
+  // ceil(size * 8 / 5).
+  EXPECT_EQ(0, Base32::calculateEncodedSize(0, false));
+  EXPECT_EQ(2, Base32::calculateEncodedSize(1, false));
+  EXPECT_EQ(4, Base32::calculateEncodedSize(2, false));
+  EXPECT_EQ(5, Base32::calculateEncodedSize(3, false));
+  EXPECT_EQ(7, Base32::calculateEncodedSize(4, false));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(5, false));
+
+  // With padding the size is rounded up to a whole 8-character block.
+  EXPECT_EQ(0, Base32::calculateEncodedSize(0, true));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(1, true));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(2, true));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(3, true));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(4, true));
+  EXPECT_EQ(8, Base32::calculateEncodedSize(5, true));
+
+  EXPECT_EQ(18, Base32::calculateEncodedSize(11, false));
+  EXPECT_EQ(24, Base32::calculateEncodedSize(11, true));
+}
+
+TEST_F(Base32Test, encodeBytesWithHighBitSet) {
+  const auto encode = [](const std::string& input) {
+    std::string output(Base32::calculateEncodedSize(input.size()), '\0');
+    Base32::encode(input.data(), input.size(), output.data());
+    return output;
+  };
+
+  // Bytes >= 0x80 must not leak sign-extension bits into the groups that
+  // belong to the preceding byte, in both the full-block and the tail path.
+  EXPECT_EQ("ACAAAAAA", encode(std::string("\x00\x80\x00\x00\x00", 5)));
+  EXPECT_EQ("AD7Q====", encode(std::string("\x00\xff", 2)));
+}
+
+} // namespace facebook::velox::encoding
diff --git a/velox/common/encode/tests/CMakeLists.txt b/velox/common/encode/tests/CMakeLists.txt
index 548ce580e469..ca9d489f6e50 100644
--- a/velox/common/encode/tests/CMakeLists.txt
+++ b/velox/common/encode/tests/CMakeLists.txt
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_executable(velox_common_encode_test Base64Test.cpp EncoderUtilsTests.cpp)
+add_executable(velox_common_encode_test Base32Test.cpp Base64Test.cpp EncoderUtilsTests.cpp)
 add_test(velox_common_encode_test velox_common_encode_test)
 target_link_libraries(
   velox_common_encode_test
diff --git a/velox/docs/functions/presto/binary.rst b/velox/docs/functions/presto/binary.rst
index 8b4ddc26832e..2a92f262309f 100644
--- a/velox/docs/functions/presto/binary.rst
+++ b/velox/docs/functions/presto/binary.rst
@@ -135,6 +135,24 @@ Binary Functions
 
     Encodes ``bigint`` in a 64-bit 2’s complement big endian format.
 
+..
function:: to_base32(binary) -> varchar
+
+    Encodes ``binary`` into a Base32 string representation using the
+    ``A``-``Z``, ``2``-``7`` alphabet. The result is always padded with ``=``
+    to a multiple of 8 characters. ::
+
+        SELECT to_base32(to_utf8('Hello World')); -- 'JBSWY3DPEBLW64TMMQ======'
+        SELECT to_base32(to_utf8('hello')); -- 'NBSWY3DP'
+
 .. function:: to_hex(binary) -> varchar
 
     Encodes ``binary`` into a hex string representation.
diff --git a/velox/functions/prestosql/BinaryFunctions.h b/velox/functions/prestosql/BinaryFunctions.h
index d733553ce4a9..158ee1d039d5 100644
--- a/velox/functions/prestosql/BinaryFunctions.h
+++ b/velox/functions/prestosql/BinaryFunctions.h
@@ -21,6 +21,7 @@
 
 #include "folly/ssl/OpenSSLHash.h"
 #include "velox/common/base/BitUtil.h"
+#include "velox/common/encode/Base32.h"
 #include "velox/common/encode/Base64.h"
 #include "velox/external/md5/md5.h"
 #include "velox/functions/Udf.h"
@@ -324,6 +325,20 @@ struct ToBase64UrlFunction {
   }
 };
 
+/// to_base32(varbinary) -> varchar
+/// Encodes the input bytes as a '='-padded Base32 string.
+template <typename T>
+struct ToBase32Function {
+  VELOX_DEFINE_FUNCTION_TYPES(T);
+
+  FOLLY_ALWAYS_INLINE void call(
+      out_type<Varchar>& result,
+      const arg_type<Varbinary>& input) {
+    // Size the output for the padded encoding, then encode in place.
+    result.resize(encoding::Base32::calculateEncodedSize(input.size()));
+    encoding::Base32::encode(input.data(), input.size(), result.data());
+  }
+};
+
 template
 struct FromBigEndian32 {
   VELOX_DEFINE_FUNCTION_TYPES(T);
diff --git a/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp b/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp
index 6f098ebadc51..8722df517e44 100644
---
a/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp
+++ b/velox/functions/prestosql/registration/BinaryFunctionsRegistration.cpp
@@ -55,6 +55,8 @@ void registerSimpleFunctions(const std::string& prefix) {
       {prefix + "to_base64url"});
   registerFunction(
       {prefix + "from_base64url"});
+  registerFunction<ToBase32Function, Varchar, Varbinary>(
+      {prefix + "to_base32"});
 
   registerFunction(
       {prefix + "from_big_endian_32"});
diff --git a/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp b/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp
index 72ef47e22b10..fa83db090d3c 100644
--- a/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp
+++ b/velox/functions/prestosql/tests/BinaryFunctionsTest.cpp
@@ -477,6 +477,29 @@ TEST_F(BinaryFunctionsTest, fromBase64Url) {
   EXPECT_THROW(fromBase64Url("YQ=/"), VeloxUserError);
 }
 
+TEST_F(BinaryFunctionsTest, toBase32) {
+  const auto toBase32 = [&](std::optional<std::string> value) {
+    return evaluateOnce<std::string>("to_base32(cast(c0 as varbinary))", value);
+  };
+
+  EXPECT_EQ(std::nullopt, toBase32(std::nullopt));
+  EXPECT_EQ("", toBase32(""));
+  EXPECT_EQ("ME======", toBase32("a"));
+  EXPECT_EQ("MFRGG===", toBase32("abc"));
+  EXPECT_EQ("NZXQ====", toBase32("no"));
+  EXPECT_EQ("O5SQ====", toBase32("we"));
+  EXPECT_EQ("MRRDE===", toBase32("db2"));
+  EXPECT_EQ("MNQWWZI=", toBase32("cake"));
+  EXPECT_EQ("NNSWK3Q=", toBase32("keen"));
+  EXPECT_EQ("GEZDGNA=", toBase32("1234"));
+  // Inputs whose length is a multiple of 5 need no padding.
+  EXPECT_EQ("GEZDGNBV", toBase32("12345"));
+  EXPECT_EQ("NBSWY3DP", toBase32("hello"));
+  EXPECT_EQ("NBSWY3DPEB3W64TMMQ======", toBase32("hello world"));
+  EXPECT_EQ(
+      "JBSWY3DPEBLW64TMMQQGM4TPNUQFMZLMN54CC===",
+      toBase32("Hello World from Velox!"));
+}
+
 TEST_F(BinaryFunctionsTest, fromBigEndian32) {
   const auto fromBigEndian32 = [&](const std::optional& arg) {
     return evaluateOnce("from_big_endian_32(c0)", VARBINARY(), arg);