Skip to content

Commit 7668c98

Browse files
xiaoxmengfacebook-github-bot
authored andcommitted
fix: Back out "[velox][PR] feat: Optimize deserialize UnsafeRows to RowVector" (facebookincubator#12978)
Summary: Pull Request resolved: facebookincubator#12978 Original commit changeset: 329fefe9edc1 Original Phabricator Diff: D71754296 Reviewed By: Yuhta Differential Revision: D72728731 fbshipit-source-id: d76756b84b00a663c118588f0327b5bfdab636ef
1 parent 059e1fd commit 7668c98

File tree

8 files changed

+1012
-774
lines changed

8 files changed

+1012
-774
lines changed

velox/row/UnsafeRowDeserializers.h

Lines changed: 807 additions & 11 deletions
Large diffs are not rendered by default.

velox/row/UnsafeRowFast.cpp

Lines changed: 0 additions & 689 deletions
Large diffs are not rendered by default.

velox/row/UnsafeRowFast.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,6 @@ class UnsafeRowFast {
4343
/// 'buffer' must have sufficient capacity and set to all zeros.
4444
int32_t serialize(vector_size_t index, char* buffer) const;
4545

46-
/// Deserializes multiple rows into a RowVector of specified type. The type
47-
/// must match the contents of the serialized rows.
48-
/// @param data The start memory address of each row.
49-
static RowVectorPtr deserialize(
50-
const std::vector<char*>& data,
51-
const RowTypePtr& rowType,
52-
memory::MemoryPool* pool);
53-
5446
protected:
5547
explicit UnsafeRowFast(const VectorPtr& vector);
5648

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <folly/Benchmark.h>
18+
#include <folly/init/Init.h>
19+
#include <random>
20+
21+
#include "velox/row/UnsafeRowDeserializers.h"
22+
#include "velox/row/UnsafeRowFast.h"
23+
#include "velox/type/Type.h"
24+
#include "velox/vector/fuzzer/VectorFuzzer.h"
25+
#include "velox/vector/tests/utils/VectorMaker.h"
26+
27+
namespace facebook::spark::benchmarks {
28+
namespace {
29+
using namespace facebook::velox;
30+
using namespace facebook::velox::row;
31+
using facebook::velox::test::VectorMaker;
32+
33+
class Deserializer {
34+
public:
35+
virtual ~Deserializer() = default;
36+
virtual void deserialize(
37+
const std::vector<std::optional<std::string_view>>& data,
38+
const TypePtr& type) = 0;
39+
};
40+
41+
class UnsaferowBatchDeserializer : public Deserializer {
42+
public:
43+
UnsaferowBatchDeserializer() {}
44+
45+
void deserialize(
46+
const std::vector<std::optional<std::string_view>>& data,
47+
const TypePtr& type) override {
48+
UnsafeRowDeserializer::deserialize(data, type, pool_.get());
49+
}
50+
51+
private:
52+
std::shared_ptr<memory::MemoryPool> pool_{
53+
memory::memoryManager()->addLeafPool()};
54+
};
55+
56+
class BenchmarkHelper {
57+
public:
58+
std::tuple<std::vector<std::optional<std::string_view>>, TypePtr>
59+
randomUnsaferows(int nFields, int nRows, bool stringOnly) {
60+
RowTypePtr rowType;
61+
std::vector<std::string> names;
62+
std::vector<TypePtr> types;
63+
names.reserve(nFields);
64+
types.reserve(nFields);
65+
for (int32_t i = 0; i < nFields; ++i) {
66+
names.push_back("");
67+
if (stringOnly) {
68+
types.push_back(VARCHAR());
69+
} else {
70+
auto idx = folly::Random::rand32() % allTypes_.size();
71+
types.push_back(allTypes_[idx]);
72+
}
73+
}
74+
rowType =
75+
TypeFactory<TypeKind::ROW>::create(std::move(names), std::move(types));
76+
77+
VectorFuzzer::Options opts;
78+
opts.vectorSize = 1;
79+
opts.nullRatio = 0.1;
80+
opts.stringVariableLength = true;
81+
opts.stringLength = 20;
82+
// Spark uses microseconds to store timestamp
83+
opts.timestampPrecision =
84+
VectorFuzzer::Options::TimestampPrecision::kMicroSeconds;
85+
86+
auto seed = folly::Random::rand32();
87+
VectorFuzzer fuzzer(opts, pool_.get(), seed);
88+
const auto inputVector = fuzzer.fuzzInputRow(rowType);
89+
std::vector<std::optional<std::string_view>> results;
90+
results.reserve(nRows);
91+
// Serialize rowVector into bytes.
92+
UnsafeRowFast unsafeRow(inputVector);
93+
for (int32_t i = 0; i < nRows; ++i) {
94+
BufferPtr bufferPtr =
95+
AlignedBuffer::allocate<char>(1024, pool_.get(), true);
96+
char* buffer = bufferPtr->asMutable<char>();
97+
auto rowSize = unsafeRow.serialize(0, buffer);
98+
results.push_back(std::string_view(buffer, rowSize));
99+
}
100+
return {results, rowType};
101+
}
102+
103+
private:
104+
std::vector<TypePtr> allTypes_{
105+
BOOLEAN(),
106+
TINYINT(),
107+
SMALLINT(),
108+
INTEGER(),
109+
BIGINT(),
110+
REAL(),
111+
DOUBLE(),
112+
VARCHAR(),
113+
TIMESTAMP(),
114+
ARRAY(INTEGER()),
115+
MAP(VARCHAR(), ARRAY(INTEGER())),
116+
ROW({INTEGER()})};
117+
118+
std::shared_ptr<memory::MemoryPool> pool_{
119+
memory::memoryManager()->addLeafPool()};
120+
};
121+
122+
int deserialize(
123+
int nIters,
124+
int nFields,
125+
int nRows,
126+
bool variable,
127+
std::unique_ptr<Deserializer> deserializer) {
128+
folly::BenchmarkSuspender suspender;
129+
BenchmarkHelper helper;
130+
auto [data, rowType] = helper.randomUnsaferows(nFields, nRows, variable);
131+
suspender.dismiss();
132+
133+
for (int i = 0; i < nIters; i++) {
134+
deserializer->deserialize(data, rowType);
135+
}
136+
137+
return nIters * nFields * nRows;
138+
}
139+
140+
BENCHMARK_NAMED_PARAM_MULTI(
141+
deserialize,
142+
batch_10_100k_string_only,
143+
10,
144+
100000,
145+
true,
146+
std::make_unique<UnsaferowBatchDeserializer>());
147+
148+
BENCHMARK_NAMED_PARAM_MULTI(
149+
deserialize,
150+
batch_100_100k_string_only,
151+
100,
152+
100000,
153+
true,
154+
std::make_unique<UnsaferowBatchDeserializer>());
155+
156+
BENCHMARK_NAMED_PARAM_MULTI(
157+
deserialize,
158+
batch_10_100k_all_types,
159+
10,
160+
100000,
161+
false,
162+
std::make_unique<UnsaferowBatchDeserializer>());
163+
164+
BENCHMARK_NAMED_PARAM_MULTI(
165+
deserialize,
166+
batch_100_100k_all_types,
167+
100,
168+
100000,
169+
false,
170+
std::make_unique<UnsaferowBatchDeserializer>());
171+
172+
} // namespace
173+
} // namespace facebook::spark::benchmarks
174+
175+
int main(int argc, char** argv) {
176+
folly::Init init{&argc, &argv};
177+
facebook::velox::memory::MemoryManager::initialize({});
178+
folly::runBenchmarks();
179+
return 0;
180+
}

velox/row/benchmarks/UnsafeRowSerializeBenchmark.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "velox/common/memory/HashStringAllocator.h"
2020
#include "velox/exec/ContainerRowSerde.h"
2121
#include "velox/row/CompactRow.h"
22+
#include "velox/row/UnsafeRowDeserializers.h"
2223
#include "velox/row/UnsafeRowFast.h"
2324
#include "velox/vector/fuzzer/VectorFuzzer.h"
2425

@@ -48,7 +49,7 @@ class SerializeBenchmark {
4849
auto serialized = serialize(fast, data->size(), buffer);
4950
suspender.dismiss();
5051

51-
auto copy = UnsafeRowFast::deserialize(serialized, rowType, pool());
52+
auto copy = UnsafeRowDeserializer::deserialize(serialized, rowType, pool());
5253
VELOX_CHECK_EQ(copy->size(), data->size());
5354
}
5455

@@ -144,17 +145,17 @@ class SerializeBenchmark {
144145
return totalSize;
145146
}
146147

147-
std::vector<char*> serialize(
148+
std::vector<std::optional<std::string_view>> serialize(
148149
UnsafeRowFast& unsafeRow,
149150
vector_size_t numRows,
150151
BufferPtr& buffer) {
151-
std::vector<char*> serialized;
152+
std::vector<std::optional<std::string_view>> serialized;
152153
auto rawBuffer = buffer->asMutable<char>();
153154

154155
size_t offset = 0;
155156
for (auto i = 0; i < numRows; ++i) {
156157
auto rowSize = unsafeRow.serialize(i, rawBuffer + offset);
157-
serialized.push_back(rawBuffer + offset);
158+
serialized.push_back(std::string_view(rawBuffer + offset, rowSize));
158159
offset += rowSize;
159160
}
160161

velox/row/tests/UnsafeRowFuzzTest.cpp

Lines changed: 11 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@ class UnsafeRowFuzzTests : public ::testing::Test {
4545
}
4646
}
4747

48-
template <typename T>
4948
void doTest(
5049
const RowTypePtr& rowType,
51-
std::function<std::vector<T>(const RowVectorPtr& data)> serializeFunc) {
50+
std::function<std::vector<std::optional<std::string_view>>(
51+
const RowVectorPtr& data)> serializeFunc) {
5252
VectorFuzzer::Options opts;
5353
opts.vectorSize = kNumBuffers;
5454
opts.nullRatio = 0.1;
@@ -79,18 +79,12 @@ class UnsafeRowFuzzTests : public ::testing::Test {
7979

8080
// Serialize rowVector into bytes.
8181
auto serialized = serializeFunc(inputVector);
82-
if constexpr (std::is_same_v<T, std::optional<std::string_view>>) {
83-
// Deserialize previous bytes back to row vector
84-
VectorPtr outputVector = UnsafeRowDeserializer::deserialize(
85-
serialized, rowType, pool_.get());
86-
87-
assertEqualVectors(inputVector, outputVector);
88-
} else {
89-
VectorPtr outputVector =
90-
UnsafeRowFast::deserialize(serialized, rowType, pool_.get());
91-
92-
assertEqualVectors(inputVector, outputVector);
93-
}
82+
83+
// Deserialize previous bytes back to row vector
84+
VectorPtr outputVector =
85+
UnsafeRowDeserializer::deserialize(serialized, rowType, pool_.get());
86+
87+
assertEqualVectors(inputVector, outputVector);
9488
}
9589
}
9690

@@ -168,9 +162,9 @@ TEST_F(UnsafeRowFuzzTests, fast) {
168162
MAP(BIGINT(), ROW({BOOLEAN(), TINYINT(), REAL()})),
169163
});
170164

171-
doTest<char*>(rowType, [&](const RowVectorPtr& data) {
165+
doTest(rowType, [&](const RowVectorPtr& data) {
172166
const auto numRows = data->size();
173-
std::vector<char*> serialized;
167+
std::vector<std::optional<std::string_view>> serialized;
174168
serialized.reserve(numRows);
175169

176170
UnsafeRowFast fast(data);
@@ -195,44 +189,10 @@ TEST_F(UnsafeRowFuzzTests, fast) {
195189

196190
EXPECT_EQ(rowSize, fast.rowSize(i)) << i << ", " << data->toString(i);
197191

198-
serialized.push_back(buffers_[i]);
192+
serialized.push_back(std::string_view(buffers_[i], rowSize));
199193
}
200194
return serialized;
201195
});
202-
203-
doTest<std::optional<std::string_view>>(
204-
rowType, [&](const RowVectorPtr& data) {
205-
const auto numRows = data->size();
206-
std::vector<std::optional<std::string_view>> serialized;
207-
serialized.reserve(numRows);
208-
209-
UnsafeRowFast fast(data);
210-
211-
std::vector<vector_size_t> rows(numRows);
212-
std::iota(rows.begin(), rows.end(), 0);
213-
std::vector<vector_size_t> serializedRowSizes(numRows);
214-
std::vector<vector_size_t*> serializedRowSizesPtr(numRows);
215-
for (auto i = 0; i < numRows; ++i) {
216-
serializedRowSizesPtr[i] = &serializedRowSizes[i];
217-
}
218-
fast.serializedRowSizes(
219-
folly::Range(rows.data(), numRows), serializedRowSizesPtr.data());
220-
for (auto i = 0; i < numRows; ++i) {
221-
// The serialized row includes the size of the row.
222-
VELOX_CHECK_EQ(
223-
serializedRowSizes[i], fast.rowSize(i) + sizeof(uint32_t));
224-
}
225-
226-
for (auto i = 0; i < data->size(); ++i) {
227-
auto rowSize = fast.serialize(i, buffers_[i]);
228-
VELOX_CHECK_LE(rowSize, kBufferSize);
229-
230-
EXPECT_EQ(rowSize, fast.rowSize(i)) << i << ", " << data->toString(i);
231-
232-
serialized.push_back(std::string_view(buffers_[i], rowSize));
233-
}
234-
return serialized;
235-
});
236196
}
237197

238198
} // namespace

velox/serializers/RowSerializer.h

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -414,13 +414,9 @@ class RowDeserializer {
414414
uncompressedSource, header.uncompressedSize + initialSize);
415415
while (rowIterator.hasNext()) {
416416
serializedBuffers.emplace_back(std::move(rowIterator.next()));
417-
if constexpr (std::is_same_v<SerializeView, std::string_view>) {
418-
serializedRows.push_back(std::string_view(
419-
serializedBuffers.back()->data(),
420-
serializedBuffers.back()->size()));
421-
} else {
422-
serializedRows.push_back(serializedBuffers.back()->data());
423-
}
417+
serializedRows.push_back(std::string_view(
418+
serializedBuffers.back()->data(),
419+
serializedBuffers.back()->size()));
424420
}
425421
}
426422
}

velox/serializers/UnsafeRowSerializer.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
*/
1616
#include "velox/serializers/UnsafeRowSerializer.h"
1717
#include <folly/lang/Bits.h>
18+
#include "velox/row/UnsafeRowDeserializers.h"
1819
#include "velox/row/UnsafeRowFast.h"
1920
#include "velox/serializers/RowSerializer.h"
2021

@@ -43,18 +44,19 @@ void UnsafeRowVectorSerde::deserialize(
4344
RowTypePtr type,
4445
RowVectorPtr* result,
4546
const Options* options) {
46-
std::vector<char*> serializedRows;
47+
std::vector<std::optional<std::string_view>> serializedRows;
4748
std::vector<std::unique_ptr<std::string>> serializedBuffers;
48-
RowDeserializer<char*>::deserialize<RowIteratorImpl>(
49-
source, serializedRows, serializedBuffers, options);
49+
RowDeserializer<std::optional<std::string_view>>::deserialize<
50+
RowIteratorImpl>(source, serializedRows, serializedBuffers, options);
5051

5152
if (serializedRows.empty()) {
5253
*result = BaseVector::create<RowVector>(type, 0, pool);
5354
return;
5455
}
5556

5657
*result = std::dynamic_pointer_cast<RowVector>(
57-
row::UnsafeRowFast::deserialize(serializedRows, type, pool));
58+
velox::row::UnsafeRowDeserializer::deserialize(
59+
serializedRows, type, pool));
5860
}
5961

6062
// static

0 commit comments

Comments
 (0)